diff -Nru aom-3.8.2/.mailmap aom-3.9.0/.mailmap --- aom-3.8.2/.mailmap 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/.mailmap 2024-05-07 19:57:02.419000000 +0000 @@ -40,6 +40,7 @@ Jacky Chen James Zern Jean-Marc Valin +Jian Zhou Jim Bankoski Johann Koenig Johann Koenig diff -Nru aom-3.8.2/AUTHORS aom-3.9.0/AUTHORS --- aom-3.8.2/AUTHORS 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/AUTHORS 2024-05-07 19:57:02.419000000 +0000 @@ -51,6 +51,7 @@ Dake He Damon Shen Dandan Ding +Daniel Cheng Daniele Castagna Daniel Kang Daniel Max Valenzuela @@ -94,6 +95,7 @@ Hamsalekha S Hangyu Kuang Hanno Böck +Hari Limaye Harish Mahendrakar Henrik Lundin Hien Ho @@ -124,7 +126,7 @@ Jeff Petkau Jerome Jiang Jia Jia -Jian Zhou +Jian Zhou Jim Bankoski Jingning Han Joe Young @@ -216,6 +218,7 @@ Peter de Rivaz Peter Kasting Philip Jägenstedt +Philippe Antoine Priit Laes Qiu Jianlin Rachel Barker diff -Nru aom-3.8.2/CHANGELOG aom-3.9.0/CHANGELOG --- aom-3.8.2/CHANGELOG 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/CHANGELOG 2024-05-07 19:57:02.420000000 +0000 @@ -1,3 +1,91 @@ +2024-04-09 v3.9.0 + This release includes new codec interfaces, compression efficiency and + perceptual improvements, speedup for RTC for both video and screen content, + and many bug fixes. This release is ABI compatible with the previous release. + + - New Features + * New codec control + * AV1E_SET_SVC_FRAME_DROP_MODE is added to configure the SVC encoder to + only drop spatial layers or the whole superframe. + * Active Map is fixed and tested for RTC. + * CONFIG_QUANT_MATRIX is added to disable quantization matrices when aom + decoder is disabled with CONFIG_AV1_DECODER. Reduces ~10% binary size when + both are disabled. + * libwebm is updated to libwebm-1.0.0.31-1-gaffd7f4. + + - Compression Efficiency Improvements + * RTC encoding improvements + * 1-2% BD-rate gain for screen content with temporal layers; 5% BD-rate + gain on scrolling content. + + - Perceptual Quality Improvements + * For RTC screen content + * Reduced color artifacts for RTC screen content + * Visual quality improved for scene changes for SVC with quality layers. + * Removed visual artifacts for speed 11 + + - Speedups: + * RTC Speed 11: aggressive speedup setting added for video mode, + resolutions <= VGA: ~30% faster than speed 10. + * 5-9% speed up for high bit-depth encoding with good mode on Arm, half of + which comes from SVE/SVE2 optimizations. + + - Other improvements + * Further improvements to global motion estimation. + * Documented minimum required SIMD support: SSE4.1 on x86, Neon on Arm. + * Remove unneeded SIMD functions, saving >100 KiB from binary size. + * Cleaned up and improved pattern_search. + * Added end-to-end c vs SIMD bit-exactness test. 
+ * Added config flag to calc psnr using libvmaf peak: use a slightly + different peak value for PSNR (1020 and 2040 for 10- and 12-bit) + + - Bug Fixes + * Fuzzing bug fixes + * b/329485898 Null-dereference WRITE in av1_cdef_frame_mt + * b/329810149 Null-dereference WRITE in av1_cdef_copy_sb8_16 + * b/329813868 Ill in av1_cdef_frame_mt + * chromium:327882824 Null-dereference WRITE in av1_cdef_init_fb_row + * b/330014723 Null-dereference WRITE in + cdef_copy_rect8_16bit_to_16bit_avx2 + * b/310455204 Null-dereference WRITE in prepare_enc_workers + * b/314858909 Heap-buffer-overflow in aom_variance64x64_avx2 + * oss-fuzz:67132 av1_dec_fuzzer: ASSERT: (pbi->tile_count_minus_1 + 1) <= + (pbi->output_frame_width_in_tiles_minus_1 + 1) + * oss-fuzz:67058 av1_dec_fuzzer: ASSERT: i == 0 || tile_w == *w + * oss-fuzz:67161 av1_dec_fuzzer: ASSERT: i == 0 || tile_h == *h + * oss-fuzz:67059 av1_dec_fuzzer: Crash in mem_get_varsize + * oss-fuzz:67162 av1_dec_fuzzer: Use-of-uninitialized-value in + od_ec_decode_bool_q15 + * oss-fuzz:67184 av1_dec_fuzzer: Heap-buffer-overflow in od_ec_dec_init + * oss-fuzz:67216 av1_dec_fuzzer: Heap-buffer-overflow in + od_ec_dec_normalize + * oss-fuzz:67055 av1_dec_fuzzer: Heap-buffer-overflow in + get_ls_tile_buffers + * libaom library + * aomedia:3510 Large value of duration could cause encoder overflow + * chromium:328105513 Fix build conflicts between Abseil and libaom/libvpx + in Win ARM64 builds + * aomedia:3544 AV1/SharpnessTestLarge.SharpnessPSNRTest failures after + 59c592bb8 + * aomedia:3531 Exception encountered with PSNR calculation + * aomedia:3541 Can not compile correctly by CYGWIN + * chromium:41482688 heap-buffer-overflow write in vpx_img_read() + (tools_common.c) with VPX_IMG_FMT_NV12 + * aomedia:3521 Assertion failures on Arm in CNNTest.* in + av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon and + av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon + * aomedia:3486 C vs NEON mismatch in AV1 encoder + * aomedia:3536 Over write in highbd_dr_prediction_z3_upsample1_neon() + * aomedia:3276 Significant progress on ensuring all allocations are + checked + * aomedia:3491 heap-buffer-overflow encoding frames of size 256x256, + 512x512 in good quality usage mode using 4 threads + * aomedia:3322 PSNR number discrepancy + * aomedia:3493 Cmake generates garbage symbols for libaom_srcs.gni + * aomedia:3478 GCC 12.2.0 emits a -Wstringop-overflow warning on + aom/av1/encoder/motion_search_facade.c + * aomedia:3484 C vs NEON mismatch in AV1 encoder for high-bitdepth case + 2024-03-08 v3.8.2 This release includes several bug fixes. This release is ABI compatible with the last release. See @@ -38,6 +126,21 @@ * b/314858909: Do not use adaptive error estimate. * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. +2024-01-18 v3.7.2 + This release includes three bug fixes. This release is ABI compatible + with the last release. See + https://aomedia.googlesource.com/aom/+log/v3.7.1..v3.7.2 for all the + commits in this release. + + - Bug Fixes + * aomedia:3520: get_cubic_kernel_dbl: Assertion `0 <= x && x < 1' + failed. + * aomedia:3526: alloc_compressor_data() is called during every + aom_codec_control() call on the encoder. Note that this partially + reverts the fix for bug aomedia:3349. + * b/310457427 and b/310766628: Only use rec_sse in CBR mode. + * Fix a hang of cmake on arm64 macOS with cmake 3.27.0 or later. 
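As a rough illustration of the new AV1E_SET_SVC_FRAME_DROP_MODE codec control and the duration-overflow fix (aomedia:3510) listed in the 3.9.0 changelog above: a minimal sketch, not part of the patch itself. It assumes an already-initialized encoder context; the helper name encode_one_frame() is hypothetical.

    #include <limits.h>
    #include <stdint.h>
    #include "aom/aomcx.h"
    #include "aom/aom_encoder.h"

    /* Sketch only: request layer-level dropping (AOM_LAYER_DROP) rather than
     * whole-superframe dropping (AOM_FULL_SUPERFRAME_DROP), then encode one
     * frame. As of 3.9.0, aom_codec_encode() rejects durations larger than
     * UINT32_MAX, so the caller checks that bound here, mirroring the
     * aom/src/aom_encoder.c change further down in this patch. */
    static aom_codec_err_t encode_one_frame(aom_codec_ctx_t *ctx,
                                            const aom_image_t *img,
                                            aom_codec_pts_t pts,
                                            unsigned long duration) {
      aom_codec_err_t res =
          aom_codec_control(ctx, AV1E_SET_SVC_FRAME_DROP_MODE, AOM_LAYER_DROP);
      if (res != AOM_CODEC_OK) return res;
    #if ULONG_MAX > UINT32_MAX
      if (duration > UINT32_MAX) return AOM_CODEC_INVALID_PARAM;
    #endif
      return aom_codec_encode(ctx, img, pts, duration, 0);
    }
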
+ 2023-11-30 v3.8.0 This release includes new codec interfaces, compression efficiency and perceptual improvements, speedup and memory optimizations and many bug diff -Nru aom-3.8.2/CMakeLists.txt aom-3.9.0/CMakeLists.txt --- aom-3.8.2/CMakeLists.txt 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/CMakeLists.txt 2024-05-07 19:57:02.424000000 +0000 @@ -58,9 +58,9 @@ # passed to libtool. # # We set SO_FILE_VERSION = [c-a].a.r -set(LT_CURRENT 11) -set(LT_REVISION 2) -set(LT_AGE 8) +set(LT_CURRENT 12) +set(LT_REVISION 0) +set(LT_AGE 9) math(EXPR SO_VERSION "${LT_CURRENT} - ${LT_AGE}") set(SO_FILE_VERSION "${SO_VERSION}.${LT_AGE}.${LT_REVISION}") unset(LT_CURRENT) @@ -825,7 +825,8 @@ # Clang's AddressSanitizer documentation says "When linking shared libraries, # the AddressSanitizer run-time is not linked, so -Wl,-z,defs may cause link # errors (don't use it with AddressSanitizer)." See - # https://clang.llvm.org/docs/AddressSanitizer.html#usage. + # https://clang.llvm.org/docs/AddressSanitizer.html#usage. Similarly, see + # https://clang.llvm.org/docs/MemorySanitizer.html#usage. if(NOT WIN32 AND NOT APPLE AND NOT (CMAKE_C_COMPILER_ID MATCHES "Clang" AND SANITIZE)) @@ -940,7 +941,7 @@ foreach(var ${all_cmake_vars}) if("${var}" MATCHES "SOURCES$\|_INTRIN_\|_ASM_" AND NOT "${var}" MATCHES "DOXYGEN\|LIBYUV\|_PKG_\|TEST" - AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER_") + AND NOT "${var}" MATCHES "_ASM_NASM\|_ASM_COMPILER") list(APPEND aom_source_vars ${var}) endif() endforeach() diff -Nru aom-3.8.2/README.md aom-3.9.0/README.md --- aom-3.8.2/README.md 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/README.md 2024-05-07 19:57:02.427000000 +0000 @@ -46,17 +46,23 @@ ### Prerequisites {#prerequisites} - 1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version - required. - 2. [Git](https://git-scm.com/). - 3. [Perl](https://www.perl.org/). - 4. For x86 targets, [yasm](http://yasm.tortall.net/), which is preferred, or a - recent version of [nasm](http://www.nasm.us/). If you download yasm with - the intention to work with Visual Studio, please download win32.exe or - win64.exe and rename it into yasm.exe. DO NOT download or use vsyasm.exe. - 5. Building the documentation requires +1. [CMake](https://cmake.org). See CMakeLists.txt for the minimum version + required. +2. [Git](https://git-scm.com/). +3. A modern C compiler. gcc 6+, clang 7+, Microsoft Visual Studio 2019+ or + the latest version of MinGW-w64 (clang64 or ucrt toolchains) are + recommended. A C++ compiler is necessary to build the unit tests and some + features contained in the examples. +4. [Perl](https://www.perl.org/). +5. For x86 targets, [yasm](http://yasm.tortall.net/) or a recent version (2.14 + or later) of [nasm](http://www.nasm.us/). (If both yasm and nasm are + present, yasm will be used by default. Pass -DENABLE_NASM=ON to cmake to + select nasm.) If you download yasm with the intention to work with Visual + Studio, please download win32.exe or win64.exe and rename it into yasm.exe. + DO NOT download or use vsyasm.exe. +6. Building the documentation requires [doxygen version 1.8.10 or newer](http://doxygen.org). - 6. Emscripten builds require the portable +7. Emscripten builds require the portable [EMSDK](https://kripken.github.io/emscripten-site/index.html). 
### Get the code {#get-the-code} diff -Nru aom-3.8.2/aom/aom_decoder.h aom-3.9.0/aom/aom_decoder.h --- aom-3.8.2/aom/aom_decoder.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/aom_decoder.h 2024-05-07 19:57:02.430000000 +0000 @@ -30,7 +30,7 @@ extern "C" { #endif -#include "aom/aom_codec.h" +#include "aom/aom_codec.h" // IWYU pragma: export #include "aom/aom_frame_buffer.h" /*!\brief Current ABI version number diff -Nru aom-3.8.2/aom/aom_encoder.h aom-3.9.0/aom/aom_encoder.h --- aom-3.8.2/aom/aom_encoder.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/aom_encoder.h 2024-05-07 19:57:02.431000000 +0000 @@ -30,7 +30,7 @@ extern "C" { #endif -#include "aom/aom_codec.h" +#include "aom/aom_codec.h" // IWYU pragma: export #include "aom/aom_external_partition.h" /*!\brief Current ABI version number @@ -1044,6 +1044,11 @@ * Interface is not an encoder interface. * \retval #AOM_CODEC_INVALID_PARAM * A parameter was NULL, the image format is unsupported, etc. + * + * \note + * `duration` is of the unsigned long type, which can be 32 or 64 bits. + * `duration` must be less than or equal to UINT32_MAX so that its range is + * independent of the size of unsigned long. */ aom_codec_err_t aom_codec_encode(aom_codec_ctx_t *ctx, const aom_image_t *img, aom_codec_pts_t pts, unsigned long duration, diff -Nru aom-3.8.2/aom/aom_image.h aom-3.9.0/aom/aom_image.h --- aom-3.8.2/aom/aom_image.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/aom_image.h 2024-05-07 19:57:02.436000000 +0000 @@ -103,7 +103,8 @@ AOM_CICP_TC_SMPTE_428 = 17, /**< SMPTE ST 428 */ AOM_CICP_TC_HLG = 18, /**< BT.2100 HLG, ARIB STD-B67 */ AOM_CICP_TC_RESERVED_19 = 19 /**< For future use (values 19-255) */ -} aom_transfer_characteristics_t; /**< alias for enum aom_transfer_function */ +} aom_transfer_characteristics_t; /**< alias for enum + aom_transfer_characteristics */ /*!\brief List of supported matrix coefficients */ typedef enum aom_matrix_coefficients { @@ -125,7 +126,7 @@ AOM_CICP_MC_CHROMAT_CL = 13, /**< Chromaticity-derived constant luminance */ AOM_CICP_MC_ICTCP = 14, /**< BT.2100 ICtCp */ AOM_CICP_MC_RESERVED_15 = 15 /**< For future use (values 15-255) */ -} aom_matrix_coefficients_t; +} aom_matrix_coefficients_t; /**< alias for enum aom_matrix_coefficients */ /*!\brief List of supported color range */ typedef enum aom_color_range { @@ -144,7 +145,8 @@ /**< sample, between two vertical samples */ AOM_CSP_COLOCATED = 2, /**< Co-located with luma(0, 0) sample */ AOM_CSP_RESERVED = 3 /**< Reserved value */ -} aom_chroma_sample_position_t; /**< alias for enum aom_transfer_function */ +} aom_chroma_sample_position_t; /**< alias for enum aom_chroma_sample_position + */ /*!\brief List of insert flags for Metadata * @@ -244,10 +246,13 @@ * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be @@ -267,10 +272,12 @@ * is NULL, the storage for the descriptor will be * allocated on the heap. 
* \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of each row in the image - * (stride). + * (stride). Must not exceed 65536. * \param[in] img_data Storage to use for the image * * \return Returns a pointer to the initialized image descriptor. If the img @@ -291,12 +298,17 @@ * is NULL, the storage for the descriptor will be * allocated on the heap. * \param[in] fmt Format for the image - * \param[in] d_w Width of the image - * \param[in] d_h Height of the image + * \param[in] d_w Width of the image. Must not exceed 0x08000000 + * (2^27). + * \param[in] d_h Height of the image. Must not exceed 0x08000000 + * (2^27). * \param[in] align Alignment, in bytes, of the image buffer and - * each row in the image (stride). + * each row in the image (stride). Must not exceed + * 65536. * \param[in] size_align Alignment, in pixels, of the image width and height. + * Must not exceed 65536. * \param[in] border A border that is padded on four sides of the image. + * Must not exceed 65536. * * \return Returns a pointer to the initialized image descriptor. If the img * parameter is non-null, the value of the img parameter will be diff -Nru aom-3.8.2/aom/aom_integer.h aom-3.9.0/aom/aom_integer.h --- aom-3.8.2/aom/aom_integer.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/aom_integer.h 2024-05-07 19:57:02.437000000 +0000 @@ -12,7 +12,7 @@ #define AOM_AOM_AOM_INTEGER_H_ /* get ptrdiff_t, size_t, wchar_t, NULL */ -#include +#include // IWYU pragma: export #if defined(_MSC_VER) #define AOM_FORCE_INLINE __forceinline @@ -33,8 +33,8 @@ #endif #endif // __cplusplus -#include -#include +#include // IWYU pragma: export +#include // IWYU pragma: export #if defined(__cplusplus) extern "C" { diff -Nru aom-3.8.2/aom/aomcx.h aom-3.9.0/aom/aomcx.h --- aom-3.8.2/aom/aomcx.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/aomcx.h 2024-05-07 19:57:02.437000000 +0000 @@ -1533,6 +1533,12 @@ */ AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR = 164, + /*!\brief Codec control to set the frame drop mode for SVC, + * unsigned int parameter. The valid values are constants of the + * AOM_SVC_FRAME_DROP_MODE enum: AOM_LAYER_DROP or AOM_FULL_SUPERFRAME_DROP. + */ + AV1E_SET_SVC_FRAME_DROP_MODE = 165, + // Any new encoder control IDs should be added above. // Maximum allowed encoder control ID is 229. // No encoder control ID should be added below. @@ -1699,6 +1705,12 @@ int use_comp_pred[3]; /** #include #include @@ -129,10 +130,9 @@ return ctx->err; } -void aom_internal_error(struct aom_internal_error_info *info, - aom_codec_err_t error, const char *fmt, ...) { - va_list ap; - +LIBAOM_FORMAT_PRINTF(3, 0) +static void set_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, va_list ap) { info->error_code = error; info->has_detail = 0; @@ -140,15 +140,45 @@ size_t sz = sizeof(info->detail); info->has_detail = 1; - va_start(ap, fmt); vsnprintf(info->detail, sz - 1, fmt, ap); - va_end(ap); info->detail[sz - 1] = '\0'; } +} + +void aom_set_error(struct aom_internal_error_info *info, aom_codec_err_t error, + const char *fmt, ...) 
{ + va_list ap; + + va_start(ap, fmt); + set_error(info, error, fmt, ap); + va_end(ap); + + assert(!info->setjmp); +} + +void aom_internal_error(struct aom_internal_error_info *info, + aom_codec_err_t error, const char *fmt, ...) { + va_list ap; + + va_start(ap, fmt); + set_error(info, error, fmt, ap); + va_end(ap); if (info->setjmp) longjmp(info->jmp, info->error_code); } +void aom_internal_error_copy(struct aom_internal_error_info *info, + const struct aom_internal_error_info *src) { + assert(info != src); + assert(!src->setjmp); + + if (!src->has_detail) { + aom_internal_error(info, src->error_code, NULL); + } else { + aom_internal_error(info, src->error_code, "%s", src->detail); + } +} + void aom_merge_corrupted_flag(int *corrupted, int value) { *corrupted |= value; } diff -Nru aom-3.8.2/aom/src/aom_encoder.c aom-3.9.0/aom/src/aom_encoder.c --- aom-3.8.2/aom/src/aom_encoder.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/src/aom_encoder.c 2024-05-07 19:57:02.444000000 +0000 @@ -23,6 +23,7 @@ #endif #include +#include #include #include "aom/aom_encoder.h" @@ -178,6 +179,10 @@ else if (img && ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) != 0) != ((ctx->init_flags & AOM_CODEC_USE_HIGHBITDEPTH) != 0)) { res = AOM_CODEC_INVALID_PARAM; +#if ULONG_MAX > UINT32_MAX + } else if (duration > UINT32_MAX) { + res = AOM_CODEC_INVALID_PARAM; +#endif } else { /* Execute in a normalized floating point environment, if the platform * requires it. diff -Nru aom-3.8.2/aom/src/aom_image.c aom-3.9.0/aom/src/aom_image.c --- aom-3.8.2/aom/src/aom_image.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom/src/aom_image.c 2024-05-07 19:57:02.444000000 +0000 @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include #include #include @@ -36,11 +37,20 @@ /* NOTE: In this function, bit_depth is either 8 or 16 (if * AOM_IMG_FMT_HIGHBITDEPTH is set), never 10 or 12. */ - unsigned int h, w, s, xcs, ycs, bps, bit_depth; - unsigned int stride_in_bytes; + unsigned int xcs, ycs, bps, bit_depth; if (img != NULL) memset(img, 0, sizeof(aom_image_t)); + if (fmt == AOM_IMG_FMT_NONE) goto fail; + + /* Impose maximum values on input parameters so that this function can + * perform arithmetic operations without worrying about overflows. + */ + if (d_w > 0x08000000 || d_h > 0x08000000 || buf_align > 65536 || + stride_align > 65536 || size_align > 65536 || border > 65536) { + goto fail; + } + /* Treat align==0 like align==1 */ if (!buf_align) buf_align = 1; @@ -103,12 +113,17 @@ } /* Calculate storage sizes given the chroma subsampling */ - w = align_image_dimension(d_w, xcs, size_align); - h = align_image_dimension(d_h, ycs, size_align); - - s = (fmt & AOM_IMG_FMT_PLANAR) ? w : bps * w / bit_depth; - s = (s + 2 * border + stride_align - 1) & ~(stride_align - 1); - stride_in_bytes = s * bit_depth / 8; + const unsigned int w = align_image_dimension(d_w, xcs, size_align); + assert(d_w <= w); + const unsigned int h = align_image_dimension(d_h, ycs, size_align); + assert(d_h <= h); + + uint64_t s = (uint64_t)w + 2 * border; + s = (fmt & AOM_IMG_FMT_PLANAR) ? 
s : s * bps / bit_depth; + s = s * bit_depth / 8; + s = (s + stride_align - 1) & ~((uint64_t)stride_align - 1); + if (s > INT_MAX) goto fail; + const int stride_in_bytes = (int)s; /* Allocate the new image */ if (!img) { @@ -230,7 +245,7 @@ img->planes[AOM_PLANE_Y] = data + x * bytes_per_sample + y * img->stride[AOM_PLANE_Y]; - data += (img->h + 2 * border) * img->stride[AOM_PLANE_Y]; + data += ((size_t)img->h + 2 * border) * img->stride[AOM_PLANE_Y]; unsigned int uv_border_h = border >> img->y_chroma_shift; unsigned int uv_x = x >> img->x_chroma_shift; @@ -242,14 +257,14 @@ } else if (!(img->fmt & AOM_IMG_FMT_UV_FLIP)) { img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_U]; img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; } else { img->planes[AOM_PLANE_V] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_V]; - data += ((img->h >> img->y_chroma_shift) + 2 * uv_border_h) * + data += ((size_t)(img->h >> img->y_chroma_shift) + 2 * uv_border_h) * img->stride[AOM_PLANE_V]; img->planes[AOM_PLANE_U] = data + uv_x * bytes_per_sample + uv_y * img->stride[AOM_PLANE_U]; @@ -289,15 +304,15 @@ } int aom_img_plane_width(const aom_image_t *img, int plane) { - if (plane > 0 && img->x_chroma_shift > 0) - return (img->d_w + 1) >> img->x_chroma_shift; + if (plane > 0) + return (img->d_w + img->x_chroma_shift) >> img->x_chroma_shift; else return img->d_w; } int aom_img_plane_height(const aom_image_t *img, int plane) { - if (plane > 0 && img->y_chroma_shift > 0) - return (img->d_h + 1) >> img->y_chroma_shift; + if (plane > 0) + return (img->d_h + img->y_chroma_shift) >> img->y_chroma_shift; else return img->d_h; } diff -Nru aom-3.8.2/aom_dsp/aom_dsp.cmake aom-3.9.0/aom_dsp/aom_dsp.cmake --- aom-3.8.2/aom_dsp/aom_dsp.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/aom_dsp.cmake 2024-05-07 19:57:02.446000000 +0000 @@ -52,15 +52,12 @@ list(APPEND AOM_DSP_COMMON_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_8t_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/aom_high_subpixel_bilinear_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_asm_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/intrapred_asm_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/inv_wht_sse2.asm") list(APPEND AOM_DSP_COMMON_INTRIN_SSE2 "${AOM_ROOT}/aom_dsp/x86/aom_convolve_copy_sse2.c" - "${AOM_ROOT}/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c" "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c" "${AOM_ROOT}/aom_dsp/x86/convolve.h" "${AOM_ROOT}/aom_dsp/x86/convolve_sse2.h" @@ -145,6 +142,9 @@ "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_intrapred_neon.c" "${AOM_ROOT}/aom_dsp/arm/highbd_loopfilter_neon.c") + + list(APPEND AOM_DSP_COMMON_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/arm/highbd_convolve8_sve.c") endif() if(CONFIG_AV1_DECODER) @@ -200,7 +200,8 @@ "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_sse4.c") list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2 - "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c") + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/corner_match_avx2.c" + "${AOM_ROOT}/aom_dsp/flow_estimation/x86/disflow_avx2.c") list(APPEND AOM_DSP_ENCODER_INTRIN_NEON "${AOM_ROOT}/aom_dsp/flow_estimation/arm/disflow_neon.c") @@ -208,7 +209,6 @@ list(APPEND 
AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/sad4d_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/sad_sse2.asm" - "${AOM_ROOT}/aom_dsp/x86/subpel_variance_sse2.asm" "${AOM_ROOT}/aom_dsp/x86/subtract_sse2.asm") list(APPEND AOM_DSP_ENCODER_ASM_SSE2_X86_64 @@ -227,6 +227,9 @@ "${AOM_ROOT}/aom_dsp/x86/variance_sse2.c" "${AOM_ROOT}/aom_dsp/x86/jnt_sad_sse2.c") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3 + "${AOM_ROOT}/aom_dsp/x86/subpel_variance_ssse3.asm") + list(APPEND AOM_DSP_ENCODER_ASM_SSSE3_X86_64 "${AOM_ROOT}/aom_dsp/x86/fwd_txfm_ssse3_x86_64.asm" "${AOM_ROOT}/aom_dsp/x86/quantize_ssse3_x86_64.asm") @@ -292,6 +295,10 @@ "${AOM_ROOT}/aom_dsp/arm/sum_squares_neon_dotprod.c" "${AOM_ROOT}/aom_dsp/arm/variance_neon_dotprod.c") + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE "${AOM_ROOT}/aom_dsp/arm/avg_sve.c" + "${AOM_ROOT}/aom_dsp/arm/blk_sse_sum_sve.c" + "${AOM_ROOT}/aom_dsp/arm/sum_squares_sve.c") + if(CONFIG_AV1_HIGHBITDEPTH) list(APPEND AOM_DSP_ENCODER_ASM_SSE2 "${AOM_ROOT}/aom_dsp/x86/highbd_sad4d_sse2.asm" @@ -327,6 +334,10 @@ list(APPEND AOM_DSP_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/aom_dsp/arm/highbd_variance_neon_dotprod.c") + + list(APPEND AOM_DSP_ENCODER_INTRIN_SVE + "${AOM_ROOT}/aom_dsp/arm/highbd_sse_sve.c" + "${AOM_ROOT}/aom_dsp/arm/highbd_variance_sve.c") endif() if(CONFIG_INTERNAL_STATS) @@ -484,6 +495,15 @@ "AOM_DSP_COMMON_INTRIN_NEON_I8MM") endif() + if(HAVE_SVE) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_common" + "AOM_DSP_COMMON_INTRIN_SVE") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_dsp_encoder" + "AOM_DSP_ENCODER_INTRIN_SVE") + endif() + endif() + target_sources(aom PRIVATE $) if(BUILD_SHARED_LIBS) target_sources(aom_static PRIVATE $) diff -Nru aom-3.8.2/aom_dsp/aom_dsp_rtcd.c aom-3.9.0/aom_dsp/aom_dsp_rtcd.c --- aom-3.8.2/aom_dsp/aom_dsp_rtcd.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/aom_dsp_rtcd.c 2024-05-07 19:57:02.447000000 +0000 @@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_dsp_rtcd() { aom_once(setup_rtcd_internal); } +void aom_dsp_rtcd(void) { aom_once(setup_rtcd_internal); } diff -Nru aom-3.8.2/aom_dsp/aom_dsp_rtcd_defs.pl aom-3.9.0/aom_dsp/aom_dsp_rtcd_defs.pl --- aom-3.8.2/aom_dsp/aom_dsp_rtcd_defs.pl 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/aom_dsp_rtcd_defs.pl 2024-05-07 19:57:02.448000000 +0000 @@ -498,8 +498,8 @@ add_proto qw/void aom_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"; specialize qw/aom_convolve_copy neon sse2 avx2/; -specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; -specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm sse2 ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_horiz neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; +specialize qw/aom_convolve8_vert neon neon_dotprod neon_i8mm ssse3/, "$avx2_ssse3"; add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/aom_scaled_2d ssse3 neon/; @@ -509,10 +509,10 @@ specialize qw/aom_highbd_convolve_copy sse2 avx2 neon/; add_proto qw/void aom_highbd_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int 
bd"; - specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon/; + specialize qw/aom_highbd_convolve8_horiz sse2 avx2 neon sve/; add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bd"; - specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon/; + specialize qw/aom_highbd_convolve8_vert sse2 avx2 neon sve/; } # @@ -776,30 +776,30 @@ specialize qw/aom_sse sse4_1 avx2 neon neon_dotprod/; add_proto qw/void/, "aom_get_blk_sse_sum", "const int16_t *data, int stride, int bw, int bh, int *x_sum, int64_t *x2_sum"; - specialize qw/aom_get_blk_sse_sum sse2 avx2 neon/; + specialize qw/aom_get_blk_sse_sum sse2 avx2 neon sve/; if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void aom_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; specialize qw/aom_highbd_subtract_block sse2 neon/; add_proto qw/int64_t/, "aom_highbd_sse", "const uint8_t *a8, int a_stride, const uint8_t *b8,int b_stride, int width, int height"; - specialize qw/aom_highbd_sse sse4_1 avx2 neon/; + specialize qw/aom_highbd_sse sse4_1 avx2 neon sve/; } # # Sum of Squares # add_proto qw/uint64_t aom_sum_squares_2d_i16/, "const int16_t *src, int stride, int width, int height"; - specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon/; + specialize qw/aom_sum_squares_2d_i16 sse2 avx2 neon sve/; add_proto qw/uint64_t aom_sum_squares_i16/, "const int16_t *src, uint32_t N"; - specialize qw/aom_sum_squares_i16 sse2 neon/; + specialize qw/aom_sum_squares_i16 sse2 neon sve/; add_proto qw/uint64_t aom_var_2d_u8/, "uint8_t *src, int src_stride, int width, int height"; specialize qw/aom_var_2d_u8 sse2 avx2 neon neon_dotprod/; add_proto qw/uint64_t aom_var_2d_u16/, "uint8_t *src, int src_stride, int width, int height"; - specialize qw/aom_var_2d_u16 sse2 avx2 neon/; + specialize qw/aom_var_2d_u16 sse2 avx2 neon sve/; # # Single block SAD / Single block Avg SAD @@ -813,7 +813,7 @@ } add_proto qw/uint64_t aom_sum_sse_2d_i16/, "const int16_t *src, int src_stride, int width, int height, int *sum"; - specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2/; + specialize qw/aom_sum_sse_2d_i16 avx2 neon sse2 sve/; specialize qw/aom_sad128x128 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad128x64 avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad64x128 avx2 sse2 neon neon_dotprod/; @@ -1087,7 +1087,7 @@ specialize qw/aom_sad_skip_16x32x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x16x4d avx2 sse2 neon neon_dotprod/; specialize qw/aom_sad_skip_16x8x4d avx2 sse2 neon neon_dotprod/; - specialize qw/aom_sad_skip_16x4x4d neon neon_dotprod/; + specialize qw/aom_sad_skip_16x4x4d avx2 neon neon_dotprod/; specialize qw/aom_sad_skip_8x32x4d sse2 neon/; specialize qw/aom_sad_skip_8x16x4d sse2 neon/; specialize qw/aom_sad_skip_8x8x4d sse2 neon/; @@ -1116,7 +1116,7 @@ specialize qw/aom_sad64x16x3d avx2 neon neon_dotprod/; specialize qw/aom_sad32x8x3d avx2 neon neon_dotprod/; specialize qw/aom_sad16x64x3d avx2 neon neon_dotprod/; - specialize qw/aom_sad16x4x3d neon neon_dotprod/; + specialize qw/aom_sad16x4x3d avx2 neon neon_dotprod/; specialize qw/aom_sad8x32x3d neon/; specialize qw/aom_sad4x16x3d neon/; @@ -1263,9 +1263,7 @@ specialize qw/aom_int_pro_col avx2 sse2 neon/; add_proto qw/int aom_vector_var/, "const int16_t *ref, const int16_t *src, 
int bwl"; - specialize qw/aom_vector_var avx2 sse4_1 neon/; - # TODO(kyslov@) bring back SSE2 by extending it to 128 block size - #specialize qw/aom_vector_var neon sse2/; + specialize qw/aom_vector_var avx2 sse4_1 neon sve/; # # hamadard transform and satd for implmenting temporal dependency model @@ -1352,16 +1350,19 @@ add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; add_proto qw/unsigned int/, "aom_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon/; - specialize "aom_highbd_${bd}_mse16x8", qw/neon/; - specialize "aom_highbd_${bd}_mse8x16", qw/neon/; - specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon/; - } + if ($bd eq 8) { + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon neon_dotprod/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon neon_dotprod/; + } else { + specialize "aom_highbd_${bd}_mse16x16", qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_mse16x8", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x16", qw/neon sve/; + specialize "aom_highbd_${bd}_mse8x8", qw/sse2 neon sve/; + } - specialize "aom_highbd_8_mse16x16", qw/neon_dotprod/; - specialize "aom_highbd_8_mse16x8", qw/neon_dotprod/; - specialize "aom_highbd_8_mse8x16", qw/neon_dotprod/; - specialize "aom_highbd_8_mse8x8", qw/neon_dotprod/; + } } # @@ -1403,39 +1404,39 @@ specialize qw/aom_variance4x8 sse2 neon neon_dotprod/; specialize qw/aom_variance4x4 sse2 neon neon_dotprod/; - specialize qw/aom_sub_pixel_variance128x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance128x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x8 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance4x4 neon sse2 ssse3/; - - specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x8 neon 
sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x4 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance128x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance128x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance64x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x8 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x16 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance8x4 neon ssse3/; + specialize qw/aom_sub_pixel_variance4x8 neon ssse3/; + specialize qw/aom_sub_pixel_variance4x4 neon ssse3/; + + specialize qw/aom_sub_pixel_avg_variance128x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance128x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x128 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x64 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x32 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x16 avx2 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x32 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x4 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x4 neon ssse3/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { specialize qw/aom_variance4x16 neon neon_dotprod sse2/; @@ -1445,18 +1446,18 @@ specialize qw/aom_variance16x64 neon neon_dotprod sse2 avx2/; specialize qw/aom_variance64x16 neon neon_dotprod sse2 avx2/; - specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance8x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance32x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_variance16x64 neon avx2 sse2 ssse3/; - specialize qw/aom_sub_pixel_variance64x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance4x16 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x4 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance8x32 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance32x8 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance16x64 neon sse2 ssse3/; - specialize qw/aom_sub_pixel_avg_variance64x16 neon sse2 ssse3/; + specialize qw/aom_sub_pixel_variance4x16 neon ssse3/; + specialize qw/aom_sub_pixel_variance16x4 neon avx2 ssse3/; + specialize qw/aom_sub_pixel_variance8x32 neon ssse3/; + specialize qw/aom_sub_pixel_variance32x8 neon 
ssse3/; + specialize qw/aom_sub_pixel_variance16x64 neon avx2 ssse3/; + specialize qw/aom_sub_pixel_variance64x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance4x16 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x4 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance8x32 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance32x8 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance16x64 neon ssse3/; + specialize qw/aom_sub_pixel_avg_variance64x16 neon ssse3/; specialize qw/aom_dist_wtd_sub_pixel_avg_variance4x16 neon ssse3/; specialize qw/aom_dist_wtd_sub_pixel_avg_variance16x4 neon ssse3/; @@ -1495,66 +1496,66 @@ } } - specialize qw/aom_highbd_12_variance128x128 sse2 neon/; - specialize qw/aom_highbd_12_variance128x64 sse2 neon/; - specialize qw/aom_highbd_12_variance64x128 sse2 neon/; - specialize qw/aom_highbd_12_variance64x64 sse2 neon/; - specialize qw/aom_highbd_12_variance64x32 sse2 neon/; - specialize qw/aom_highbd_12_variance32x64 sse2 neon/; - specialize qw/aom_highbd_12_variance32x32 sse2 neon/; - specialize qw/aom_highbd_12_variance32x16 sse2 neon/; - specialize qw/aom_highbd_12_variance16x32 sse2 neon/; - specialize qw/aom_highbd_12_variance16x16 sse2 neon/; - specialize qw/aom_highbd_12_variance16x8 sse2 neon/; - specialize qw/aom_highbd_12_variance8x16 sse2 neon/; - specialize qw/aom_highbd_12_variance8x8 sse2 neon/; - specialize qw/aom_highbd_12_variance8x4 neon/; - specialize qw/aom_highbd_12_variance4x8 neon/; - specialize qw/aom_highbd_12_variance4x4 sse4_1 neon/; - - specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon/; - specialize qw/aom_highbd_10_variance8x4 neon/; - specialize qw/aom_highbd_10_variance4x8 neon/; - specialize qw/aom_highbd_10_variance4x4 sse4_1 neon/; - - specialize qw/aom_highbd_8_variance128x128 sse2 neon/; - specialize qw/aom_highbd_8_variance128x64 sse2 neon/; - specialize qw/aom_highbd_8_variance64x128 sse2 neon/; - specialize qw/aom_highbd_8_variance64x64 sse2 neon/; - specialize qw/aom_highbd_8_variance64x32 sse2 neon/; - specialize qw/aom_highbd_8_variance32x64 sse2 neon/; - specialize qw/aom_highbd_8_variance32x32 sse2 neon/; - specialize qw/aom_highbd_8_variance32x16 sse2 neon/; - specialize qw/aom_highbd_8_variance16x32 sse2 neon/; - specialize qw/aom_highbd_8_variance16x16 sse2 neon/; - specialize qw/aom_highbd_8_variance16x8 sse2 neon/; - specialize qw/aom_highbd_8_variance8x16 sse2 neon/; - specialize qw/aom_highbd_8_variance8x8 sse2 neon/; - specialize qw/aom_highbd_8_variance8x4 neon/; - specialize qw/aom_highbd_8_variance4x8 neon/; - specialize qw/aom_highbd_8_variance4x4 sse4_1 neon/; + specialize qw/aom_highbd_12_variance128x128 sse2 neon sve/; + specialize qw/aom_highbd_12_variance128x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x128 sse2 neon sve/; + specialize 
qw/aom_highbd_12_variance64x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance64x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x64 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance32x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x32 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance16x8 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x16 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x8 sse2 neon sve/; + specialize qw/aom_highbd_12_variance8x4 neon sve/; + specialize qw/aom_highbd_12_variance4x8 neon sve/; + specialize qw/aom_highbd_12_variance4x4 sse4_1 neon sve/; + + specialize qw/aom_highbd_10_variance128x128 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance128x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x128 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance64x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x64 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance32x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x32 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance16x8 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x16 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x8 sse2 avx2 neon sve/; + specialize qw/aom_highbd_10_variance8x4 neon sve/; + specialize qw/aom_highbd_10_variance4x8 neon sve/; + specialize qw/aom_highbd_10_variance4x4 sse4_1 neon sve/; + + specialize qw/aom_highbd_8_variance128x128 sse2 neon sve/; + specialize qw/aom_highbd_8_variance128x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x128 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance64x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x64 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance32x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x32 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance16x8 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x16 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x8 sse2 neon sve/; + specialize qw/aom_highbd_8_variance8x4 neon sve/; + specialize qw/aom_highbd_8_variance4x8 neon sve/; + specialize qw/aom_highbd_8_variance4x4 sse4_1 neon sve/; if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { foreach $bd (8, 10, 12) { my $avx2 = ($bd == 10) ? 
"avx2" : ""; - specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance16x4" , qw/neon/; - specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon/; - specialize "aom_highbd_${bd}_variance4x16" , qw/neon/; + specialize "aom_highbd_${bd}_variance64x16" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance32x8" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance16x64" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance16x4" , qw/neon sve/; + specialize "aom_highbd_${bd}_variance8x32" , $avx2, qw/sse2 neon sve/; + specialize "aom_highbd_${bd}_variance4x16" , qw/neon sve/; } } @@ -1773,7 +1774,7 @@ specialize qw/aom_highbd_dist_wtd_comp_avg_pred sse2 neon/; add_proto qw/uint64_t/, "aom_mse_wxh_16bit_highbd", "uint16_t *dst, int dstride,uint16_t *src, int sstride, int w, int h"; - specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon/; + specialize qw/aom_mse_wxh_16bit_highbd sse2 avx2 neon sve/; } add_proto qw/void aom_comp_mask_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride, const uint8_t *mask, int mask_stride, int invert_mask"; @@ -1786,11 +1787,14 @@ # Flow estimation library if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") { - add_proto qw/double av1_compute_cross_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, const unsigned char *frame2, int stride2, int x2, int y2"; - specialize qw/av1_compute_cross_correlation sse4_1 avx2/; + add_proto qw/bool aom_compute_mean_stddev/, "const unsigned char *frame, int stride, int x, int y, double *mean, double *one_over_stddev"; + specialize qw/aom_compute_mean_stddev sse4_1 avx2/; + + add_proto qw/double aom_compute_correlation/, "const unsigned char *frame1, int stride1, int x1, int y1, double mean1, double one_over_stddev1, const unsigned char *frame2, int stride2, int x2, int y2, double mean2, double one_over_stddev2"; + specialize qw/aom_compute_correlation sse4_1 avx2/; add_proto qw/void aom_compute_flow_at_point/, "const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v"; - specialize qw/aom_compute_flow_at_point sse4_1 neon/; + specialize qw/aom_compute_flow_at_point sse4_1 avx2 neon/; } } # CONFIG_AV1_ENCODER diff -Nru aom-3.8.2/aom_dsp/aom_simd.h aom-3.9.0/aom_dsp/aom_simd.h --- aom-3.8.2/aom_dsp/aom_simd.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/aom_simd.h 2024-05-07 19:57:02.450000000 +0000 @@ -24,12 +24,10 @@ #define SIMD_CHECK 1 // Sanity checks in C equivalents -#if HAVE_NEON -#include "simd/v256_intrinsics_arm.h" // VS compiling for 32 bit targets does not support vector types in // structs as arguments, which makes the v256 type of the intrinsics // hard to support, so optimizations for this target are disabled. 
-#elif HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) +#if HAVE_SSE2 && (defined(_WIN64) || !defined(_MSC_VER) || defined(__clang__)) #include "simd/v256_intrinsics_x86.h" #else #include "simd/v256_intrinsics.h" diff -Nru aom-3.8.2/aom_dsp/arm/aom_convolve8_neon.c aom-3.9.0/aom_dsp/arm/aom_convolve8_neon.c --- aom-3.8.2/aom_dsp/arm/aom_convolve8_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/aom_convolve8_neon.c 2024-05-07 19:57:02.451000000 +0000 @@ -113,10 +113,8 @@ transpose_elems_inplace_u8_4x4(&d01, &d23); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d23, 0); - store_u8_4x1(dst + 2 * dst_stride, d01, 1); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, 2 * dst_stride, d01); + store_u8x4_strided_x2(dst + 1 * dst_stride, 2 * dst_stride, d23); s0 = s4; s1 = s5; @@ -161,14 +159,10 @@ transpose_elems_inplace_u8_8x4(&d0, &d1, &d2, &d3); - store_u8_4x1(dst + 0 * dst_stride, d0, 0); - store_u8_4x1(dst + 1 * dst_stride, d1, 0); - store_u8_4x1(dst + 2 * dst_stride, d2, 0); - store_u8_4x1(dst + 3 * dst_stride, d3, 0); - store_u8_4x1(dst + 4 * dst_stride, d0, 1); - store_u8_4x1(dst + 5 * dst_stride, d1, 1); - store_u8_4x1(dst + 6 * dst_stride, d2, 1); - store_u8_4x1(dst + 7 * dst_stride, d3, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, 4 * dst_stride, d0); + store_u8x4_strided_x2(dst + 1 * dst_stride, 4 * dst_stride, d1); + store_u8x4_strided_x2(dst + 2 * dst_stride, 4 * dst_stride, d2); + store_u8x4_strided_x2(dst + 3 * dst_stride, 4 * dst_stride, d3); src += 8 * src_stride; dst += 8 * dst_stride; @@ -287,10 +281,8 @@ d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; diff -Nru aom-3.8.2/aom_dsp/arm/aom_convolve8_neon_dotprod.c aom-3.9.0/aom_dsp/arm/aom_convolve8_neon_dotprod.c --- aom-3.8.2/aom_dsp/arm/aom_convolve8_neon_dotprod.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/aom_convolve8_neon_dotprod.c 2024-05-07 19:57:02.451000000 +0000 @@ -137,10 +137,8 @@ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; @@ -269,8 +267,6 @@ const int32x4_t correction = vdupq_n_s32((int32_t)vaddvq_s16(correct_tmp)); const uint8x8_t range_limit = vdup_n_u8(128); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t t0, t1, t2, t3, t4, t5, t6; - int8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; int8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -284,46 +280,39 @@ if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; 
load_u8_8x7(src, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); src += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ + int8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(src, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. 
*/ @@ -333,17 +322,16 @@ s5678 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); - d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); - d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); - d3 = convolve8_4_sdot_partial(s3456, s78910, correction, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + int16x4_t d0 = convolve8_4_sdot_partial(s0123, s4567, correction, filter); + int16x4_t d1 = convolve8_4_sdot_partial(s1234, s5678, correction, filter); + int16x4_t d2 = convolve8_4_sdot_partial(s2345, s6789, correction, filter); + int16x4_t d3 = + convolve8_4_sdot_partial(s3456, s78910, correction, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -358,37 +346,30 @@ } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t t0, t1, t2, t3, t4, t5, t6; load_u8_8x7(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6); s += 7 * src_stride; /* Clamp sample range to [-128, 127] for 8-bit signed dot product. */ - s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); - s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); - s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); - s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); - s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); - s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); - s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); - s7 = vdup_n_s8(0); - s8 = vdup_n_s8(0); - s9 = vdup_n_s8(0); + int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, range_limit)); + int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, range_limit)); + int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, range_limit)); + int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, range_limit)); + int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, range_limit)); + int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, range_limit)); + int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, range_limit)); /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ + int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, @@ -397,23 +378,18 @@ tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { uint8x8_t t7, t8, t9, t10; - load_u8_8x4(s, src_stride, &t7, &t8, &t9, &t10); - s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); - s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); - s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); - s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, range_limit)); + int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, range_limit)); + int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, range_limit)); + int8x8_t s10 = vreinterpret_s8_u8(vsub_u8(t10, range_limit)); + int8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); @@ -430,14 +406,14 @@ s5678_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - correction, filter); - d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - correction, filter); - d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - correction, filter); - d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - correction, filter); + uint8x8_t d0 = convolve8_8_sdot_partial(s0123_lo, s4567_lo, s0123_hi, + s4567_hi, correction, filter); + uint8x8_t d1 = convolve8_8_sdot_partial(s1234_lo, s5678_lo, s1234_hi, + s5678_hi, correction, filter); + uint8x8_t d2 = convolve8_8_sdot_partial(s2345_lo, s6789_lo, s2345_hi, + s6789_hi, correction, filter); + uint8x8_t d3 = convolve8_8_sdot_partial(s3456_lo, s78910_lo, s3456_hi, + s78910_hi, correction, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); diff -Nru aom-3.8.2/aom_dsp/arm/aom_convolve8_neon_i8mm.c aom-3.9.0/aom_dsp/arm/aom_convolve8_neon_i8mm.c --- aom-3.8.2/aom_dsp/arm/aom_convolve8_neon_i8mm.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/aom_convolve8_neon_i8mm.c 2024-05-07 19:57:02.452000000 +0000 @@ -15,7 +15,6 @@ #include #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/aom_dsp_common.h" @@ -44,7 +43,7 @@ 3, 16, 17, 18, 7, 20, 21, 22, 11, 24, 25, 26, 15, 28, 29, 30 }; -static INLINE int16x4_t convolve8_4_usdot(uint8x16_t samples, +static INLINE int16x4_t convolve8_4_usdot(const uint8x16_t samples, const int8x8_t filter, const uint8x16x2_t permute_tbl) { uint8x16_t permuted_samples[2]; @@ -56,7 +55,6 @@ /* { 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10 } */ permuted_samples[1] = vqtbl1q_u8(samples, permute_tbl.val[1]); - /* Accumulate dot product into 'correction' to account for range clamp. 
*/ sum = vusdotq_lane_s32(vdupq_n_s32(0), permuted_samples[0], filter, 0); sum = vusdotq_lane_s32(sum, permuted_samples[1], filter, 1); @@ -64,7 +62,7 @@ return vqmovn_s32(sum); } -static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples, +static INLINE uint8x8_t convolve8_8_usdot(const uint8x16_t samples, const int8x8_t filter, const uint8x16x3_t permute_tbl) { uint8x16_t permuted_samples[3]; @@ -123,10 +121,8 @@ d01 = vqrshrun_n_s16(vcombine_s16(t0, t1), FILTER_BITS); d23 = vqrshrun_n_s16(vcombine_s16(t2, t3), FILTER_BITS); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; @@ -249,7 +245,6 @@ int h) { const int8x8_t filter = vmovn_s16(vld1q_s16(filter_y)); const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(dot_prod_merge_block_tbl); - uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10; uint8x16x2_t samples_LUT; assert((intptr_t)dst % 4 == 0); @@ -263,31 +258,25 @@ if (w == 4) { const uint8x16_t tran_concat_tbl = vld1q_u8(dot_prod_tran_concat_tbl); - uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s78910; - int16x4_t d0, d1, d2, d3; - uint8x8_t d01, d23; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(src, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); src += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. */ + uint8x16_t s0123, s1234, s2345, s3456; transpose_concat_4x4(s0, s1, s2, s3, &s0123, tran_concat_tbl); transpose_concat_4x4(s1, s2, s3, s4, &s1234, tran_concat_tbl); transpose_concat_4x4(s2, s3, s4, s5, &s2345, tran_concat_tbl); transpose_concat_4x4(s3, s4, s5, s6, &s3456, tran_concat_tbl); - transpose_concat_4x4(s4, s5, s6, s7, &s4567, tran_concat_tbl); - transpose_concat_4x4(s5, s6, s7, s8, &s5678, tran_concat_tbl); - transpose_concat_4x4(s6, s7, s8, s9, &s6789, tran_concat_tbl); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(src, src_stride, &s7, &s8, &s9, &s10); + uint8x16_t s4567, s5678, s6789, s78910; transpose_concat_4x4(s7, s8, s9, s10, &s78910, tran_concat_tbl); /* Merge new data into block from previous iteration. 
*/ @@ -297,17 +286,15 @@ s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_4_usdot_partial(s0123, s4567, filter); - d1 = convolve8_4_usdot_partial(s1234, s5678, filter); - d2 = convolve8_4_usdot_partial(s2345, s6789, filter); - d3 = convolve8_4_usdot_partial(s3456, s78910, filter); - d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); - d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + int16x4_t d0 = convolve8_4_usdot_partial(s0123, s4567, filter); + int16x4_t d1 = convolve8_4_usdot_partial(s1234, s5678, filter); + int16x4_t d2 = convolve8_4_usdot_partial(s2345, s6789, filter); + int16x4_t d3 = convolve8_4_usdot_partial(s3456, s78910, filter); + uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); + uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); + + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); /* Prepare block for next iteration - re-using as much as possible. */ /* Shuffle everything up four rows. */ @@ -322,29 +309,21 @@ } while (h != 0); } else { const uint8x16x2_t tran_concat_tbl = vld1q_u8_x2(dot_prod_tran_concat_tbl); - uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, - s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, - s6789_hi, s78910_lo, s78910_hi; - uint8x8_t d0, d1, d2, d3; - const uint8_t *s; - uint8_t *d; - int height; do { - height = h; - s = src; - d = dst; + int height = h; + const uint8_t *s = src; + uint8_t *d = dst; + uint8x8_t s0, s1, s2, s3, s4, s5, s6; load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); s += 7 * src_stride; - s7 = vdup_n_u8(0); - s8 = vdup_n_u8(0); - s9 = vdup_n_u8(0); - /* This operation combines a conventional transpose and the sample permute * (see horizontal case) required before computing the dot product. 
*/ + uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi, + s3456_lo, s3456_hi; transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi, tran_concat_tbl); transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi, @@ -353,16 +332,13 @@ tran_concat_tbl); transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi, tran_concat_tbl); - transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi, - tran_concat_tbl); - transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi, - tran_concat_tbl); - transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi, - tran_concat_tbl); do { + uint8x8_t s7, s8, s9, s10; load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10); + uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi, + s78910_lo, s78910_hi; transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi, tran_concat_tbl); @@ -379,14 +355,14 @@ s5678_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]); s6789_hi = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]); - d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, s4567_hi, - filter); - d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, s5678_hi, - filter); - d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, s6789_hi, - filter); - d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, s78910_hi, - filter); + uint8x8_t d0 = convolve8_8_usdot_partial(s0123_lo, s4567_lo, s0123_hi, + s4567_hi, filter); + uint8x8_t d1 = convolve8_8_usdot_partial(s1234_lo, s5678_lo, s1234_hi, + s5678_hi, filter); + uint8x8_t d2 = convolve8_8_usdot_partial(s2345_lo, s6789_lo, s2345_hi, + s6789_hi, filter); + uint8x8_t d3 = convolve8_8_usdot_partial(s3456_lo, s78910_lo, s3456_hi, + s78910_hi, filter); store_u8_8x4(d, dst_stride, d0, d1, d2, d3); diff -Nru aom-3.8.2/aom_dsp/arm/aom_neon_sve2_bridge.h aom-3.9.0/aom_dsp/arm/aom_neon_sve2_bridge.h --- aom-3.8.2/aom_dsp/arm/aom_neon_sve2_bridge.h 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/aom_neon_sve2_bridge.h 2024-05-07 19:57:02.453000000 +0000 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ +#define AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +// We can access instructions exclusive to the SVE2 instruction set from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. 
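Illustrative usage sketch (editorial, not from the upstream patch; helper names are hypothetical): the same Neon-SVE bridging pattern described above gives otherwise plain Neon code access to the SVE 64-bit widening dot product (16-bit inputs), which Neon itself lacks - for example to accumulate a sum of squared differences of int16 samples without first widening to 32 bits. It mirrors the aom_sdotq_s16() helper that the companion aom_neon_sve_bridge.h header (added later in this diff) defines, and assumes an AArch64 toolchain targeting SVE.

#include <arm_neon.h>
#include <arm_sve.h>
#include <arm_neon_sve_bridge.h>

static inline int64x2_t sdotq_s16_sketch(int64x2_t acc, int16x8_t x,
                                         int16x8_t y) {
  // Wrap the Neon inputs in SVE registers (the high part, if any, is
  // "don't care"), issue the SVE SDOT, then take the low 128 bits back.
  return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc),
                                   svset_neonq_s16(svundef_s16(), x),
                                   svset_neonq_s16(svundef_s16(), y)));
}

static inline int64_t sse_of_8_s16(const int16_t *a, const int16_t *b) {
  const int16x8_t diff = vsubq_s16(vld1q_s16(a), vld1q_s16(b));
  // Each 64-bit lane accumulates four 16-bit x 16-bit products.
  return vaddvq_s64(sdotq_s16_sketch(vdupq_n_s64(0), diff, diff));
}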
+ +static INLINE int16x8_t aom_tbl2_s16(int16x8_t s0, int16x8_t s1, + uint16x8_t tbl) { + svint16x2_t samples = svcreate2_s16(svset_neonq_s16(svundef_s16(), s0), + svset_neonq_s16(svundef_s16(), s1)); + return svget_neonq_s16( + svtbl2_s16(samples, svset_neonq_u16(svundef_u16(), tbl))); +} + +#endif // AOM_AOM_DSP_ARM_AOM_NEON_SVE2_BRIDGE_H_ diff -Nru aom-3.8.2/aom_dsp/arm/aom_neon_sve_bridge.h aom-3.9.0/aom_dsp/arm/aom_neon_sve_bridge.h --- aom-3.8.2/aom_dsp/arm/aom_neon_sve_bridge.h 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/aom_neon_sve_bridge.h 2024-05-07 19:57:02.453000000 +0000 @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ +#define AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ + +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +// We can access instructions exclusive to the SVE instruction set from a +// predominantly Neon context by making use of the Neon-SVE bridge intrinsics +// to reinterpret Neon vectors as SVE vectors - with the high part of the SVE +// vector (if it's longer than 128 bits) being "don't care". + +// While sub-optimal on machines that have SVE vector length > 128-bit - as the +// remainder of the vector is unused - this approach is still beneficial when +// compared to a Neon-only solution. + +static INLINE uint64x2_t aom_udotq_u16(uint64x2_t acc, uint16x8_t x, + uint16x8_t y) { + return svget_neonq_u64(svdot_u64(svset_neonq_u64(svundef_u64(), acc), + svset_neonq_u16(svundef_u16(), x), + svset_neonq_u16(svundef_u16(), y))); +} + +static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { + return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), + svset_neonq_s16(svundef_s16(), x), + svset_neonq_s16(svundef_s16(), y))); +} + +#define aom_svdot_lane_s16(sum, s0, f, lane) \ + svget_neonq_s64(svdot_lane_s64(svset_neonq_s64(svundef_s64(), sum), \ + svset_neonq_s16(svundef_s16(), s0), \ + svset_neonq_s16(svundef_s16(), f), lane)) + +static INLINE uint16x8_t aom_tbl_u16(uint16x8_t s, uint16x8_t tbl) { + return svget_neonq_u16(svtbl_u16(svset_neonq_u16(svundef_u16(), s), + svset_neonq_u16(svundef_u16(), tbl))); +} + +static INLINE int16x8_t aom_tbl_s16(int16x8_t s, uint16x8_t tbl) { + return svget_neonq_s16(svtbl_s16(svset_neonq_s16(svundef_s16(), s), + svset_neonq_u16(svundef_u16(), tbl))); +} + +#endif // AOM_AOM_DSP_ARM_AOM_NEON_SVE_BRIDGE_H_ diff -Nru aom-3.8.2/aom_dsp/arm/avg_sve.c aom-3.9.0/aom_dsp/arm/avg_sve.c --- aom-3.8.2/aom_dsp/arm/avg_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/avg_sve.c 2024-05-07 19:57:02.454000000 +0000 @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "aom/aom_integer.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" + +int aom_vector_var_sve(const int16_t *ref, const int16_t *src, int bwl) { + assert(bwl >= 2 && bwl <= 5); + int width = 4 << bwl; + + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int16x8_t v_mean[2] = { vdupq_n_s16(0), vdupq_n_s16(0) }; + + do { + int16x8_t r0 = vld1q_s16(ref); + int16x8_t s0 = vld1q_s16(src); + + // diff: dynamic range [-510, 510] 10 (signed) bits. + int16x8_t diff0 = vsubq_s16(r0, s0); + // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. + v_mean[0] = vaddq_s16(v_mean[0], diff0); + + // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. + sse_s64[0] = aom_sdotq_s16(sse_s64[0], diff0, diff0); + + int16x8_t r1 = vld1q_s16(ref + 8); + int16x8_t s1 = vld1q_s16(src + 8); + + // diff: dynamic range [-510, 510] 10 (signed) bits. + int16x8_t diff1 = vsubq_s16(r1, s1); + // v_mean: dynamic range 16 * diff -> [-8160, 8160], 14 (signed) bits. + v_mean[1] = vaddq_s16(v_mean[1], diff1); + + // v_sse: dynamic range 2 * 16 * diff^2 -> [0, 8,323,200], 24 (signed) bits. + sse_s64[1] = aom_sdotq_s16(sse_s64[1], diff1, diff1); + + ref += 16; + src += 16; + width -= 16; + } while (width != 0); + + // Dynamic range [0, 65280], 16 (unsigned) bits. + const uint32_t mean_abs = abs(vaddlvq_s16(vaddq_s16(v_mean[0], v_mean[1]))); + const int64_t sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1])); + + // (mean_abs * mean_abs): dynamic range 32 (unsigned) bits. + return (int)(sse - ((mean_abs * mean_abs) >> (bwl + 2))); +} diff -Nru aom-3.8.2/aom_dsp/arm/blend_a64_mask_neon.c aom-3.9.0/aom_dsp/arm/blend_a64_mask_neon.c --- aom-3.8.2/aom_dsp/arm/blend_a64_mask_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/blend_a64_mask_neon.c 2024-05-07 19:57:02.454000000 +0000 @@ -91,7 +91,7 @@ uint8x8_t blend = alpha_blend_a64_d16_u16x8(m0, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -139,7 +139,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -181,7 +181,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -225,7 +225,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); uint8x8_t blend = alpha_blend_a64_d16_u16x8(m_avg, s0, s1, offset_vec); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -293,7 +293,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -358,7 +358,7 @@ uint8x8_t m_avg = avg_blend_pairwise_u8x8_4(m0, m1, m2, m3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, 
blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -418,7 +418,7 @@ uint8x8_t m_avg = avg_blend_pairwise_u8x8(m0, m1); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -479,7 +479,7 @@ uint8x8_t m_avg = avg_blend_u8x8(m0_2, m1_3); uint8x8_t blend = alpha_blend_a64_u8x8(m_avg, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; diff -Nru aom-3.8.2/aom_dsp/arm/blk_sse_sum_sve.c aom-3.9.0/aom_dsp/arm/blk_sse_sum_sve.c --- aom-3.8.2/aom_dsp/arm/blk_sse_sum_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/blk_sse_sum_sve.c 2024-05-07 19:57:02.455000000 +0000 @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_dsp_rtcd.h" +#include "config/aom_config.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE void get_blk_sse_sum_4xh_sve(const int16_t *data, int stride, + int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum = vdupq_n_s32(0); + int64x2_t sse = vdupq_n_s64(0); + + do { + int16x8_t d = vcombine_s16(vld1_s16(data), vld1_s16(data + stride)); + + sum = vpadalq_s16(sum, d); + + sse = aom_sdotq_s16(sse, d, d); + + data += 2 * stride; + bh -= 2; + } while (bh != 0); + + *x_sum = vaddvq_s32(sum); + *x2_sum = vaddvq_s64(sse); +} + +static INLINE void get_blk_sse_sum_8xh_sve(const int16_t *data, int stride, + int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int16x8_t d0 = vld1q_s16(data); + int16x8_t d1 = vld1q_s16(data + stride); + + sum[0] = vpadalq_s16(sum[0], d0); + sum[1] = vpadalq_s16(sum[1], d1); + + sse[0] = aom_sdotq_s16(sse[0], d0, d0); + sse[1] = aom_sdotq_s16(sse[1], d1, d1); + + data += 2 * stride; + bh -= 2; + } while (bh != 0); + + *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1])); + *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +static INLINE void get_blk_sse_sum_large_sve(const int16_t *data, int stride, + int bw, int bh, int *x_sum, + int64_t *x2_sum) { + int32x4_t sum[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int j = bw; + const int16_t *data_ptr = data; + do { + int16x8_t d0 = vld1q_s16(data_ptr); + int16x8_t d1 = vld1q_s16(data_ptr + 8); + + sum[0] = vpadalq_s16(sum[0], d0); + sum[1] = vpadalq_s16(sum[1], d1); + + sse[0] = aom_sdotq_s16(sse[0], d0, d0); + sse[1] = aom_sdotq_s16(sse[1], d1, d1); + + data_ptr += 16; + j -= 16; + } while (j != 0); + + data += stride; + } while (--bh != 0); + + *x_sum = vaddvq_s32(vaddq_s32(sum[0], sum[1])); + *x2_sum = vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +void aom_get_blk_sse_sum_sve(const int16_t *data, int stride, int bw, 
int bh, + int *x_sum, int64_t *x2_sum) { + if (bw == 4) { + get_blk_sse_sum_4xh_sve(data, stride, bh, x_sum, x2_sum); + } else if (bw == 8) { + get_blk_sse_sum_8xh_sve(data, stride, bh, x_sum, x2_sum); + } else { + assert(bw % 16 == 0); + get_blk_sse_sum_large_sve(data, stride, bw, bh, x_sum, x2_sum); + } +} diff -Nru aom-3.8.2/aom_dsp/arm/fwd_txfm_neon.c aom-3.9.0/aom_dsp/arm/fwd_txfm_neon.c --- aom-3.8.2/aom_dsp/arm/fwd_txfm_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/fwd_txfm_neon.c 2024-05-07 19:57:02.456000000 +0000 @@ -302,20 +302,3 @@ vst1q_s16(&final_output[7 * 8], input_7); } } - -void aom_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { - int r; - int16x8_t sum = vld1q_s16(&input[0]); - for (r = 1; r < 8; ++r) { - const int16x8_t input_00 = vld1q_s16(&input[r * stride]); - sum = vaddq_s16(sum, input_00); - } - { - const int32x4_t a = vpaddlq_s16(sum); - const int64x2_t b = vpaddlq_s32(a); - const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), - vreinterpret_s32_s64(vget_high_s64(b))); - output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); - output[1] = 0; - } -} diff -Nru aom-3.8.2/aom_dsp/arm/highbd_blend_a64_hmask_neon.c aom-3.9.0/aom_dsp/arm/highbd_blend_a64_hmask_neon.c --- aom-3.8.2/aom_dsp/arm/highbd_blend_a64_hmask_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_blend_a64_hmask_neon.c 2024-05-07 19:57:02.458000000 +0000 @@ -67,7 +67,7 @@ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -83,7 +83,7 @@ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); - store_unaligned_u16_2x2(dst, dst_stride, blend); + store_u16x2_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; diff -Nru aom-3.8.2/aom_dsp/arm/highbd_blend_a64_mask_neon.c aom-3.9.0/aom_dsp/arm/highbd_blend_a64_mask_neon.c --- aom-3.8.2/aom_dsp/arm/highbd_blend_a64_mask_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_blend_a64_mask_neon.c 2024-05-07 19:57:02.458000000 +0000 @@ -91,7 +91,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m0, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 2 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -139,7 +139,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 4 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -182,7 +182,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 2 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -227,7 +227,7 @@ uint16x8_t blend = \ alpha_##bd##_blend_a64_d16_u16x8(m_avg, s0, s1, offset); \ \ - store_unaligned_u16_4x2(dst, dst_stride, blend); \ + store_u16x4_strided_x2(dst, dst_stride, blend); \ \ mask += 4 * mask_stride; \ src0 += 2 * src0_stride; \ @@ -325,7 +325,7 @@ uint16x8_t blend = alpha_blend_a64_u16x8(m0, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -373,7 +373,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8_4(m0, m1, m2, m3)); 
uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; @@ -416,7 +416,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_pairwise_u8x8(m0, m1)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2 * mask_stride; src0 += 2 * src0_stride; @@ -460,7 +460,7 @@ uint16x8_t m_avg = vmovl_u8(avg_blend_u8x8(m0_2, m1_3)); uint16x8_t blend = alpha_blend_a64_u16x8(m_avg, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 4 * mask_stride; src0 += 2 * src0_stride; diff -Nru aom-3.8.2/aom_dsp/arm/highbd_blend_a64_vmask_neon.c aom-3.9.0/aom_dsp/arm/highbd_blend_a64_vmask_neon.c --- aom-3.8.2/aom_dsp/arm/highbd_blend_a64_vmask_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_blend_a64_vmask_neon.c 2024-05-07 19:57:02.458000000 +0000 @@ -70,7 +70,7 @@ uint16x8_t blend = alpha_blend_a64_u16x8(m, s0, s1); - store_unaligned_u16_4x2(dst, dst_stride, blend); + store_u16x4_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; @@ -90,7 +90,7 @@ uint16x4_t blend = alpha_blend_a64_u16x4(m0, s0, s1); - store_unaligned_u16_2x2(dst, dst_stride, blend); + store_u16x2_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; diff -Nru aom-3.8.2/aom_dsp/arm/highbd_convolve8_sve.c aom-3.9.0/aom_dsp/arm/highbd_convolve8_sve.c --- aom-3.8.2/aom_dsp/arm/highbd_convolve8_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_convolve8_sve.c 2024-05-07 19:57:02.459000000 +0000 @@ -0,0 +1,440 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" + +static INLINE uint16x4_t highbd_convolve8_4_h(int16x8_t s[4], int16x8_t filter, + uint16x4_t max) { + int64x2_t sum[4]; + + sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_h(int16x8_t s[8], int16x8_t filter, + uint16x8_t max) { + int64x2_t sum[8]; + + sum[0] = aom_sdotq_s16(vdupq_n_s64(0), s[0], filter); + sum[1] = aom_sdotq_s16(vdupq_n_s64(0), s[1], filter); + sum[2] = aom_sdotq_s16(vdupq_n_s64(0), s[2], filter); + sum[3] = aom_sdotq_s16(vdupq_n_s64(0), s[3], filter); + sum[4] = aom_sdotq_s16(vdupq_n_s64(0), s[4], filter); + sum[5] = aom_sdotq_s16(vdupq_n_s64(0), s[5], filter); + sum[6] = aom_sdotq_s16(vdupq_n_s64(0), s[6], filter); + sum[7] = aom_sdotq_s16(vdupq_n_s64(0), s[7], filter); + + int64x2_t sum01 = vpaddq_s64(sum[0], sum[1]); + int64x2_t sum23 = vpaddq_s64(sum[2], sum[3]); + int64x2_t sum45 = vpaddq_s64(sum[4], sum[5]); + int64x2_t sum67 = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +void aom_highbd_convolve8_horiz_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(x_step_q4 == 16); + assert(width >= 4 && height >= 4); + (void)filter_y; + (void)x_step_q4; + (void)y_step_q4; + + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= SUBPEL_TAPS / 2 - 1; + + const int16x8_t filter = vld1q_s16(filter_x); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x4_t d0 = highbd_convolve8_4_h(s0, filter, max); + uint16x4_t d1 = highbd_convolve8_4_h(s1, filter, max); + uint16x4_t d2 = highbd_convolve8_4_h(s2, filter, max); + uint16x4_t d3 = highbd_convolve8_4_h(s3, filter, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + d += 4 * dst_stride; + height -= 4; + } while (height > 0); + } else { + do { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s 
+ 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = highbd_convolve8_8_h(s0, filter, max); + uint16x8_t d1 = highbd_convolve8_8_h(s1, filter, max); + uint16x8_t d2 = highbd_convolve8_8_h(s2, filter, max); + uint16x8_t d3 = highbd_convolve8_8_h(s3, filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdTranConcatTbl[32]) = { + 0, 1, 8, 9, 16, 17, 24, 25, 2, 3, 10, 11, 18, 19, 26, 27, + 4, 5, 12, 13, 20, 21, 28, 29, 6, 7, 14, 15, 22, 23, 30, 31 +}; + +DECLARE_ALIGNED(16, static const uint8_t, kDotProdMergeBlockTbl[48]) = { + // Shift left and insert new last column in transposed 4x4 block. + 2, 3, 4, 5, 6, 7, 16, 17, 10, 11, 12, 13, 14, 15, 24, 25, + // Shift left and insert two new columns in transposed 4x4 block. + 4, 5, 6, 7, 16, 17, 18, 19, 12, 13, 14, 15, 24, 25, 26, 27, + // Shift left and insert three new columns in transposed 4x4 block. + 6, 7, 16, 17, 18, 19, 20, 21, 14, 15, 24, 25, 26, 27, 28, 29 +}; + +static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, + int16x4_t s2, int16x4_t s3, + int16x8_t res[2], + uint8x16_t permute_tbl[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // + // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it + // as an argument is preferable to loading it directly from memory as this + // inline helper is called many times from the same parent function. + + int8x16x2_t samples = { vreinterpretq_s8_s16(vcombine_s16(s0, s1)), + vreinterpretq_s8_s16(vcombine_s16(s2, s3)) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[0])); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples, permute_tbl[1])); +} + +static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t s3, + int16x8_t res[4], + uint8x16_t permute_tbl[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res_lo[0]: 00 10 20 30 01 11 21 31 + // res_lo[1]: 02 12 22 32 03 13 23 33 + // res_hi[0]: 04 14 24 34 05 15 25 35 + // res_hi[1]: 06 16 26 36 07 17 27 37 + // + // The 'permute_tbl' is always 'kDotProdTranConcatTbl' above. Passing it + // as an argument is preferable to loading it directly from memory as this + // inline helper is called many times from the same parent function. 
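  // (Editorial illustration: each res[] vector therefore holds four
  // consecutive rows of two output columns, so a single aom_svdot_lane_s16()
  // per vector accumulates four filter taps for two output pixels at a time
  // in the highbd_convolve8_*_v() helpers below.)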
+ + int8x16x2_t samples_lo = { + vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s0), vget_low_s16(s1))), + vreinterpretq_s8_s16(vcombine_s16(vget_low_s16(s2), vget_low_s16(s3))) + }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[0])); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_lo, permute_tbl[1])); + + int8x16x2_t samples_hi = { + vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s0), vget_high_s16(s1))), + vreinterpretq_s8_s16(vcombine_s16(vget_high_s16(s2), vget_high_s16(s3))) + }; + + res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[0])); + res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples_hi, permute_tbl[1])); +} + +static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], + uint8x16_t tbl, int16x8_t res[4]) { + int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), + vreinterpretq_s8_s16(t1[0]) }; + int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), + vreinterpretq_s8_s16(t1[1]) }; + int8x16x2_t samples2 = { vreinterpretq_s8_s16(t0[2]), + vreinterpretq_s8_s16(t1[2]) }; + int8x16x2_t samples3 = { vreinterpretq_s8_s16(t0[3]), + vreinterpretq_s8_s16(t1[3]) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); + res[2] = vreinterpretq_s16_s8(vqtbl2q_s8(samples2, tbl)); + res[3] = vreinterpretq_s16_s8(vqtbl2q_s8(samples3, tbl)); +} + +static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], + uint8x16_t tbl, int16x8_t res[2]) { + int8x16x2_t samples0 = { vreinterpretq_s8_s16(t0[0]), + vreinterpretq_s8_s16(t1[0]) }; + int8x16x2_t samples1 = { vreinterpretq_s8_s16(t0[1]), + vreinterpretq_s8_s16(t1[1]) }; + + res[0] = vreinterpretq_s16_s8(vqtbl2q_s8(samples0, tbl)); + res[1] = vreinterpretq_s16_s8(vqtbl2q_s8(samples1, tbl)); +} + +static INLINE uint16x4_t highbd_convolve8_4_v(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum[2]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_v(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + uint16x8_t max) { + int64x2_t sum[4]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum[0] = aom_svdot_lane_s16(sum[0], samples_hi[0], filter, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum[1] = aom_svdot_lane_s16(sum[1], samples_hi[1], filter, 1); + + sum[2] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); + sum[2] = aom_svdot_lane_s16(sum[2], samples_hi[2], filter, 1); + + sum[3] = aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); + sum[3] = aom_svdot_lane_s16(sum[3], samples_hi[3], filter, 1); + + int32x4_t res0 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + int32x4_t res1 = vcombine_s32(vmovn_s64(sum[2]), vmovn_s64(sum[3])); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(res0, FILTER_BITS), + vqrshrun_n_s32(res1, FILTER_BITS)); + + return vminq_u16(res, max); +} + +void aom_highbd_convolve8_vert_sve(const uint8_t *src8, ptrdiff_t src_stride, + uint8_t *dst8, ptrdiff_t dst_stride, + const int16_t *filter_x, 
int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int width, int height, int bd) { + assert(y_step_q4 == 16); + assert(w >= 4 && h >= 4); + (void)filter_x; + (void)y_step_q4; + (void)x_step_q4; + + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + + src -= (SUBPEL_TAPS / 2 - 1) * src_stride; + + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint8x16_t tran_concat_tbl[2]; + tran_concat_tbl[0] = vld1q_u8(kDotProdTranConcatTbl); + tran_concat_tbl[1] = vld1q_u8(kDotProdTranConcatTbl + 16); + uint8x16_t merge_block_tbl[3]; + merge_block_tbl[0] = vld1q_u8(kDotProdMergeBlockTbl); + merge_block_tbl[1] = vld1q_u8(kDotProdMergeBlockTbl + 16); + merge_block_tbl[2] = vld1q_u8(kDotProdMergeBlockTbl + 32); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123, tran_concat_tbl); + transpose_concat_4x4(s1, s2, s3, s4, s1234, tran_concat_tbl); + transpose_concat_4x4(s2, s3, s4, s5, s2345, tran_concat_tbl); + transpose_concat_4x4(s3, s4, s5, s6, s3456, tran_concat_tbl); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s78910[2]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s78910, tran_concat_tbl); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[0], s4567); + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[1], s5678); + aom_tbl2x2_s16(s3456, s78910, merge_block_tbl[2], s6789); + + uint16x4_t d0 = highbd_convolve8_4_v(s0123, s4567, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4_v(s1234, s5678, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4_v(s2345, s6789, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4_v(s3456, s78910, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s78910[0]; + s3456[1] = s78910[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. 
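      // (Editorial illustration: each iteration of the inner loop below loads
      // only four new rows; the aom_tbl2x4_s16() merges derive
      // s4567/s5678/s6789 from s3456 and the freshly transposed s78910, so
      // rows that overlap between successive 8-tap windows are never
      // re-transposed.)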
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123, tran_concat_tbl); + transpose_concat_8x4(s1, s2, s3, s4, s1234, tran_concat_tbl); + transpose_concat_8x4(s2, s3, s4, s5, s2345, tran_concat_tbl); + transpose_concat_8x4(s3, s4, s5, s6, s3456, tran_concat_tbl); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s78910[4]; + + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s78910, tran_concat_tbl); + + // Merge new data into block from previous iteration. + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[0], s4567); + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[1], s5678); + aom_tbl2x4_s16(s3456, s78910, merge_block_tbl[2], s6789); + + uint16x8_t d0 = highbd_convolve8_8_v(s0123, s4567, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8_v(s1234, s5678, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8_v(s2345, s6789, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8_v(s3456, s78910, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + + s3456[0] = s78910[0]; + s3456[1] = s78910[1]; + s3456[2] = s78910[2]; + s3456[3] = s78910[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} diff -Nru aom-3.8.2/aom_dsp/arm/highbd_intrapred_neon.c aom-3.9.0/aom_dsp/arm/highbd_intrapred_neon.c --- aom-3.8.2/aom_dsp/arm/highbd_intrapred_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_intrapred_neon.c 2024-05-07 19:57:02.461000000 +0000 @@ -13,9 +13,11 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" // ----------------------------------------------------------------------------- @@ -1265,3 +1267,1464 @@ HIGHBD_SMOOTH_H_NXM_WIDE(64, 64) #undef HIGHBD_SMOOTH_H_NXM_WIDE + +// ----------------------------------------------------------------------------- +// Z1 + +static int16_t iota1_s16[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8 }; +static int16_t iota2_s16[] = { 0, 2, 4, 6, 8, 10, 12, 14 }; + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_z1_apply_shift_x4(uint16x4_t a0, + uint16x4_t a1, + int shift) { + // The C implementation of the z1 predictor uses (32 - shift) and a right + // shift by 5, however we instead double shift to avoid an unnecessary right + // shift by 1. 
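  // Worked identity (editorial illustration): this helper returns
  //   (a0 * (64 - shift) + a1 * shift + 32) >> 6
  // and its callers pass a shift value that is exactly twice the C
  // reference's ((x & 0x3f) >> 1), so the result equals
  //   (a0 * (32 - shift/2) + a1 * (shift/2) + 16) >> 5
  // i.e. the same rounded interpolation with one fewer shift of the
  // fractional position.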
+ uint32x4_t res = vmull_n_u16(a1, shift); + res = vmlal_n_u16(res, a0, 64 - shift); + return vrshrn_n_u32(res, 6); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_z1_apply_shift_x8(uint16x8_t a0, + uint16x8_t a1, + int shift) { + return vcombine_u16( + highbd_dr_z1_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), shift), + highbd_dr_z1_apply_shift_x4(vget_high_u16(a0), vget_high_u16(a1), shift)); +} + +static void highbd_dr_prediction_z1_upsample0_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, + const uint16_t *above, + int dx) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx > 0); + + const int max_base_x = (bw + bh) - 1; + const int above_max = above[max_base_x]; + + const int16x8_t iota1x8 = vld1q_s16(iota1_s16); + const int16x4_t iota1x4 = vget_low_s16(iota1x8); + + int x = dx; + int r = 0; + do { + const int base = x >> 6; + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above_max, bw); + dst += stride; + } + return; + } + + // The C implementation of the z1 predictor when not upsampling uses: + // ((x & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. + const int shift = x & 0x3e; + + if (bw == 4) { + const uint16x4_t a0 = vld1_u16(&above[base]); + const uint16x4_t a1 = vld1_u16(&above[base + 1]); + const uint16x4_t val = highbd_dr_z1_apply_shift_x4(a0, a1, shift); + const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota1x4); + const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); + vst1_u16(dst, res); + } else { + int c = 0; + do { + const uint16x8_t a0 = vld1q_u16(&above[base + c]); + const uint16x8_t a1 = vld1q_u16(&above[base + c + 1]); + const uint16x8_t val = highbd_dr_z1_apply_shift_x8(a0, a1, shift); + const uint16x8_t cmp = + vcgtq_s16(vdupq_n_s16(max_base_x - base - c), iota1x8); + const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); + vst1q_u16(dst + c, res); + c += 8; + } while (c < bw); + } + + dst += stride; + x += dx; + } while (++r < bh); +} + +static void highbd_dr_prediction_z1_upsample1_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, + const uint16_t *above, + int dx) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx > 0); + + const int max_base_x = ((bw + bh) - 1) << 1; + const int above_max = above[max_base_x]; + + const int16x8_t iota2x8 = vld1q_s16(iota2_s16); + const int16x4_t iota2x4 = vget_low_s16(iota2x8); + + int x = dx; + int r = 0; + do { + const int base = x >> 5; + if (base >= max_base_x) { + for (int i = r; i < bh; ++i) { + aom_memset16(dst, above_max, bw); + dst += stride; + } + return; + } + + // The C implementation of the z1 predictor when upsampling uses: + // (((x << 1) & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. 
+ const int shift = (x << 1) & 0x3e; + + if (bw == 4) { + const uint16x4x2_t a01 = vld2_u16(&above[base]); + const uint16x4_t val = + highbd_dr_z1_apply_shift_x4(a01.val[0], a01.val[1], shift); + const uint16x4_t cmp = vcgt_s16(vdup_n_s16(max_base_x - base), iota2x4); + const uint16x4_t res = vbsl_u16(cmp, val, vdup_n_u16(above_max)); + vst1_u16(dst, res); + } else { + int c = 0; + do { + const uint16x8x2_t a01 = vld2q_u16(&above[base + 2 * c]); + const uint16x8_t val = + highbd_dr_z1_apply_shift_x8(a01.val[0], a01.val[1], shift); + const uint16x8_t cmp = + vcgtq_s16(vdupq_n_s16(max_base_x - base - 2 * c), iota2x8); + const uint16x8_t res = vbslq_u16(cmp, val, vdupq_n_u16(above_max)); + vst1q_u16(dst + c, res); + c += 8; + } while (c < bw); + } + + dst += stride; + x += dx; + } while (++r < bh); +} + +// Directional prediction, zone 1: 0 < angle < 90 +void av1_highbd_dr_prediction_z1_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int dx, int dy, int bd) { + (void)left; + (void)dy; + (void)bd; + assert(dy == 1); + + if (upsample_above) { + highbd_dr_prediction_z1_upsample1_neon(dst, stride, bw, bh, above, dx); + } else { + highbd_dr_prediction_z1_upsample0_neon(dst, stride, bw, bh, above, dx); + } +} + +// ----------------------------------------------------------------------------- +// Z2 + +#if AOM_ARCH_AARCH64 +// Incrementally shift more elements from `above` into the result, merging with +// existing `left` elements. +// X0, X1, X2, X3 +// Y0, X0, X1, X2 +// Y0, Y1, X0, X1 +// Y0, Y1, Y2, X0 +// Y0, Y1, Y2, Y3 +// clang-format off +static const uint8_t z2_merge_shuffles_u16x4[5][8] = { + { 8, 9, 10, 11, 12, 13, 14, 15 }, + { 0, 1, 8, 9, 10, 11, 12, 13 }, + { 0, 1, 2, 3, 8, 9, 10, 11 }, + { 0, 1, 2, 3, 4, 5, 8, 9 }, + { 0, 1, 2, 3, 4, 5, 6, 7 }, +}; +// clang-format on + +// Incrementally shift more elements from `above` into the result, merging with +// existing `left` elements. 
+// X0, X1, X2, X3, X4, X5, X6, X7 +// Y0, X0, X1, X2, X3, X4, X5, X6 +// Y0, Y1, X0, X1, X2, X3, X4, X5 +// Y0, Y1, Y2, X0, X1, X2, X3, X4 +// Y0, Y1, Y2, Y3, X0, X1, X2, X3 +// Y0, Y1, Y2, Y3, Y4, X0, X1, X2 +// Y0, Y1, Y2, Y3, Y4, Y5, X0, X1 +// Y0, Y1, Y2, Y3, Y4, Y5, Y6, X0 +// Y0, Y1, Y2, Y3, Y4, Y5, Y6, Y7 +// clang-format off +static const uint8_t z2_merge_shuffles_u16x8[9][16] = { + { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }, + { 0, 1, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29 }, + { 0, 1, 2, 3, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 }, + { 0, 1, 2, 3, 4, 5, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 16, 17, 18, 19 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17 }, + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, +}; +// clang-format on + +// clang-format off +static const uint16_t z2_y_iter_masks_u16x4[5][4] = { + { 0U, 0U, 0U, 0U }, + { 0xffffU, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU }, +}; +// clang-format on + +// clang-format off +static const uint16_t z2_y_iter_masks_u16x8[9][8] = { + { 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0U }, + { 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU, 0xffffU }, +}; +// clang-format on + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x8( + const uint16x8_t left_data, const int16x4_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). + uint8x8_t left_indices = + vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base))); + left_indices = vtrn1_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100))); + const uint16x4_t ret = vreinterpret_u16_u8( + vqtbl1_u8(vreinterpretq_u8_u16(left_data), left_indices)); + return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n])); +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_tbl_left_x4_from_x16( + const uint16x8x2_t left_data, const int16x4_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). 
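  // Worked example (editorial illustration): a relative 16-bit index of 3 is
  // expanded to the byte-index pair { 6, 7 }, i.e. the offsets of the low and
  // high bytes of element 3 in the byte-reinterpreted table; the vtrn1 /
  // doubling / +0x0100 steps below construct that pair for every lane.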
+ uint8x8_t left_indices = + vreinterpret_u8_s16(vsub_s16(indices, vdup_n_s16(base))); + left_indices = vtrn1_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, left_indices); + left_indices = vadd_u8(left_indices, vreinterpret_u8_u16(vdup_n_u16(0x0100))); + uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]), + vreinterpretq_u8_u16(left_data.val[1]) } }; + const uint16x4_t ret = vreinterpret_u16_u8(vqtbl2_u8(data_u8, left_indices)); + return vand_u16(ret, vld1_u16(z2_y_iter_masks_u16x4[n])); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x8( + const uint16x8_t left_data, const int16x8_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). + uint8x16_t left_indices = + vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base))); + left_indices = vtrn1q_u8(left_indices, left_indices); + left_indices = vaddq_u8(left_indices, left_indices); + left_indices = + vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100))); + const uint16x8_t ret = vreinterpretq_u16_u8( + vqtbl1q_u8(vreinterpretq_u8_u16(left_data), left_indices)); + return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n])); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_tbl_left_x8_from_x16( + const uint16x8x2_t left_data, const int16x8_t indices, int base, int n) { + // Need to adjust indices to operate on 0-based indices rather than + // `base`-based indices and then adjust from uint16x4 indices to uint8x8 + // indices so we can use a tbl instruction (which only operates on bytes). + uint8x16_t left_indices = + vreinterpretq_u8_s16(vsubq_s16(indices, vdupq_n_s16(base))); + left_indices = vtrn1q_u8(left_indices, left_indices); + left_indices = vaddq_u8(left_indices, left_indices); + left_indices = + vaddq_u8(left_indices, vreinterpretq_u8_u16(vdupq_n_u16(0x0100))); + uint8x16x2_t data_u8 = { { vreinterpretq_u8_u16(left_data.val[0]), + vreinterpretq_u8_u16(left_data.val[1]) } }; + const uint16x8_t ret = + vreinterpretq_u16_u8(vqtbl2q_u8(data_u8, left_indices)); + return vandq_u16(ret, vld1q_u16(z2_y_iter_masks_u16x8[n])); +} +#endif // AOM_ARCH_AARCH64 + +static AOM_FORCE_INLINE uint16x4x2_t highbd_dr_prediction_z2_gather_left_x4( + const uint16_t *left, const int16x4_t indices, int n) { + assert(n > 0); + assert(n <= 4); + // Load two elements at a time and then uzp them into separate vectors, to + // reduce the number of memory accesses. + uint32x2_t ret0_u32 = vdup_n_u32(0); + uint32x2_t ret1_u32 = vdup_n_u32(0); + + // Use a single vget_lane_u64 to minimize vector to general purpose register + // transfers and then mask off the bits we actually want. + const uint64_t indices0123 = vget_lane_u64(vreinterpret_u64_s16(indices), 0); + const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU); + const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU); + const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU); + const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU); + + // At time of writing both Clang and GCC produced better code with these + // nested if-statements compared to a switch statement with fallthrough. 
+ ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0); + if (n > 1) { + ret0_u32 = vld1_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1); + if (n > 2) { + ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx2), ret1_u32, 0); + if (n > 3) { + ret1_u32 = vld1_lane_u32((const uint32_t *)(left + idx3), ret1_u32, 1); + } + } + } + return vuzp_u16(vreinterpret_u16_u32(ret0_u32), + vreinterpret_u16_u32(ret1_u32)); +} + +static AOM_FORCE_INLINE uint16x8x2_t highbd_dr_prediction_z2_gather_left_x8( + const uint16_t *left, const int16x8_t indices, int n) { + assert(n > 0); + assert(n <= 8); + // Load two elements at a time and then uzp them into separate vectors, to + // reduce the number of memory accesses. + uint32x4_t ret0_u32 = vdupq_n_u32(0); + uint32x4_t ret1_u32 = vdupq_n_u32(0); + + // Use a pair of vget_lane_u64 to minimize vector to general purpose register + // transfers and then mask off the bits we actually want. + const uint64_t indices0123 = + vgetq_lane_u64(vreinterpretq_u64_s16(indices), 0); + const uint64_t indices4567 = + vgetq_lane_u64(vreinterpretq_u64_s16(indices), 1); + const int idx0 = (int16_t)((indices0123 >> 0) & 0xffffU); + const int idx1 = (int16_t)((indices0123 >> 16) & 0xffffU); + const int idx2 = (int16_t)((indices0123 >> 32) & 0xffffU); + const int idx3 = (int16_t)((indices0123 >> 48) & 0xffffU); + const int idx4 = (int16_t)((indices4567 >> 0) & 0xffffU); + const int idx5 = (int16_t)((indices4567 >> 16) & 0xffffU); + const int idx6 = (int16_t)((indices4567 >> 32) & 0xffffU); + const int idx7 = (int16_t)((indices4567 >> 48) & 0xffffU); + + // At time of writing both Clang and GCC produced better code with these + // nested if-statements compared to a switch statement with fallthrough. + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx0), ret0_u32, 0); + if (n > 1) { + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx1), ret0_u32, 1); + if (n > 2) { + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx2), ret0_u32, 2); + if (n > 3) { + ret0_u32 = vld1q_lane_u32((const uint32_t *)(left + idx3), ret0_u32, 3); + if (n > 4) { + ret1_u32 = + vld1q_lane_u32((const uint32_t *)(left + idx4), ret1_u32, 0); + if (n > 5) { + ret1_u32 = + vld1q_lane_u32((const uint32_t *)(left + idx5), ret1_u32, 1); + if (n > 6) { + ret1_u32 = + vld1q_lane_u32((const uint32_t *)(left + idx6), ret1_u32, 2); + if (n > 7) { + ret1_u32 = vld1q_lane_u32((const uint32_t *)(left + idx7), + ret1_u32, 3); + } + } + } + } + } + } + } + return vuzpq_u16(vreinterpretq_u16_u32(ret0_u32), + vreinterpretq_u16_u32(ret1_u32)); +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_merge_x4( + uint16x4_t out_x, uint16x4_t out_y, int base_shift) { + assert(base_shift >= 0); + assert(base_shift <= 4); + // On AArch64 we can permute the data from the `above` and `left` vectors + // into a single vector in a single load (of the permute vector) + tbl. 
+#if AOM_ARCH_AARCH64 + const uint8x8x2_t out_yx = { { vreinterpret_u8_u16(out_y), + vreinterpret_u8_u16(out_x) } }; + return vreinterpret_u16_u8( + vtbl2_u8(out_yx, vld1_u8(z2_merge_shuffles_u16x4[base_shift]))); +#else + uint16x4_t out = out_y; + for (int c2 = base_shift, x_idx = 0; c2 < 4; ++c2, ++x_idx) { + out[c2] = out_x[x_idx]; + } + return out; +#endif +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_merge_x8( + uint16x8_t out_x, uint16x8_t out_y, int base_shift) { + assert(base_shift >= 0); + assert(base_shift <= 8); + // On AArch64 we can permute the data from the `above` and `left` vectors + // into a single vector in a single load (of the permute vector) + tbl. +#if AOM_ARCH_AARCH64 + const uint8x16x2_t out_yx = { { vreinterpretq_u8_u16(out_y), + vreinterpretq_u8_u16(out_x) } }; + return vreinterpretq_u16_u8( + vqtbl2q_u8(out_yx, vld1q_u8(z2_merge_shuffles_u16x8[base_shift]))); +#else + uint16x8_t out = out_y; + for (int c2 = base_shift, x_idx = 0; c2 < 8; ++c2, ++x_idx) { + out[c2] = out_x[x_idx]; + } + return out; +#endif +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_apply_shift_x4( + uint16x4_t a0, uint16x4_t a1, int16x4_t shift) { + uint32x4_t res = vmull_u16(a1, vreinterpret_u16_s16(shift)); + res = + vmlal_u16(res, a0, vsub_u16(vdup_n_u16(32), vreinterpret_u16_s16(shift))); + return vrshrn_n_u32(res, 5); +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_apply_shift_x8( + uint16x8_t a0, uint16x8_t a1, int16x8_t shift) { + return vcombine_u16( + highbd_dr_prediction_z2_apply_shift_x4(vget_low_u16(a0), vget_low_u16(a1), + vget_low_s16(shift)), + highbd_dr_prediction_z2_apply_shift_x4( + vget_high_u16(a0), vget_high_u16(a1), vget_high_s16(shift))); +} + +static AOM_FORCE_INLINE uint16x4_t highbd_dr_prediction_z2_step_x4( + const uint16_t *above, const uint16x4_t above0, const uint16x4_t above1, + const uint16_t *left, int dx, int dy, int r, int c) { + const int16x4_t iota = vld1_s16(iota1_s16); + + const int x0 = (c << 6) - (r + 1) * dx; + const int y0 = (r << 6) - (c + 1) * dy; + + const int16x4_t x0123 = vadd_s16(vdup_n_s16(x0), vshl_n_s16(iota, 6)); + const int16x4_t y0123 = vsub_s16(vdup_n_s16(y0), vmul_n_s16(iota, dy)); + const int16x4_t shift_x0123 = + vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); + const int16x4_t shift_y0123 = + vshr_n_s16(vand_s16(y0123, vdup_n_s16(0x3F)), 1); + const int16x4_t base_y0123 = vshr_n_s16(y0123, 6); + + const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; + + // Based on the value of `base_shift` there are three possible cases to + // compute the result: + // 1) base_shift <= 0: We can load and operate entirely on data from the + // `above` input vector. + // 2) base_shift < vl: We can load from `above[-1]` and shift + // `vl - base_shift` elements across to the end of the + // vector, then compute the remainder from `left`. + // 3) base_shift >= vl: We can load and operate entirely on data from the + // `left` input vector. + + if (base_shift <= 0) { + const int base_x = x0 >> 6; + const uint16x4_t a0 = vld1_u16(above + base_x); + const uint16x4_t a1 = vld1_u16(above + base_x + 1); + return highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); + } else if (base_shift < 4) { + const uint16x4x2_t l01 = highbd_dr_prediction_z2_gather_left_x4( + left + 1, base_y0123, base_shift); + const uint16x4_t out16_y = highbd_dr_prediction_z2_apply_shift_x4( + l01.val[0], l01.val[1], shift_y0123); + + // No need to reload from above in the loop, just use pre-loaded constants. 
+ const uint16x4_t out16_x = + highbd_dr_prediction_z2_apply_shift_x4(above0, above1, shift_x0123); + + return highbd_dr_prediction_z2_merge_x4(out16_x, out16_y, base_shift); + } else { + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left + 1, base_y0123, 4); + return highbd_dr_prediction_z2_apply_shift_x4(l01.val[0], l01.val[1], + shift_y0123); + } +} + +static AOM_FORCE_INLINE uint16x8_t highbd_dr_prediction_z2_step_x8( + const uint16_t *above, const uint16x8_t above0, const uint16x8_t above1, + const uint16_t *left, int dx, int dy, int r, int c) { + const int16x8_t iota = vld1q_s16(iota1_s16); + + const int x0 = (c << 6) - (r + 1) * dx; + const int y0 = (r << 6) - (c + 1) * dy; + + const int16x8_t x01234567 = vaddq_s16(vdupq_n_s16(x0), vshlq_n_s16(iota, 6)); + const int16x8_t y01234567 = vsubq_s16(vdupq_n_s16(y0), vmulq_n_s16(iota, dy)); + const int16x8_t shift_x01234567 = + vshrq_n_s16(vandq_s16(x01234567, vdupq_n_s16(0x3F)), 1); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(y01234567, vdupq_n_s16(0x3F)), 1); + const int16x8_t base_y01234567 = vshrq_n_s16(y01234567, 6); + + const int base_shift = ((((r + 1) * dx) - 1) >> 6) - c; + + // Based on the value of `base_shift` there are three possible cases to + // compute the result: + // 1) base_shift <= 0: We can load and operate entirely on data from the + // `above` input vector. + // 2) base_shift < vl: We can load from `above[-1]` and shift + // `vl - base_shift` elements across to the end of the + // vector, then compute the remainder from `left`. + // 3) base_shift >= vl: We can load and operate entirely on data from the + // `left` input vector. + + if (base_shift <= 0) { + const int base_x = x0 >> 6; + const uint16x8_t a0 = vld1q_u16(above + base_x); + const uint16x8_t a1 = vld1q_u16(above + base_x + 1); + return highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); + } else if (base_shift < 8) { + const uint16x8x2_t l01 = highbd_dr_prediction_z2_gather_left_x8( + left + 1, base_y01234567, base_shift); + const uint16x8_t out16_y = highbd_dr_prediction_z2_apply_shift_x8( + l01.val[0], l01.val[1], shift_y01234567); + + // No need to reload from above in the loop, just use pre-loaded constants. + const uint16x8_t out16_x = + highbd_dr_prediction_z2_apply_shift_x8(above0, above1, shift_x01234567); + + return highbd_dr_prediction_z2_merge_x8(out16_x, out16_y, base_shift); + } else { + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left + 1, base_y01234567, 8); + return highbd_dr_prediction_z2_apply_shift_x8(l01.val[0], l01.val[1], + shift_y01234567); + } +} + +// Left array is accessed from -1 through `bh - 1` inclusive. +// Above array is accessed from -1 through `bw - 1` inclusive. 
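+// Each lane written below is the rounded blend produced by
+// highbd_dr_prediction_z2_apply_shift_x4/_x8, i.e. per lane:
+//   out = (a0 * (32 - s) + a1 * s + 16) >> 5   // s is the 5-bit fraction
+// where a0 and a1 are adjacent reference samples taken from `above` or
+// `left` depending on base_shift.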
+#define HIGHBD_DR_PREDICTOR_Z2_WXH(bw, bh) \ + static void highbd_dr_prediction_z2_##bw##x##bh##_neon( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int upsample_above, int upsample_left, int dx, \ + int dy, int bd) { \ + (void)bd; \ + (void)upsample_above; \ + (void)upsample_left; \ + assert(!upsample_above); \ + assert(!upsample_left); \ + assert(bw % 4 == 0); \ + assert(bh % 4 == 0); \ + assert(dx > 0); \ + assert(dy > 0); \ + \ + uint16_t left_data[bh + 1]; \ + memcpy(left_data, left - 1, (bh + 1) * sizeof(uint16_t)); \ + \ + uint16x8_t a0, a1; \ + if (bw == 4) { \ + a0 = vcombine_u16(vld1_u16(above - 1), vdup_n_u16(0)); \ + a1 = vcombine_u16(vld1_u16(above + 0), vdup_n_u16(0)); \ + } else { \ + a0 = vld1q_u16(above - 1); \ + a1 = vld1q_u16(above + 0); \ + } \ + \ + int r = 0; \ + do { \ + if (bw == 4) { \ + vst1_u16(dst, highbd_dr_prediction_z2_step_x4( \ + above, vget_low_u16(a0), vget_low_u16(a1), \ + left_data, dx, dy, r, 0)); \ + } else { \ + int c = 0; \ + do { \ + vst1q_u16(dst + c, highbd_dr_prediction_z2_step_x8( \ + above, a0, a1, left_data, dx, dy, r, c)); \ + c += 8; \ + } while (c < bw); \ + } \ + dst += stride; \ + } while (++r < bh); \ + } + +HIGHBD_DR_PREDICTOR_Z2_WXH(4, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(8, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(8, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 4) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 8) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(16, 64) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 8) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(32, 64) +HIGHBD_DR_PREDICTOR_Z2_WXH(64, 16) +HIGHBD_DR_PREDICTOR_Z2_WXH(64, 32) +HIGHBD_DR_PREDICTOR_Z2_WXH(64, 64) + +#undef HIGHBD_DR_PREDICTOR_Z2_WXH + +typedef void (*highbd_dr_prediction_z2_ptr)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd); + +static void highbd_dr_prediction_z2_4x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 6 inclusive from `left`. + // else we only need -1 through 3 inclusive. 
+ +#if AOM_ARCH_AARCH64 + uint16x8_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16(left - 2); + left_data1 = vld1q_u16(left - 1); + } else { + left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); + left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); + } +#endif + + const int16x4_t iota0123 = vld1_s16(iota1_s16); + const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); + + for (int r = 0; r < 4; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x4_t a0, a1; + int16x4_t shift_x0123; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); + } else { + a0 = vld1_u16(above + base_x0); + a1 = vld1_u16(above + base_x0 + 1); + shift_x0123 = vshr_n_s16(vand_s16(x0123, vdup_n_s16(0x3F)), 1); + } + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); + } else if (base_shift < 4) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, + left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, + left_data_base, y_iters); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x4_t out_y = + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); + + // Calculate X component from `above`. + const int16x4_t shift_x0123 = vshr_n_s16( + vand_s16(vmul_n_s16(x0123, 1 << upsample_above), vdup_n_s16(0x3F)), + 1); + uint16x4_t a0, a1; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + } else { + a0 = vld1_u16(above - 1); + a1 = vld1_u16(above + 0); + } + const uint16x4_t out_x = + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); + + // Combine X and Y vectors. + const uint16x4_t out = + highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); + vst1_u16(dst, out); + } else { + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? 
-2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data0, base_y0123, + left_data_base, 4); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x8(left_data1, base_y0123, + left_data_base, 4); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); + } + dst += stride; + } +} + +static void highbd_dr_prediction_z2_4x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 14 inclusive from `left`. + // else we only need -1 through 6 inclusive. + +#if AOM_ARCH_AARCH64 + uint16x8x2_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16_x2(left - 2); + left_data1 = vld1q_u16_x2(left - 1); + } else { + left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; + left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; + } +#endif + + const int16x4_t iota0123 = vld1_s16(iota1_s16); + const int16x4_t iota1234 = vld1_s16(iota1_s16 + 1); + + for (int r = 0; r < 8; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x4_t x0123 = vsub_s16(vshl_n_s16(iota0123, 6), vdup_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x4_t a0, a1; + int16x4_t shift_x0123; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); + } else { + a0 = vld1_u16(above + base_x0); + a1 = vld1_u16(above + base_x0 + 1); + shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); + } + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123)); + } else if (base_shift < 4) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( + left_data0, base_y0123, left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16( + left_data1, base_y0123, left_data_base, y_iters); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x4_t out_y = + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123); + + // Calculate X component from `above`. + uint16x4_t a0, a1; + int16x4_t shift_x0123; + if (upsample_above) { + const uint16x4x2_t a01 = vld2_u16(above + (base_x0 % 2 == 0 ? 
-2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x0123 = vand_s16(x0123, vdup_n_s16(0x1F)); + } else { + a0 = vld1_u16(above - 1); + a1 = vld1_u16(above + 0); + shift_x0123 = vand_s16(vshr_n_s16(x0123, 1), vdup_n_s16(0x1F)); + } + const uint16x4_t out_x = + highbd_dr_prediction_z2_apply_shift_x4(a0, a1, shift_x0123); + + // Combine X and Y vectors. + const uint16x4_t out = + highbd_dr_prediction_z2_merge_x4(out_x, out_y, base_shift); + vst1_u16(dst, out); + } else { + const int16x4_t y0123 = + vsub_s16(vdup_n_s16(r << 6), vmul_n_s16(iota1234, dy)); + const int16x4_t base_y0123 = vshl_s16(y0123, vdup_n_s16(-frac_bits_y)); + const int16x4_t shift_y0123 = vshr_n_s16( + vand_s16(vmul_n_s16(y0123, 1 << upsample_left), vdup_n_s16(0x3F)), 1); + + uint16x4_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data0, base_y0123, + left_data_base, 4); + l1 = highbd_dr_prediction_z2_tbl_left_x4_from_x16(left_data1, base_y0123, + left_data_base, 4); +#else + const uint16x4x2_t l01 = + highbd_dr_prediction_z2_gather_left_x4(left, base_y0123, 4); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + vst1_u16(dst, + highbd_dr_prediction_z2_apply_shift_x4(l0, l1, shift_y0123)); + } + dst += stride; + } +} + +static void highbd_dr_prediction_z2_8x4_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 6 inclusive from `left`. + // else we only need -1 through 3 inclusive. + +#if AOM_ARCH_AARCH64 + uint16x8_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16(left - 2); + left_data1 = vld1q_u16(left - 1); + } else { + left_data0 = vcombine_u16(vld1_u16(left - 1), vdup_n_u16(0)); + left_data1 = vcombine_u16(vld1_u16(left + 0), vdup_n_u16(0)); + } +#endif + + const int16x8_t iota01234567 = vld1q_s16(iota1_s16); + const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); + + for (int r = 0; r < 4; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x8_t x01234567 = + vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = vld2q_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above + base_x0); + a1 = vld1q_u16(above + base_x0 + 1); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); + } else if (base_shift < 8) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? 
-2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data0, base_y01234567, left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data1, base_y01234567, left_data_base, y_iters); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x8_t out_y = + highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); + + // Calculate X component from `above`. + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = + vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above - 1); + a1 = vld1q_u16(above + 0); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + const uint16x8_t out_x = + highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); + + // Combine X and Y vectors. + const uint16x8_t out = + highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); + vst1q_u16(dst, out); + } else { + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data0, base_y01234567, left_data_base, 8); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x8( + left_data1, base_y01234567, left_data_base, 8); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); + } + dst += stride; + } +} + +static void highbd_dr_prediction_z2_8x8_neon(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, + int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + (void)bd; + assert(dx > 0); + assert(dy > 0); + + const int frac_bits_x = 6 - upsample_above; + const int frac_bits_y = 6 - upsample_left; + const int min_base_x = -(1 << (upsample_above + frac_bits_x)); + + // if `upsample_left` then we need -2 through 14 inclusive from `left`. + // else we only need -1 through 6 inclusive. 
+ +#if AOM_ARCH_AARCH64 + uint16x8x2_t left_data0, left_data1; + if (upsample_left) { + left_data0 = vld1q_u16_x2(left - 2); + left_data1 = vld1q_u16_x2(left - 1); + } else { + left_data0 = (uint16x8x2_t){ { vld1q_u16(left - 1), vdupq_n_u16(0) } }; + left_data1 = (uint16x8x2_t){ { vld1q_u16(left + 0), vdupq_n_u16(0) } }; + } +#endif + + const int16x8_t iota01234567 = vld1q_s16(iota1_s16); + const int16x8_t iota12345678 = vld1q_s16(iota1_s16 + 1); + + for (int r = 0; r < 8; ++r) { + const int base_shift = (min_base_x + (r + 1) * dx + 63) >> 6; + const int x0 = (r + 1) * dx; + const int16x8_t x01234567 = + vsubq_s16(vshlq_n_s16(iota01234567, 6), vdupq_n_s16(x0)); + const int base_x0 = (-x0) >> frac_bits_x; + if (base_shift <= 0) { + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = vld2q_u16(above + base_x0); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above + base_x0); + a1 = vld1q_u16(above + base_x0 + 1); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567)); + } else if (base_shift < 8) { + // Calculate Y component from `left`. + const int y_iters = base_shift; + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? -2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data0, base_y01234567, left_data_base, y_iters); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data1, base_y01234567, left_data_base, y_iters); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, y_iters); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + const uint16x8_t out_y = + highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567); + + // Calculate X component from `above`. + uint16x8_t a0, a1; + int16x8_t shift_x01234567; + if (upsample_above) { + const uint16x8x2_t a01 = + vld2q_u16(above + (base_x0 % 2 == 0 ? -2 : -1)); + a0 = a01.val[0]; + a1 = a01.val[1]; + shift_x01234567 = vandq_s16(x01234567, vdupq_n_s16(0x1F)); + } else { + a0 = vld1q_u16(above - 1); + a1 = vld1q_u16(above + 0); + shift_x01234567 = + vandq_s16(vshrq_n_s16(x01234567, 1), vdupq_n_s16(0x1F)); + } + const uint16x8_t out_x = + highbd_dr_prediction_z2_apply_shift_x8(a0, a1, shift_x01234567); + + // Combine X and Y vectors. + const uint16x8_t out = + highbd_dr_prediction_z2_merge_x8(out_x, out_y, base_shift); + vst1q_u16(dst, out); + } else { + const int16x8_t y01234567 = + vsubq_s16(vdupq_n_s16(r << 6), vmulq_n_s16(iota12345678, dy)); + const int16x8_t base_y01234567 = + vshlq_s16(y01234567, vdupq_n_s16(-frac_bits_y)); + const int16x8_t shift_y01234567 = + vshrq_n_s16(vandq_s16(vmulq_n_s16(y01234567, 1 << upsample_left), + vdupq_n_s16(0x3F)), + 1); + + uint16x8_t l0, l1; +#if AOM_ARCH_AARCH64 + const int left_data_base = upsample_left ? 
-2 : -1; + l0 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data0, base_y01234567, left_data_base, 8); + l1 = highbd_dr_prediction_z2_tbl_left_x8_from_x16( + left_data1, base_y01234567, left_data_base, 8); +#else + const uint16x8x2_t l01 = + highbd_dr_prediction_z2_gather_left_x8(left, base_y01234567, 8); + l0 = l01.val[0]; + l1 = l01.val[1]; +#endif + + vst1q_u16( + dst, highbd_dr_prediction_z2_apply_shift_x8(l0, l1, shift_y01234567)); + } + dst += stride; + } +} + +static highbd_dr_prediction_z2_ptr dr_predictor_z2_arr_neon[7][7] = { + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, &highbd_dr_prediction_z2_4x4_neon, + &highbd_dr_prediction_z2_4x8_neon, &highbd_dr_prediction_z2_4x16_neon, NULL, + NULL }, + { NULL, NULL, &highbd_dr_prediction_z2_8x4_neon, + &highbd_dr_prediction_z2_8x8_neon, &highbd_dr_prediction_z2_8x16_neon, + &highbd_dr_prediction_z2_8x32_neon, NULL }, + { NULL, NULL, &highbd_dr_prediction_z2_16x4_neon, + &highbd_dr_prediction_z2_16x8_neon, &highbd_dr_prediction_z2_16x16_neon, + &highbd_dr_prediction_z2_16x32_neon, &highbd_dr_prediction_z2_16x64_neon }, + { NULL, NULL, NULL, &highbd_dr_prediction_z2_32x8_neon, + &highbd_dr_prediction_z2_32x16_neon, &highbd_dr_prediction_z2_32x32_neon, + &highbd_dr_prediction_z2_32x64_neon }, + { NULL, NULL, NULL, NULL, &highbd_dr_prediction_z2_64x16_neon, + &highbd_dr_prediction_z2_64x32_neon, &highbd_dr_prediction_z2_64x64_neon }, +}; + +// Directional prediction, zone 2: 90 < angle < 180 +void av1_highbd_dr_prediction_z2_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_above, + int upsample_left, int dx, int dy, + int bd) { + highbd_dr_prediction_z2_ptr f = + dr_predictor_z2_arr_neon[get_msb(bw)][get_msb(bh)]; + assert(f != NULL); + f(dst, stride, above, left, upsample_above, upsample_left, dx, dy, bd); +} + +// ----------------------------------------------------------------------------- +// Z3 + +// Both the lane to the use and the shift amount must be immediates. 
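+// (The lane-indexed multiplies vmull_lane_u16/vmlal_lane_u16 and the
+// narrowing shift vrshrn_n_u32 encode the lane and shift in the instruction,
+// so both must be compile-time constants; hence macros rather than helper
+// functions.)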
+#define HIGHBD_DR_PREDICTOR_Z3_STEP_X4(out, iota, base, in0, in1, s0, s1, \ + lane, shift) \ + do { \ + uint32x4_t val = vmull_lane_u16((in0), (s0), (lane)); \ + val = vmlal_lane_u16(val, (in1), (s1), (lane)); \ + const uint16x4_t cmp = vadd_u16((iota), vdup_n_u16(base)); \ + const uint16x4_t res = vrshrn_n_u32(val, (shift)); \ + *(out) = vbsl_u16(vclt_u16(cmp, vdup_n_u16(max_base_y)), res, \ + vdup_n_u16(left_max)); \ + } while (0) + +#define HIGHBD_DR_PREDICTOR_Z3_STEP_X8(out, iota, base, in0, in1, s0, s1, \ + lane, shift) \ + do { \ + uint32x4_t val_lo = vmull_lane_u16(vget_low_u16(in0), (s0), (lane)); \ + val_lo = vmlal_lane_u16(val_lo, vget_low_u16(in1), (s1), (lane)); \ + uint32x4_t val_hi = vmull_lane_u16(vget_high_u16(in0), (s0), (lane)); \ + val_hi = vmlal_lane_u16(val_hi, vget_high_u16(in1), (s1), (lane)); \ + const uint16x8_t cmp = vaddq_u16((iota), vdupq_n_u16(base)); \ + const uint16x8_t res = vcombine_u16(vrshrn_n_u32(val_lo, (shift)), \ + vrshrn_n_u32(val_hi, (shift))); \ + *(out) = vbslq_u16(vcltq_u16(cmp, vdupq_n_u16(max_base_y)), res, \ + vdupq_n_u16(left_max)); \ + } while (0) + +static void highbd_dr_prediction_z3_upsample0_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, const uint16_t *left, + int dy) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dy > 0); + + // Factor out left + 1 to give the compiler a better chance of recognising + // that the offsets used for the loads from left and left + 1 are otherwise + // identical. + const uint16_t *left1 = left + 1; + + const int max_base_y = (bw + bh - 1); + const int left_max = left[max_base_y]; + const int frac_bits = 6; + + const uint16x8_t iota1x8 = vreinterpretq_u16_s16(vld1q_s16(iota1_s16)); + const uint16x4_t iota1x4 = vget_low_u16(iota1x8); + + // The C implementation of the z3 predictor when not upsampling uses: + // ((y & 0x3f) >> 1) + // The right shift is unnecessary here since we instead shift by +1 later, + // so adjust the mask to 0x3e to ensure we don't consider the extra bit. + const uint16x4_t shift_mask = vdup_n_u16(0x3e); + + if (bh == 4) { + int y = dy; + int c = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
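+      // The doubled shift from the 0x3e mask is compensated by narrowing
+      // with >> 6 instead of >> 5 below. Worked example for (y & 0x3f) == 37,
+      // assuming the usual scalar reference formula:
+      //   scalar: s = 18, (l0 * (32 - s) + l1 * s + 16) >> 5
+      //   here:   s = 36, (l0 * (64 - s) + l1 * s + 32) >> 6
+      // which give the same result since every term is simply doubled.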
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); + const int base0 = (y + 0 * dy) >> frac_bits; + const int base1 = (y + 1 * dy) >> frac_bits; + const int base2 = (y + 2 * dy) >> frac_bits; + const int base3 = (y + 3 * dy) >> frac_bits; + uint16x4_t out[4]; + if (base0 >= max_base_y) { + out[0] = vdup_n_u16(left_max); + } else { + const uint16x4_t l00 = vld1_u16(left + base0); + const uint16x4_t l01 = vld1_u16(left1 + base0); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota1x4, base0, l00, l01, + shifts0, shifts1, 0, 6); + } + if (base1 >= max_base_y) { + out[1] = vdup_n_u16(left_max); + } else { + const uint16x4_t l10 = vld1_u16(left + base1); + const uint16x4_t l11 = vld1_u16(left1 + base1); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota1x4, base1, l10, l11, + shifts0, shifts1, 1, 6); + } + if (base2 >= max_base_y) { + out[2] = vdup_n_u16(left_max); + } else { + const uint16x4_t l20 = vld1_u16(left + base2); + const uint16x4_t l21 = vld1_u16(left1 + base2); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota1x4, base2, l20, l21, + shifts0, shifts1, 2, 6); + } + if (base3 >= max_base_y) { + out[3] = vdup_n_u16(left_max); + } else { + const uint16x4_t l30 = vld1_u16(left + base3); + const uint16x4_t l31 = vld1_u16(left1 + base3); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota1x4, base3, l30, l31, + shifts0, shifts1, 3, 6); + } + transpose_array_inplace_u16_4x4(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + r2 * stride + c, out[r2]); + } + y += 4 * dy; + c += 4; + } while (c < bw); + } else { + int y = dy; + int c = 0; + do { + int r = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(64), shifts1); + const int base0 = ((y + 0 * dy) >> frac_bits) + r; + const int base1 = ((y + 1 * dy) >> frac_bits) + r; + const int base2 = ((y + 2 * dy) >> frac_bits) + r; + const int base3 = ((y + 3 * dy) >> frac_bits) + r; + uint16x8_t out[4]; + if (base0 >= max_base_y) { + out[0] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l00 = vld1q_u16(left + base0); + const uint16x8_t l01 = vld1q_u16(left1 + base0); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota1x8, base0, l00, l01, + shifts0, shifts1, 0, 6); + } + if (base1 >= max_base_y) { + out[1] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l10 = vld1q_u16(left + base1); + const uint16x8_t l11 = vld1q_u16(left1 + base1); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota1x8, base1, l10, l11, + shifts0, shifts1, 1, 6); + } + if (base2 >= max_base_y) { + out[2] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l20 = vld1q_u16(left + base2); + const uint16x8_t l21 = vld1q_u16(left1 + base2); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota1x8, base2, l20, l21, + shifts0, shifts1, 2, 6); + } + if (base3 >= max_base_y) { + out[3] = vdupq_n_u16(left_max); + } else { + const uint16x8_t l30 = vld1q_u16(left + base3); + const uint16x8_t l31 = vld1q_u16(left1 + base3); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota1x8, base3, l30, l31, + shifts0, shifts1, 3, 6); + } + transpose_array_inplace_u16_4x8(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); + } + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); + } + r += 8; + } while (r < bh); + y += 4 * dy; + c += 4; + } while (c < bw); + } +} + +static void highbd_dr_prediction_z3_upsample1_neon(uint16_t *dst, + ptrdiff_t stride, int bw, + int bh, const uint16_t *left, + int dy) { + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dy > 0); + + const int max_base_y = (bw + bh - 1) << 1; + const int left_max = left[max_base_y]; + const int frac_bits = 5; + + const uint16x4_t iota1x4 = vreinterpret_u16_s16(vld1_s16(iota1_s16)); + const uint16x8_t iota2x8 = vreinterpretq_u16_s16(vld1q_s16(iota2_s16)); + const uint16x4_t iota2x4 = vget_low_u16(iota2x8); + + // The C implementation of the z3 predictor when upsampling uses: + // (((x << 1) & 0x3f) >> 1) + // The two shifts are unnecessary here since the lowest bit is guaranteed to + // be zero when the mask is applied, so adjust the mask to 0x1f to avoid + // needing the shifts at all. + const uint16x4_t shift_mask = vdup_n_u16(0x1F); + + if (bh == 4) { + int y = dy; + int c = 0; + do { + // Fully unroll the 4x4 block to allow us to use immediate lane-indexed + // multiply instructions. 
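+      // Equivalently ((x << 1) & 0x3f) >> 1 == x & 0x1f, e.g. for x = 23:
+      //   ((23 << 1) & 0x3f) >> 1 == 46 >> 1 == 23 == 23 & 0x1f,
+      // so masking with 0x1f alone recovers the 5-bit fraction and the
+      // >> 5 narrowing in the STEP macros is kept as-is.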
+ const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); + const int base0 = (y + 0 * dy) >> frac_bits; + const int base1 = (y + 1 * dy) >> frac_bits; + const int base2 = (y + 2 * dy) >> frac_bits; + const int base3 = (y + 3 * dy) >> frac_bits; + const uint16x4x2_t l0 = vld2_u16(left + base0); + const uint16x4x2_t l1 = vld2_u16(left + base1); + const uint16x4x2_t l2 = vld2_u16(left + base2); + const uint16x4x2_t l3 = vld2_u16(left + base3); + uint16x4_t out[4]; + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[0], iota2x4, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[1], iota2x4, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[2], iota2x4, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X4(&out[3], iota2x4, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 5); + transpose_array_inplace_u16_4x4(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + r2 * stride + c, out[r2]); + } + y += 4 * dy; + c += 4; + } while (c < bw); + } else { + assert(bh % 8 == 0); + + int y = dy; + int c = 0; + do { + int r = 0; + do { + // Fully unroll the 4x8 block to allow us to use immediate lane-indexed + // multiply instructions. + const uint16x4_t shifts1 = + vand_u16(vmla_n_u16(vdup_n_u16(y), iota1x4, dy), shift_mask); + const uint16x4_t shifts0 = vsub_u16(vdup_n_u16(32), shifts1); + const int base0 = ((y + 0 * dy) >> frac_bits) + (r * 2); + const int base1 = ((y + 1 * dy) >> frac_bits) + (r * 2); + const int base2 = ((y + 2 * dy) >> frac_bits) + (r * 2); + const int base3 = ((y + 3 * dy) >> frac_bits) + (r * 2); + const uint16x8x2_t l0 = vld2q_u16(left + base0); + const uint16x8x2_t l1 = vld2q_u16(left + base1); + const uint16x8x2_t l2 = vld2q_u16(left + base2); + const uint16x8x2_t l3 = vld2q_u16(left + base3); + uint16x8_t out[4]; + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[0], iota2x8, base0, l0.val[0], + l0.val[1], shifts0, shifts1, 0, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[1], iota2x8, base1, l1.val[0], + l1.val[1], shifts0, shifts1, 1, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[2], iota2x8, base2, l2.val[0], + l2.val[1], shifts0, shifts1, 2, 5); + HIGHBD_DR_PREDICTOR_Z3_STEP_X8(&out[3], iota2x8, base3, l3.val[0], + l3.val[1], shifts0, shifts1, 3, 5); + transpose_array_inplace_u16_4x8(out); + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2) * stride + c, vget_low_u16(out[r2])); + } + for (int r2 = 0; r2 < 4; ++r2) { + vst1_u16(dst + (r + r2 + 4) * stride + c, vget_high_u16(out[r2])); + } + r += 8; + } while (r < bh); + y += 4 * dy; + c += 4; + } while (c < bw); + } +} + +// Directional prediction, zone 3: 180 < angle < 270 +void av1_highbd_dr_prediction_z3_neon(uint16_t *dst, ptrdiff_t stride, int bw, + int bh, const uint16_t *above, + const uint16_t *left, int upsample_left, + int dx, int dy, int bd) { + (void)above; + (void)dx; + (void)bd; + assert(bw % 4 == 0); + assert(bh % 4 == 0); + assert(dx == 1); + assert(dy > 0); + + if (upsample_left) { + highbd_dr_prediction_z3_upsample1_neon(dst, stride, bw, bh, left, dy); + } else { + highbd_dr_prediction_z3_upsample0_neon(dst, stride, bw, bh, left, dy); + } +} + +#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X4 +#undef HIGHBD_DR_PREDICTOR_Z3_STEP_X8 diff -Nru aom-3.8.2/aom_dsp/arm/highbd_sse_sve.c aom-3.9.0/aom_dsp/arm/highbd_sse_sve.c --- aom-3.8.2/aom_dsp/arm/highbd_sse_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ 
aom-3.9.0/aom_dsp/arm/highbd_sse_sve.c 2024-05-07 19:57:02.470000000 +0000 @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE void highbd_sse_8x1_neon(const uint16_t *src, const uint16_t *ref, + uint64x2_t *sse) { + uint16x8_t s = vld1q_u16(src); + uint16x8_t r = vld1q_u16(ref); + + uint16x8_t abs_diff = vabdq_u16(s, r); + + *sse = aom_udotq_u16(*sse, abs_diff, abs_diff); +} + +static INLINE int64_t highbd_sse_128xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 8 * 8, ref + 8 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 9 * 8, ref + 9 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 10 * 8, ref + 10 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 11 * 8, ref + 11 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 12 * 8, ref + 12 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 13 * 8, ref + 13 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 14 * 8, ref + 14 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 15 * 8, ref + 15 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + highbd_sse_8x1_neon(src + 4 * 8, ref + 4 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 5 * 8, ref + 5 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 6 * 8, ref + 6 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 7 * 8, ref + 7 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, 
ref + 1 * 8, &sse[1]); + highbd_sse_8x1_neon(src + 2 * 8, ref + 2 * 8, &sse[2]); + highbd_sse_8x1_neon(src + 3 * 8, ref + 3 * 8, &sse[3]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + sse[0] = vaddq_u64(sse[0], sse[1]); + sse[2] = vaddq_u64(sse[2], sse[3]); + sse[0] = vaddq_u64(sse[0], sse[2]); + return vaddvq_u64(sse[0]); +} + +static INLINE int64_t highbd_sse_16xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * 8, ref + 0 * 8, &sse[0]); + highbd_sse_8x1_neon(src + 1 * 8, ref + 1 * 8, &sse[1]); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return vaddvq_u64(vaddq_u64(sse[0], sse[1])); +} + +static INLINE int64_t highbd_sse_8xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + do { + highbd_sse_8x1_neon(src + 0 * src_stride, ref + 0 * ref_stride, &sse[0]); + highbd_sse_8x1_neon(src + 1 * src_stride, ref + 1 * ref_stride, &sse[1]); + + src += 2 * src_stride; + ref += 2 * ref_stride; + height -= 2; + } while (height != 0); + + return vaddvq_u64(vaddq_u64(sse[0], sse[1])); +} + +static INLINE int64_t highbd_sse_4xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int height) { + uint64x2_t sse = vdupq_n_u64(0); + + do { + uint16x8_t s = load_unaligned_u16_4x2(src, src_stride); + uint16x8_t r = load_unaligned_u16_4x2(ref, ref_stride); + + uint16x8_t abs_diff = vabdq_u16(s, r); + sse = aom_udotq_u16(sse, abs_diff, abs_diff); + + src += 2 * src_stride; + ref += 2 * ref_stride; + height -= 2; + } while (height != 0); + + return vaddvq_u64(sse); +} + +static INLINE int64_t highbd_sse_wxh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int width, int height) { + svuint64_t sse = svdup_n_u64(0); + uint64_t step = svcnth(); + + do { + int w = 0; + const uint16_t *src_ptr = src; + const uint16_t *ref_ptr = ref; + + do { + svbool_t pred = svwhilelt_b16_u32(w, width); + svuint16_t s = svld1_u16(pred, src_ptr); + svuint16_t r = svld1_u16(pred, ref_ptr); + + svuint16_t abs_diff = svabd_u16_z(pred, s, r); + + sse = svdot_u64(sse, abs_diff, abs_diff); + + src_ptr += step; + ref_ptr += step; + w += step; + } while (w < width); + + src += src_stride; + ref += ref_stride; + } while (--height != 0); + + return svaddv_u64(svptrue_b64(), sse); +} + +int64_t aom_highbd_sse_sve(const uint8_t *src8, int src_stride, + const uint8_t *ref8, int ref_stride, int width, + int height) { + uint16_t *src = CONVERT_TO_SHORTPTR(src8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + + switch (width) { + case 4: return highbd_sse_4xh_sve(src, src_stride, ref, ref_stride, height); + case 8: return highbd_sse_8xh_sve(src, src_stride, ref, ref_stride, height); + case 16: + return highbd_sse_16xh_sve(src, src_stride, ref, ref_stride, height); + case 32: + return highbd_sse_32xh_sve(src, src_stride, ref, ref_stride, height); + case 64: + return highbd_sse_64xh_sve(src, src_stride, ref, ref_stride, height); + case 128: + return highbd_sse_128xh_sve(src, src_stride, ref, ref_stride, height); + default: + return highbd_sse_wxh_sve(src, src_stride, ref, ref_stride, width, + height); + } +} diff -Nru aom-3.8.2/aom_dsp/arm/highbd_subpel_variance_neon.c aom-3.9.0/aom_dsp/arm/highbd_subpel_variance_neon.c --- aom-3.8.2/aom_dsp/arm/highbd_subpel_variance_neon.c 
2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_subpel_variance_neon.c 2024-05-07 19:57:02.470000000 +0000 @@ -184,40 +184,40 @@ \ if (xoffset == 0) { \ if (yoffset == 0) { \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(src_ptr), src_stride, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_avg(src_ptr, tmp, src_stride, src_stride, w, \ h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp, src_stride, \ src_stride, h, yoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ if (yoffset == 0) { \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ highbd_var_filter_block2d_avg(src_ptr, tmp0, src_stride, 1, w, \ (h + 1)); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -225,21 +225,21 @@ if (yoffset == 0) { \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, h, \ xoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_avg(tmp0, tmp1, w, w, w, h); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, \ (h + 1), xoffset); \ highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ @@ -508,22 +508,22 @@ } while (--i != 0); } -#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ - uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, uint32_t *sse, \ - const uint8_t *second_pred) { \ - uint16_t tmp0[w * (h + 1)]; \ - uint16_t tmp1[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, 
src_stride, 1, (h + 1), \ - xoffset); \ - highbd_avg_pred_var_filter_block2d_bil_w##w( \ - tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ +#define HBD_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ + uint32_t aom_highbd_##bitdepth##_sub_pixel_avg_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, uint32_t *sse, \ + const uint8_t *second_pred) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_avg_pred_var_filter_block2d_bil_w##w( \ + tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ + \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp1), \ + w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(bitdepth, w, h) \ @@ -538,19 +538,19 @@ if (yoffset == 0) { \ highbd_avg_pred(src_ptr, tmp, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp, source_stride, source_stride, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } else { \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp, source_stride, source_stride, h, yoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ @@ -559,7 +559,7 @@ highbd_avg_pred_var_filter_block2d_avg( \ src_ptr, tmp0, source_stride, 1, w, h, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * (h + 1)]; \ @@ -567,7 +567,7 @@ (h + 1)); \ highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * (h + 1)]; \ @@ -575,7 +575,7 @@ (h + 1)); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else { \ @@ -584,7 +584,7 @@ highbd_avg_pred_var_filter_block2d_bil_w##w( \ src_ptr, tmp0, source_stride, 1, h, xoffset, \ CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -592,7 +592,7 @@ (h + 1), xoffset); \ 
highbd_avg_pred_var_filter_block2d_avg( \ tmp0, tmp1, w, w, w, h, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -600,7 +600,7 @@ (h + 1), xoffset); \ highbd_avg_pred_var_filter_block2d_bil_w##w( \ tmp0, tmp1, w, w, h, yoffset, CONVERT_TO_SHORTPTR(second_pred)); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } \ @@ -714,25 +714,25 @@ HBD_SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON(12, 64, 16) #endif // !CONFIG_REALTIME_ONLY -#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ - unsigned int \ - aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t tmp0[w * (h + 1)]; \ - uint16_t tmp1[w * (h + 1)]; \ - uint16_t tmp2[w * h]; \ - uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ - highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ - xoffset); \ - highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ - aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \ - h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ - msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ - CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ +#define HBD_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ + unsigned int \ + aom_highbd_##bitdepth##_masked_sub_pixel_variance##w##x##h##_neon( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t tmp0[w * (h + 1)]; \ + uint16_t tmp1[w * (h + 1)]; \ + uint16_t tmp2[w * h]; \ + uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src); \ + highbd_var_filter_block2d_bil_w##w(src_ptr, tmp0, src_stride, 1, (h + 1), \ + xoffset); \ + highbd_var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset); \ + aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, w, \ + h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ + msk_stride, invert_mask); \ + return aom_highbd_##bitdepth##_variance##w##x##h(CONVERT_TO_BYTEPTR(tmp2), \ + w, ref, ref_stride, sse); \ } #define HBD_SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(bitdepth, w, h) \ @@ -749,7 +749,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp0), second_pred, \ w, h, src, src_stride, msk, msk_stride, \ invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp0), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -758,7 +758,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -767,7 +767,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, 
msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } \ } else if (xoffset == 4) { \ @@ -778,7 +778,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp1[w * h]; \ @@ -789,7 +789,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp1[w * h]; \ @@ -800,7 +800,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } \ } else { \ @@ -812,7 +812,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp1), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp0), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp1), w, ref, ref_stride, sse); \ } else if (yoffset == 4) { \ uint16_t tmp0[w * (h + 1)]; \ @@ -824,7 +824,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } else { \ uint16_t tmp0[w * (h + 1)]; \ @@ -836,7 +836,7 @@ aom_highbd_comp_mask_pred_neon(CONVERT_TO_BYTEPTR(tmp2), second_pred, \ w, h, CONVERT_TO_BYTEPTR(tmp1), w, msk, \ msk_stride, invert_mask); \ - return aom_highbd_##bitdepth##_variance##w##x##h##_neon( \ + return aom_highbd_##bitdepth##_variance##w##x##h( \ CONVERT_TO_BYTEPTR(tmp2), w, ref, ref_stride, sse); \ } \ } \ diff -Nru aom-3.8.2/aom_dsp/arm/highbd_variance_neon.c aom-3.9.0/aom_dsp/arm/highbd_variance_neon.c --- aom-3.8.2/aom_dsp/arm/highbd_variance_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_variance_neon.c 2024-05-07 19:57:02.472000000 +0000 @@ -412,52 +412,34 @@ return *sse; } -static INLINE uint32_t highbd_mse8_8xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 8, h, - sse); -} - -static INLINE uint32_t highbd_mse8_16xh_neon(const uint16_t *src_ptr, - int src_stride, - const uint16_t *ref_ptr, - int ref_stride, int h, - unsigned int *sse) { - return highbd_mse_wxh_neon(src_ptr, src_stride, ref_ptr, ref_stride, 16, h, - sse); -} - -#define HIGHBD_MSE_WXH_NEON(w, h) \ - uint32_t aom_highbd_8_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - 
highbd_mse8_##w##xh_neon(src, src_stride, ref, ref_stride, h, sse); \ - return *sse; \ - } \ - \ - uint32_t aom_highbd_10_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 4); \ - return *sse; \ - } \ - \ - uint32_t aom_highbd_12_mse##w##x##h##_neon( \ - const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ - int ref_stride, uint32_t *sse) { \ - uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ - highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ - *sse = ROUND_POWER_OF_TWO(*sse, 8); \ - return *sse; \ +#define HIGHBD_MSE_WXH_NEON(w, h) \ + uint32_t aom_highbd_8_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_10_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##w##x##h##_neon( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_neon(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ } HIGHBD_MSE_WXH_NEON(16, 16) diff -Nru aom-3.8.2/aom_dsp/arm/highbd_variance_sve.c aom-3.9.0/aom_dsp/arm/highbd_variance_sve.c --- aom-3.8.2/aom_dsp/arm/highbd_variance_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/highbd_variance_sve.c 2024-05-07 19:57:02.473000000 +0000 @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" + +#include "aom_dsp/aom_filter.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/variance.h" + +// Process a block of width 4 two rows at a time. 
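+// (load_unaligned_u16_4x2() is assumed to pack rows `src_ptr` and
+// `src_ptr + src_stride` into a single uint16x8_t, which is why each loop
+// iteration consumes two rows; the running sum of differences uses plain
+// NEON adds while the sum of squared differences is accumulated via the SVE
+// dot product helper aom_sdotq_s16().)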
+static INLINE void highbd_variance_4xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int16x8_t sum_s16 = vdupq_n_s16(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + const uint16x8_t s = load_unaligned_u16_4x2(src_ptr, src_stride); + const uint16x8_t r = load_unaligned_u16_4x2(ref_ptr, ref_stride); + + int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + sum_s16 = vaddq_s16(sum_s16, diff); + + sse_s64 = aom_sdotq_s16(sse_s64, diff, diff); + + src_ptr += 2 * src_stride; + ref_ptr += 2 * ref_stride; + h -= 2; + } while (h != 0); + + *sum = vaddlvq_s16(sum_s16); + *sse = vaddvq_s64(sse_s64); +} + +static INLINE void variance_8x1_sve(const uint16_t *src, const uint16_t *ref, + int32x4_t *sum, int64x2_t *sse) { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + const int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(s, r)); + *sum = vpadalq_s16(*sum, diff); + + *sse = aom_sdotq_s16(*sse, diff, diff); +} + +static INLINE void highbd_variance_8xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, uint64_t *sse, + int64_t *sum) { + int32x4_t sum_s32 = vdupq_n_s32(0); + int64x2_t sse_s64 = vdupq_n_s64(0); + + do { + variance_8x1_sve(src_ptr, ref_ptr, &sum_s32, &sse_s64); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = vaddlvq_s32(sum_s32); + *sse = vaddvq_s64(sse_s64); +} + +static INLINE void highbd_variance_16xh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + int64x2_t sse_s64[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + variance_8x1_sve(src_ptr, ref_ptr, &sum_s32[0], &sse_s64[0]); + variance_8x1_sve(src_ptr + 8, ref_ptr + 8, &sum_s32[1], &sse_s64[1]); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[1])); + *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[1])); +} + +static INLINE void highbd_variance_large_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + uint64_t *sse, int64_t *sum) { + int32x4_t sum_s32[4] = { vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), + vdupq_n_s32(0) }; + int64x2_t sse_s64[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int j = 0; + do { + variance_8x1_sve(src_ptr + j, ref_ptr + j, &sum_s32[0], &sse_s64[0]); + variance_8x1_sve(src_ptr + j + 8, ref_ptr + j + 8, &sum_s32[1], + &sse_s64[1]); + variance_8x1_sve(src_ptr + j + 16, ref_ptr + j + 16, &sum_s32[2], + &sse_s64[2]); + variance_8x1_sve(src_ptr + j + 24, ref_ptr + j + 24, &sum_s32[3], + &sse_s64[3]); + + j += 32; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + sum_s32[0] = vaddq_s32(sum_s32[0], sum_s32[1]); + sum_s32[2] = vaddq_s32(sum_s32[2], sum_s32[3]); + *sum = vaddlvq_s32(vaddq_s32(sum_s32[0], sum_s32[2])); + sse_s64[0] = vaddq_s64(sse_s64[0], sse_s64[1]); + sse_s64[2] = vaddq_s64(sse_s64[2], sse_s64[3]); + *sse = vaddvq_s64(vaddq_s64(sse_s64[0], sse_s64[2])); +} + +static INLINE void highbd_variance_32xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 32, h, sse, sum); +} + +static INLINE void 
highbd_variance_64xh_sve(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + int h, uint64_t *sse, + int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 64, h, sse, sum); +} + +static INLINE void highbd_variance_128xh_sve(const uint16_t *src, + int src_stride, + const uint16_t *ref, + int ref_stride, int h, + uint64_t *sse, int64_t *sum) { + highbd_variance_large_sve(src, src_stride, ref, ref_stride, 128, h, sse, sum); +} + +#define HBD_VARIANCE_WXH_8_SVE(w, h) \ + uint32_t aom_highbd_8_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)sse_long; \ + sum = (int)sum_long; \ + return *sse - (uint32_t)(((int64_t)sum * sum) / (w * h)); \ + } + +#define HBD_VARIANCE_WXH_10_SVE(w, h) \ + uint32_t aom_highbd_10_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 2); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? (uint32_t)var : 0; \ + } + +#define HBD_VARIANCE_WXH_12_SVE(w, h) \ + uint32_t aom_highbd_12_variance##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint64_t sse_long = 0; \ + int64_t sum_long = 0; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_variance_##w##xh_sve(src, src_stride, ref, ref_stride, h, \ + &sse_long, &sum_long); \ + *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8); \ + sum = (int)ROUND_POWER_OF_TWO(sum_long, 4); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) / (w * h)); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ + } + +// 8-bit +HBD_VARIANCE_WXH_8_SVE(4, 4) +HBD_VARIANCE_WXH_8_SVE(4, 8) + +HBD_VARIANCE_WXH_8_SVE(8, 4) +HBD_VARIANCE_WXH_8_SVE(8, 8) +HBD_VARIANCE_WXH_8_SVE(8, 16) + +HBD_VARIANCE_WXH_8_SVE(16, 8) +HBD_VARIANCE_WXH_8_SVE(16, 16) +HBD_VARIANCE_WXH_8_SVE(16, 32) + +HBD_VARIANCE_WXH_8_SVE(32, 16) +HBD_VARIANCE_WXH_8_SVE(32, 32) +HBD_VARIANCE_WXH_8_SVE(32, 64) + +HBD_VARIANCE_WXH_8_SVE(64, 32) +HBD_VARIANCE_WXH_8_SVE(64, 64) +HBD_VARIANCE_WXH_8_SVE(64, 128) + +HBD_VARIANCE_WXH_8_SVE(128, 64) +HBD_VARIANCE_WXH_8_SVE(128, 128) + +// 10-bit +HBD_VARIANCE_WXH_10_SVE(4, 4) +HBD_VARIANCE_WXH_10_SVE(4, 8) + +HBD_VARIANCE_WXH_10_SVE(8, 4) +HBD_VARIANCE_WXH_10_SVE(8, 8) +HBD_VARIANCE_WXH_10_SVE(8, 16) + +HBD_VARIANCE_WXH_10_SVE(16, 8) +HBD_VARIANCE_WXH_10_SVE(16, 16) +HBD_VARIANCE_WXH_10_SVE(16, 32) + +HBD_VARIANCE_WXH_10_SVE(32, 16) +HBD_VARIANCE_WXH_10_SVE(32, 32) +HBD_VARIANCE_WXH_10_SVE(32, 64) + +HBD_VARIANCE_WXH_10_SVE(64, 32) +HBD_VARIANCE_WXH_10_SVE(64, 64) +HBD_VARIANCE_WXH_10_SVE(64, 128) + +HBD_VARIANCE_WXH_10_SVE(128, 64) +HBD_VARIANCE_WXH_10_SVE(128, 128) + +// 12-bit +HBD_VARIANCE_WXH_12_SVE(4, 4) +HBD_VARIANCE_WXH_12_SVE(4, 8) + +HBD_VARIANCE_WXH_12_SVE(8, 4) +HBD_VARIANCE_WXH_12_SVE(8, 8) +HBD_VARIANCE_WXH_12_SVE(8, 16) + +HBD_VARIANCE_WXH_12_SVE(16, 8) +HBD_VARIANCE_WXH_12_SVE(16, 16) +HBD_VARIANCE_WXH_12_SVE(16, 32) + +HBD_VARIANCE_WXH_12_SVE(32, 16) +HBD_VARIANCE_WXH_12_SVE(32, 32) +HBD_VARIANCE_WXH_12_SVE(32, 64) + +HBD_VARIANCE_WXH_12_SVE(64, 32) +HBD_VARIANCE_WXH_12_SVE(64, 64) +HBD_VARIANCE_WXH_12_SVE(64, 128) + +HBD_VARIANCE_WXH_12_SVE(128, 64) +HBD_VARIANCE_WXH_12_SVE(128, 128) + +#if !CONFIG_REALTIME_ONLY +// 8-bit +HBD_VARIANCE_WXH_8_SVE(4, 16) + +HBD_VARIANCE_WXH_8_SVE(8, 32) + +HBD_VARIANCE_WXH_8_SVE(16, 4) +HBD_VARIANCE_WXH_8_SVE(16, 64) + +HBD_VARIANCE_WXH_8_SVE(32, 8) + +HBD_VARIANCE_WXH_8_SVE(64, 16) + +// 10-bit +HBD_VARIANCE_WXH_10_SVE(4, 16) + +HBD_VARIANCE_WXH_10_SVE(8, 32) + +HBD_VARIANCE_WXH_10_SVE(16, 4) +HBD_VARIANCE_WXH_10_SVE(16, 64) + +HBD_VARIANCE_WXH_10_SVE(32, 8) + +HBD_VARIANCE_WXH_10_SVE(64, 16) + +// 12-bit +HBD_VARIANCE_WXH_12_SVE(4, 16) + +HBD_VARIANCE_WXH_12_SVE(8, 32) + +HBD_VARIANCE_WXH_12_SVE(16, 4) +HBD_VARIANCE_WXH_12_SVE(16, 64) + +HBD_VARIANCE_WXH_12_SVE(32, 8) + +HBD_VARIANCE_WXH_12_SVE(64, 16) + +#endif // !CONFIG_REALTIME_ONLY + +#undef HBD_VARIANCE_WXH_8_SVE +#undef HBD_VARIANCE_WXH_10_SVE +#undef HBD_VARIANCE_WXH_12_SVE + +static INLINE uint32_t highbd_mse_wxh_sve(const uint16_t *src_ptr, + int src_stride, + const uint16_t *ref_ptr, + int ref_stride, int w, int h, + unsigned int *sse) { + uint64x2_t sse_u64 = vdupq_n_u64(0); + + do { + int j = 0; + do { + uint16x8_t s = vld1q_u16(src_ptr + j); + uint16x8_t r = vld1q_u16(ref_ptr + j); + + uint16x8_t diff = vabdq_u16(s, r); + + sse_u64 = aom_udotq_u16(sse_u64, diff, diff); + + j += 8; + } while (j < w); + + src_ptr += src_stride; + ref_ptr += ref_stride; + } while (--h != 0); + + *sse = (uint32_t)vaddvq_u64(sse_u64); + return *sse; +} + +#define HIGHBD_MSE_WXH_SVE(w, h) \ + uint32_t aom_highbd_10_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ + int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 4); \ + return *sse; \ + } \ + \ + uint32_t aom_highbd_12_mse##w##x##h##_sve( \ + const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, \ 
+ int ref_stride, uint32_t *sse) { \ + uint16_t *src = CONVERT_TO_SHORTPTR(src_ptr); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref_ptr); \ + highbd_mse_wxh_sve(src, src_stride, ref, ref_stride, w, h, sse); \ + *sse = ROUND_POWER_OF_TWO(*sse, 8); \ + return *sse; \ + } + +HIGHBD_MSE_WXH_SVE(16, 16) +HIGHBD_MSE_WXH_SVE(16, 8) +HIGHBD_MSE_WXH_SVE(8, 16) +HIGHBD_MSE_WXH_SVE(8, 8) + +#undef HIGHBD_MSE_WXH_SVE + +uint64_t aom_mse_wxh_16bit_highbd_sve(uint16_t *dst, int dstride, uint16_t *src, + int sstride, int w, int h) { + assert((w == 8 || w == 4) && (h == 8 || h == 4)); + + uint64x2_t sum = vdupq_n_u64(0); + + if (w == 8) { + do { + uint16x8_t d0 = vld1q_u16(dst + 0 * dstride); + uint16x8_t d1 = vld1q_u16(dst + 1 * dstride); + uint16x8_t s0 = vld1q_u16(src + 0 * sstride); + uint16x8_t s1 = vld1q_u16(src + 1 * sstride); + + uint16x8_t abs_diff0 = vabdq_u16(s0, d0); + uint16x8_t abs_diff1 = vabdq_u16(s1, d1); + + sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); + sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); + + dst += 2 * dstride; + src += 2 * sstride; + h -= 2; + } while (h != 0); + } else { // w == 4 + do { + uint16x8_t d0 = load_unaligned_u16_4x2(dst + 0 * dstride, dstride); + uint16x8_t d1 = load_unaligned_u16_4x2(dst + 2 * dstride, dstride); + uint16x8_t s0 = load_unaligned_u16_4x2(src + 0 * sstride, sstride); + uint16x8_t s1 = load_unaligned_u16_4x2(src + 2 * sstride, sstride); + + uint16x8_t abs_diff0 = vabdq_u16(s0, d0); + uint16x8_t abs_diff1 = vabdq_u16(s1, d1); + + sum = aom_udotq_u16(sum, abs_diff0, abs_diff0); + sum = aom_udotq_u16(sum, abs_diff1, abs_diff1); + + dst += 4 * dstride; + src += 4 * sstride; + h -= 4; + } while (h != 0); + } + + return vaddvq_u64(sum); +} diff -Nru aom-3.8.2/aom_dsp/arm/intrapred_neon.c aom-3.9.0/aom_dsp/arm/intrapred_neon.c --- aom-3.8.2/aom_dsp/arm/intrapred_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/intrapred_neon.c 2024-05-07 19:57:02.475000000 +0000 @@ -11,13 +11,16 @@ #include #include +#include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/reinterpret_neon.h" #include "aom_dsp/arm/sum_neon.h" +#include "aom_dsp/arm/transpose_neon.h" #include "aom_dsp/intrapred_common.h" //------------------------------------------------------------------------------ @@ -33,7 +36,7 @@ static INLINE void dc_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t dc) { for (int i = 0; i < h; ++i) { - store_u8_4x1(dst + i * stride, dc, 0); + store_u8_4x1(dst + i * stride, dc); } } @@ -578,7 +581,7 @@ static INLINE void v_store_4xh(uint8_t *dst, ptrdiff_t stride, int h, uint8x8_t d0) { for (int i = 0; i < h; ++i) { - store_u8_4x1(dst + i * stride, d0, 0); + store_u8_4x1(dst + i * stride, d0); } } @@ -754,14 +757,14 @@ // ----------------------------------------------------------------------------- static INLINE void h_store_4x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { - store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0); - store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0); - store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2), 0); - store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3), 0); - store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4), 0); - store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5), 0); - store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6), 0); - store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7), 0); + store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); + store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 
1)); + store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); + store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); + store_u8_4x1(dst + 4 * stride, vdup_lane_u8(d0, 4)); + store_u8_4x1(dst + 5 * stride, vdup_lane_u8(d0, 5)); + store_u8_4x1(dst + 6 * stride, vdup_lane_u8(d0, 6)); + store_u8_4x1(dst + 7 * stride, vdup_lane_u8(d0, 7)); } static INLINE void h_store_8x8(uint8_t *dst, ptrdiff_t stride, uint8x8_t d0) { @@ -858,10 +861,10 @@ const uint8_t *above, const uint8_t *left) { const uint8x8_t d0 = load_u8_4x1(left); (void)above; - store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0), 0); - store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1), 0); - store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2), 0); - store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3), 0); + store_u8_4x1(dst + 0 * stride, vdup_lane_u8(d0, 0)); + store_u8_4x1(dst + 1 * stride, vdup_lane_u8(d0, 1)); + store_u8_4x1(dst + 2 * stride, vdup_lane_u8(d0, 2)); + store_u8_4x1(dst + 3 * stride, vdup_lane_u8(d0, 3)); } void aom_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, @@ -1045,17 +1048,6 @@ /* ---------------------P R E D I C T I O N Z 1--------------------------- */ -static DECLARE_ALIGNED(16, uint8_t, EvenOddMaskx[8][16]) = { - { 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15 }, - { 0, 1, 3, 5, 7, 9, 11, 13, 0, 2, 4, 6, 8, 10, 12, 14 }, - { 0, 0, 2, 4, 6, 8, 10, 12, 0, 0, 3, 5, 7, 9, 11, 13 }, - { 0, 0, 0, 3, 5, 7, 9, 11, 0, 0, 0, 4, 6, 8, 10, 12 }, - { 0, 0, 0, 0, 4, 6, 8, 10, 0, 0, 0, 0, 5, 7, 9, 11 }, - { 0, 0, 0, 0, 0, 5, 7, 9, 0, 0, 0, 0, 0, 6, 8, 10 }, - { 0, 0, 0, 0, 0, 0, 6, 8, 0, 0, 0, 0, 0, 0, 7, 9 }, - { 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 8 } -}; - // Low bit depth functions static DECLARE_ALIGNED(32, uint8_t, BaseMask[33][32]) = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -1163,9 +1155,7 @@ // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - const uint16x8_t a16 = vdupq_n_u16(16); const uint8x8_t a_mbase_x = vdup_n_u8(above[max_base_x]); - const uint8x8_t v_32 = vdup_n_u8(32); int x = dx; for (int r = 0; r < W; r++) { @@ -1191,7 +1181,7 @@ shift = vdupq_n_u16((x & 0x3f) >> 1); } uint16x8_t diff = vsubl_u8(a01_128.val[1], a01_128.val[0]); - uint16x8_t a32 = vmlal_u8(a16, a01_128.val[0], v_32); + uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a01_128.val[0], vdup_n_u8(32)); uint16x8_t res = vmlaq_u16(a32, diff, shift); uint8x8_t mask = vld1_u8(BaseMask[base_max_diff]); @@ -1240,17 +1230,10 @@ // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - const uint16x8_t a16 = vdupq_n_u16(16); const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); - const uint8x8_t v_32 = vdup_n_u8(32); - const uint8x16_t v_zero = vdupq_n_u8(0); int x = dx; for (int r = 0; r < W; r++) { - uint16x8x2_t res; - uint16x8_t shift; - uint8x16_t a0_128, a1_128; - int base = x >> frac_bits; int base_max_diff = (max_base_x - base) >> upsample_above; if (base_max_diff <= 0) { @@ -1262,25 +1245,28 @@ if (base_max_diff > H) base_max_diff = H; + uint16x8_t shift; + uint8x16_t a0_128, a1_128; if (upsample_above) { uint8x8x2_t v_tmp_a0_128 = vld2_u8(above + base); a0_128 = vcombine_u8(v_tmp_a0_128.val[0], v_tmp_a0_128.val[1]); - a1_128 = vextq_u8(a0_128, v_zero, 8); - shift = vdupq_n_u16(((x << upsample_above) & 0x3f) >> 1); + a1_128 = vextq_u8(a0_128, vdupq_n_u8(0), 8); + shift = vdupq_n_u16(x & 0x1f); } else { a0_128 = vld1q_u8(above + base); a1_128 = vld1q_u8(above + base + 1); shift = vdupq_n_u16((x & 0x3f) >> 1); } - 
uint16x8x2_t diff, a32; - diff.val[0] = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); - diff.val[1] = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); - a32.val[0] = vmlal_u8(a16, vget_low_u8(a0_128), v_32); - a32.val[1] = vmlal_u8(a16, vget_high_u8(a0_128), v_32); - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); uint8x16_t v_temp = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); uint8x16_t mask = vld1q_u8(BaseMask[base_max_diff]); dst[r] = vbslq_u8(mask, v_temp, a_mbase_x); @@ -1301,10 +1287,7 @@ } static AOM_FORCE_INLINE void dr_prediction_z1_32xN_internal_neon( - int N, uint8x16x2_t *dstvec, const uint8_t *above, int upsample_above, - int dx) { - // here upsample_above is 0 by design of av1_use_intra_edge_upsample - (void)upsample_above; + int N, uint8x16x2_t *dstvec, const uint8_t *above, int dx) { const int frac_bits = 6; const int max_base_x = ((32 + N) - 1); @@ -1316,13 +1299,9 @@ // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); - const uint16x8_t a16 = vdupq_n_u16(16); - const uint8x8_t v_32 = vdup_n_u8(32); int x = dx; for (int r = 0; r < N; r++) { - uint8x16_t res16[2]; - int base = x >> frac_bits; int base_max_diff = (max_base_x - base); if (base_max_diff <= 0) { @@ -1336,43 +1315,41 @@ uint16x8_t shift = vdupq_n_u16((x & 0x3f) >> 1); + uint8x16_t res16[2]; for (int j = 0, jj = 0; j < 32; j += 16, jj++) { int mdiff = base_max_diff - j; if (mdiff <= 0) { res16[jj] = a_mbase_x; } else { - uint16x8x2_t a32, diff, res; - uint8x16_t a0_128, a1_128; - a0_128 = vld1q_u8(above + base + j); - a1_128 = vld1q_u8(above + base + j + 1); - diff.val[0] = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); - diff.val[1] = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); - a32.val[0] = vmlal_u8(a16, vget_low_u8(a0_128), v_32); - a32.val[1] = vmlal_u8(a16, vget_high_u8(a0_128), v_32); - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift); + uint8x16_t a0_128 = vld1q_u8(above + base + j); + uint8x16_t a1_128 = vld1q_u8(above + base + j + 1); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = + vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); - res16[jj] = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + res16[jj] = vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); } } - uint8x16x2_t mask; - - mask.val[0] = vld1q_u8(BaseMask[base_max_diff]); - mask.val[1] = vld1q_u8(BaseMask[base_max_diff] + 16); - dstvec[r].val[0] = vbslq_u8(mask.val[0], res16[0], a_mbase_x); - dstvec[r].val[1] = 
vbslq_u8(mask.val[1], res16[1], a_mbase_x); + uint8x16_t mask_lo = vld1q_u8(BaseMask[base_max_diff]); + uint8x16_t mask_hi = vld1q_u8(BaseMask[base_max_diff] + 16); + dstvec[r].val[0] = vbslq_u8(mask_lo, res16[0], a_mbase_x); + dstvec[r].val[1] = vbslq_u8(mask_hi, res16[1], a_mbase_x); x += dx; } } static void dr_prediction_z1_32xN_neon(int N, uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, int upsample_above, - int dx) { + const uint8_t *above, int dx) { uint8x16x2_t dstvec[64]; - dr_prediction_z1_32xN_internal_neon(N, dstvec, above, upsample_above, dx); + dr_prediction_z1_32xN_internal_neon(N, dstvec, above, dx); for (int i = 0; i < N; i++) { vst1q_u8(dst + stride * i, dstvec[i].val[0]); vst1q_u8(dst + stride * i + 16, dstvec[i].val[1]); @@ -1380,10 +1357,7 @@ } static void dr_prediction_z1_64xN_neon(int N, uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, int upsample_above, - int dx) { - // here upsample_above is 0 by design of av1_use_intra_edge_upsample - (void)upsample_above; + const uint8_t *above, int dx) { const int frac_bits = 6; const int max_base_x = ((64 + N) - 1); @@ -1394,12 +1368,8 @@ // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - const uint16x8_t a16 = vdupq_n_u16(16); const uint8x16_t a_mbase_x = vdupq_n_u8(above[max_base_x]); const uint8x16_t max_base_x128 = vdupq_n_u8(max_base_x); - const uint8x8_t v_32 = vdup_n_u8(32); - const uint8x16_t v_zero = vdupq_n_u8(0); - const uint8x16_t step = vdupq_n_u8(16); int x = dx; for (int r = 0; r < N; r++, dst += stride) { @@ -1425,24 +1395,26 @@ if (mdif <= 0) { vst1q_u8(dst + j, a_mbase_x); } else { - uint16x8x2_t a32, diff, res; - uint8x16_t a0_128, a1_128, mask128, res128; - a0_128 = vld1q_u8(above + base + j); - a1_128 = vld1q_u8(above + base + 1 + j); - diff.val[0] = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); - diff.val[1] = vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); - a32.val[0] = vmlal_u8(a16, vget_low_u8(a0_128), v_32); - a32.val[1] = vmlal_u8(a16, vget_high_u8(a0_128), v_32); - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift); + uint8x16_t a0_128 = vld1q_u8(above + base + j); + uint8x16_t a1_128 = vld1q_u8(above + base + 1 + j); + uint16x8_t diff_lo = vsubl_u8(vget_low_u8(a1_128), vget_low_u8(a0_128)); + uint16x8_t diff_hi = + vsubl_u8(vget_high_u8(a1_128), vget_high_u8(a0_128)); + uint16x8_t a32_lo = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_128), vdup_n_u8(32)); + uint16x8_t a32_hi = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_128), vdup_n_u8(32)); + uint16x8_t res_lo = vmlaq_u16(a32_lo, diff_lo, shift); + uint16x8_t res_hi = vmlaq_u16(a32_hi, diff_hi, shift); uint8x16_t v_temp = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + vcombine_u8(vshrn_n_u16(res_lo, 5), vshrn_n_u16(res_hi, 5)); - mask128 = vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), v_zero); - res128 = vbslq_u8(mask128, v_temp, a_mbase_x); + uint8x16_t mask128 = + vcgtq_u8(vqsubq_u8(max_base_x128, base_inc128), vdupq_n_u8(0)); + uint8x16_t res128 = vbslq_u8(mask128, v_temp, a_mbase_x); vst1q_u8(dst + j, res128); - base_inc128 = vaddq_u8(base_inc128, step); + base_inc128 = vaddq_u8(base_inc128, vdupq_n_u8(16)); } } x += dx; @@ -1466,18 +1438,15 @@ case 16: dr_prediction_z1_16xN_neon(bh, dst, stride, above, upsample_above, dx); break; - case 32: - dr_prediction_z1_32xN_neon(bh, dst, stride, above, upsample_above, dx); - break; - case 64: - dr_prediction_z1_64xN_neon(bh, dst, stride, 
above, upsample_above, dx); - break; + case 32: dr_prediction_z1_32xN_neon(bh, dst, stride, above, dx); break; + case 64: dr_prediction_z1_64xN_neon(bh, dst, stride, above, dx); break; default: break; } } /* ---------------------P R E D I C T I O N Z 2--------------------------- */ +#if !AOM_ARCH_AARCH64 static DECLARE_ALIGNED(16, uint8_t, LoadMaskz2[4][16]) = { { 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0 }, @@ -1486,17 +1455,322 @@ { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff } }; +#endif // !AOM_ARCH_AARCH64 -static AOM_FORCE_INLINE void vector_shift_x4(uint8x8_t *vec, uint8x8_t *v_zero, - int shift_value) { - switch (shift_value) { - case 1: *vec = vext_u8(*v_zero, *vec, 7); break; - case 2: *vec = vext_u8(*v_zero, *vec, 6); break; - case 3: *vec = vext_u8(*v_zero, *vec, 5); break; - default: break; +static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_above_neon( + const uint8_t *above, int upsample_above, int dx, int base_x, int y, + uint8x8_t *a0_x, uint8x8_t *a1_x, uint16x4_t *shift0) { + uint16x4_t r6 = vcreate_u16(0x00C0008000400000); + uint16x4_t ydx = vdup_n_u16(y * dx); + if (upsample_above) { + // Cannot use LD2 here since we only want to load eight bytes, but LD2 can + // only load either 16 or 32. + uint8x8_t v_tmp = vld1_u8(above + base_x); + *a0_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[0]; + *a1_x = vuzp_u8(v_tmp, vdup_n_u8(0)).val[1]; + *shift0 = vand_u16(vsub_u16(r6, ydx), vdup_n_u16(0x1f)); + } else { + *a0_x = load_u8_4x1(above + base_x); + *a1_x = load_u8_4x1(above + base_x + 1); + *shift0 = vand_u16(vhsub_u16(r6, ydx), vdup_n_u16(0x1f)); + } +} + +static AOM_FORCE_INLINE void dr_prediction_z2_Nx4_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x2_t left_vals, +#else + const uint8_t *left, +#endif + int upsample_left, int dy, int r, int min_base_y, int frac_bits_y, + uint16x4_t *a0_y, uint16x4_t *a1_y, uint16x4_t *shift1) { + int16x4_t dy64 = vdup_n_s16(dy); + int16x4_t v_1234 = vcreate_s16(0x0004000300020001); + int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y); + int16x4_t min_base_y64 = vdup_n_s16(min_base_y); + int16x4_t v_r6 = vdup_n_s16(r << 6); + int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64); + int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y); + + // Values in base_y_c64 range from -2 through 14 inclusive. 
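// (Lane c holds (r * 64 - (c + 1) * dy) >> frac_bits_y for c = 0..3; the
//  clamp below bounds it at min_base_y so the left[] / TBL lookups that
//  follow stay within the prepared range.)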
+ base_y_c64 = vmax_s16(base_y_c64, min_base_y64); + +#if AOM_ARCH_AARCH64 + uint8x8_t left_idx0 = + vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16] + uint8x8_t left_idx1 = + vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17] + + *a0_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx0)); + *a1_y = vreinterpret_u16_u8(vqtbl2_u8(left_vals, left_idx1)); +#else // !AOM_ARCH_AARCH64 + DECLARE_ALIGNED(32, int16_t, base_y_c[4]); + + vst1_s16(base_y_c, base_y_c64); + uint8x8_t a0_y_u8 = vdup_n_u8(0); + a0_y_u8 = vld1_lane_u8(left + base_y_c[0], a0_y_u8, 0); + a0_y_u8 = vld1_lane_u8(left + base_y_c[1], a0_y_u8, 2); + a0_y_u8 = vld1_lane_u8(left + base_y_c[2], a0_y_u8, 4); + a0_y_u8 = vld1_lane_u8(left + base_y_c[3], a0_y_u8, 6); + + base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1)); + vst1_s16(base_y_c, base_y_c64); + uint8x8_t a1_y_u8 = vdup_n_u8(0); + a1_y_u8 = vld1_lane_u8(left + base_y_c[0], a1_y_u8, 0); + a1_y_u8 = vld1_lane_u8(left + base_y_c[1], a1_y_u8, 2); + a1_y_u8 = vld1_lane_u8(left + base_y_c[2], a1_y_u8, 4); + a1_y_u8 = vld1_lane_u8(left + base_y_c[3], a1_y_u8, 6); + + *a0_y = vreinterpret_u16_u8(a0_y_u8); + *a1_y = vreinterpret_u16_u8(a1_y_u8); +#endif // AOM_ARCH_AARCH64 + + if (upsample_left) { + *shift1 = vand_u16(vreinterpret_u16_s16(y_c64), vdup_n_u16(0x1f)); + } else { + *shift1 = + vand_u16(vshr_n_u16(vreinterpret_u16_s16(y_c64), 1), vdup_n_u16(0x1f)); } } +static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_above_neon( + const uint8_t *above, int upsample_above, int dx, int base_x, int y) { + uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), + vcreate_u16(0x0008000700060005)); + uint16x8_t ydx = vdupq_n_u16(y * dx); + uint16x8_t r6 = vshlq_n_u16(vextq_u16(c1234, vdupq_n_u16(0), 2), 6); + + uint16x8_t shift0; + uint8x8_t a0_x0; + uint8x8_t a1_x0; + if (upsample_above) { + uint8x8x2_t v_tmp = vld2_u8(above + base_x); + a0_x0 = v_tmp.val[0]; + a1_x0 = v_tmp.val[1]; + shift0 = vandq_u16(vsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); + } else { + a0_x0 = vld1_u8(above + base_x); + a1_x0 = vld1_u8(above + base_x + 1); + shift0 = vandq_u16(vhsubq_u16(r6, ydx), vdupq_n_u16(0x1f)); + } + + uint16x8_t diff0 = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] + uint16x8_t a32 = + vmlal_u8(vdupq_n_u16(16), a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 + uint16x8_t res = vmlaq_u16(a32, diff0, shift0); + return vshrn_n_u16(res, 5); +} + +static AOM_FORCE_INLINE uint8x8_t dr_prediction_z2_Nx8_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x3_t left_vals, +#else + const uint8_t *left, +#endif + int upsample_left, int dy, int r, int min_base_y, int frac_bits_y) { + int16x8_t v_r6 = vdupq_n_s16(r << 6); + int16x8_t dy128 = vdupq_n_s16(dy); + int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); + int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); + + uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), + vcreate_u16(0x0008000700060005)); + int16x8_t y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); + int16x8_t base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); + + // Values in base_y_c128 range from -2 through 31 inclusive. 
+ base_y_c128 = vmaxq_s16(base_y_c128, min_base_y128); + +#if AOM_ARCH_AARCH64 + uint8x16_t left_idx0 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] + uint8x16_t left_idx1 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] + uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); + + uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); + uint8x8_t a0_x1 = vget_low_u8(a01_x); + uint8x8_t a1_x1 = vget_high_u8(a01_x); +#else // !AOM_ARCH_AARCH64 + uint8x8_t a0_x1 = load_u8_gather_s16_x8(left, base_y_c128); + uint8x8_t a1_x1 = load_u8_gather_s16_x8(left + 1, base_y_c128); +#endif // AOM_ARCH_AARCH64 + + uint16x8_t shift1; + if (upsample_left) { + shift1 = vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x1f)); + } else { + shift1 = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c128), vdupq_n_u16(0x3f)), 1); + } + + uint16x8_t diff1 = vsubl_u8(a1_x1, a0_x1); + uint16x8_t a32 = vmlal_u8(vdupq_n_u16(16), a0_x1, vdup_n_u8(32)); + uint16x8_t res = vmlaq_u16(a32, diff1, shift1); + return vshrn_n_u16(res, 5); +} + +static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_above_neon( + const uint8_t *above, int dx, int base_x, int y, int j) { + uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)), + vcombine_u16(vcreate_u16(0x000B000A00090008), + vcreate_u16(0x000F000E000D000C)) } }; + uint16x8_t j256 = vdupq_n_u16(j); + uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); + + const uint8x16_t a0_x128 = vld1q_u8(above + base_x + j); + const uint8x16_t a1_x128 = vld1q_u8(above + base_x + j + 1); + uint16x8_t res6_0 = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); + uint16x8_t res6_1 = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); + uint16x8_t shift0 = + vshrq_n_u16(vandq_u16(vsubq_u16(res6_0, ydx), vdupq_n_u16(0x3f)), 1); + uint16x8_t shift1 = + vshrq_n_u16(vandq_u16(vsubq_u16(res6_1, ydx), vdupq_n_u16(0x3f)), 1); + // a[x+1] - a[x] + uint16x8_t diff0 = vsubl_u8(vget_low_u8(a1_x128), vget_low_u8(a0_x128)); + uint16x8_t diff1 = vsubl_u8(vget_high_u8(a1_x128), vget_high_u8(a0_x128)); + // a[x] * 32 + 16 + uint16x8_t a32_0 = + vmlal_u8(vdupq_n_u16(16), vget_low_u8(a0_x128), vdup_n_u8(32)); + uint16x8_t a32_1 = + vmlal_u8(vdupq_n_u16(16), vget_high_u8(a0_x128), vdup_n_u8(32)); + uint16x8_t res0 = vmlaq_u16(a32_0, diff0, shift0); + uint16x8_t res1 = vmlaq_u16(a32_1, diff1, shift1); + return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); +} + +static AOM_FORCE_INLINE uint8x16_t dr_prediction_z2_NxW_left_neon( +#if AOM_ARCH_AARCH64 + uint8x16x4_t left_vals0, uint8x16x4_t left_vals1, +#else + const uint8_t *left, +#endif + int dy, int r, int j) { + // here upsample_above and upsample_left are 0 by design of + // av1_use_intra_edge_upsample + const int min_base_y = -1; + + int16x8_t min_base_y256 = vdupq_n_s16(min_base_y); + int16x8_t half_min_base_y256 = vdupq_n_s16(min_base_y >> 1); + int16x8_t dy256 = vdupq_n_s16(dy); + uint16x8_t j256 = vdupq_n_u16(j); + + uint16x8x2_t c0123 = { { vcombine_u16(vcreate_u16(0x0003000200010000), + vcreate_u16(0x0007000600050004)), + vcombine_u16(vcreate_u16(0x000B000A00090008), + vcreate_u16(0x000F000E000D000C)) } }; + uint16x8x2_t c1234 = { { vaddq_u16(c0123.val[0], vdupq_n_u16(1)), + vaddq_u16(c0123.val[1], vdupq_n_u16(1)) } }; + + int16x8_t v_r6 = vdupq_n_s16(r << 6); + + int16x8_t c256_0 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[0])); + int16x8_t c256_1 = vreinterpretq_s16_u16(vaddq_u16(j256, c1234.val[1])); + int16x8_t mul16_lo = 
vreinterpretq_s16_u16( + vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_0, dy256)), + vreinterpretq_u16_s16(half_min_base_y256))); + int16x8_t mul16_hi = vreinterpretq_s16_u16( + vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256_1, dy256)), + vreinterpretq_u16_s16(half_min_base_y256))); + int16x8_t y_c256_lo = vsubq_s16(v_r6, mul16_lo); + int16x8_t y_c256_hi = vsubq_s16(v_r6, mul16_hi); + + int16x8_t base_y_c256_lo = vshrq_n_s16(y_c256_lo, 6); + int16x8_t base_y_c256_hi = vshrq_n_s16(y_c256_hi, 6); + + base_y_c256_lo = vmaxq_s16(min_base_y256, base_y_c256_lo); + base_y_c256_hi = vmaxq_s16(min_base_y256, base_y_c256_hi); + +#if !AOM_ARCH_AARCH64 + int16_t min_y = vgetq_lane_s16(base_y_c256_hi, 7); + int16_t max_y = vgetq_lane_s16(base_y_c256_lo, 0); + int16_t offset_diff = max_y - min_y; + + uint8x8_t a0_y0; + uint8x8_t a0_y1; + uint8x8_t a1_y0; + uint8x8_t a1_y1; + if (offset_diff < 16) { + // Avoid gathers where the data we want is close together in memory. + // We don't need this for AArch64 since we can already use TBL to cover the + // full range of possible values. + assert(offset_diff >= 0); + int16x8_t min_y256 = vdupq_lane_s16(vget_high_s16(base_y_c256_hi), 3); + + int16x8x2_t base_y_offset; + base_y_offset.val[0] = vsubq_s16(base_y_c256_lo, min_y256); + base_y_offset.val[1] = vsubq_s16(base_y_c256_hi, min_y256); + + int8x16_t base_y_offset128 = vcombine_s8(vqmovn_s16(base_y_offset.val[0]), + vqmovn_s16(base_y_offset.val[1])); + + uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]); + uint8x16_t a0_y128 = vld1q_u8(left + min_y); + uint8x16_t a1_y128 = vld1q_u8(left + min_y + 1); + a0_y128 = vandq_u8(a0_y128, v_loadmaskz2); + a1_y128 = vandq_u8(a1_y128, v_loadmaskz2); + + uint8x8_t v_index_low = vget_low_u8(vreinterpretq_u8_s8(base_y_offset128)); + uint8x8_t v_index_high = + vget_high_u8(vreinterpretq_u8_s8(base_y_offset128)); + uint8x8x2_t v_tmp, v_res; + v_tmp.val[0] = vget_low_u8(a0_y128); + v_tmp.val[1] = vget_high_u8(a0_y128); + v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); + v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); + a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); + v_tmp.val[0] = vget_low_u8(a1_y128); + v_tmp.val[1] = vget_high_u8(a1_y128); + v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); + v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); + a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); + + a0_y0 = vget_low_u8(a0_y128); + a0_y1 = vget_high_u8(a0_y128); + a1_y0 = vget_low_u8(a1_y128); + a1_y1 = vget_high_u8(a1_y128); + } else { + a0_y0 = load_u8_gather_s16_x8(left, base_y_c256_lo); + a0_y1 = load_u8_gather_s16_x8(left, base_y_c256_hi); + a1_y0 = load_u8_gather_s16_x8(left + 1, base_y_c256_lo); + a1_y1 = load_u8_gather_s16_x8(left + 1, base_y_c256_hi); + } +#else + // Values in left_idx{0,1} range from 0 through 63 inclusive. 
+ uint8x16_t left_idx0 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c256_lo, vdupq_n_s16(1))); + uint8x16_t left_idx1 = + vreinterpretq_u8_s16(vaddq_s16(base_y_c256_hi, vdupq_n_s16(1))); + uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); + + uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01); + uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01); + + uint8x8_t a0_y0 = vget_low_u8(a0_y01); + uint8x8_t a0_y1 = vget_high_u8(a0_y01); + uint8x8_t a1_y0 = vget_low_u8(a1_y01); + uint8x8_t a1_y1 = vget_high_u8(a1_y01); +#endif // !AOM_ARCH_AARCH64 + + uint16x8_t shifty_lo = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c256_lo), vdupq_n_u16(0x3f)), 1); + uint16x8_t shifty_hi = vshrq_n_u16( + vandq_u16(vreinterpretq_u16_s16(y_c256_hi), vdupq_n_u16(0x3f)), 1); + + // a[x+1] - a[x] + uint16x8_t diff_lo = vsubl_u8(a1_y0, a0_y0); + uint16x8_t diff_hi = vsubl_u8(a1_y1, a0_y1); + // a[x] * 32 + 16 + uint16x8_t a32_lo = vmlal_u8(vdupq_n_u16(16), a0_y0, vdup_n_u8(32)); + uint16x8_t a32_hi = vmlal_u8(vdupq_n_u16(16), a0_y1, vdup_n_u8(32)); + + uint16x8_t res0 = vmlaq_u16(a32_lo, diff_lo, shifty_lo); + uint16x8_t res1 = vmlaq_u16(a32_hi, diff_hi, shifty_hi); + + return vcombine_u8(vshrn_n_u16(res0, 5), vshrn_n_u16(res1, 5)); +} + static void dr_prediction_z2_Nx4_neon(int N, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int upsample_above, int upsample_left, @@ -1513,20 +1787,6 @@ // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - uint16x8_t a0_x, a1_x, a32, diff; - uint16x8_t v_32 = vdupq_n_u16(32); - uint16x8_t v_zero = vdupq_n_u16(0); - uint16x8_t a16 = vdupq_n_u16(16); - - uint8x8_t v_zero_u8 = vdup_n_u8(0); - uint16x4_t v_c3f = vdup_n_u16(0x3f); - uint16x4_t r6 = vcreate_u16(0x00C0008000400000); - int16x4_t v_upsample_left = vdup_n_s16(upsample_left); - int16x4_t v_upsample_above = vdup_n_s16(upsample_above); - int16x4_t v_1234 = vcreate_s16(0x0004000300020001); - int16x4_t dy64 = vdup_n_s16(dy); - int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y); - int16x4_t min_base_y64 = vdup_n_s16(min_base_y); #if AOM_ARCH_AARCH64 // Use ext rather than loading left + 14 directly to avoid over-read. 
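/* A scalar sketch of the per-pixel blend that the z1/z2 kernels above and
   below implement, i.e. (a[x] * 32 + 16 + (a[x+1] - a[x]) * shift) >> 5 as
   noted in their comments; the helper name dr_blend_scalar_sketch is
   illustrative only. */
static inline uint8_t dr_blend_scalar_sketch(uint8_t a0, uint8_t a1,
                                             int shift) {
  // shift is the 5-bit sub-pixel position, (x & 0x3f) >> 1 in the kernels.
  return (uint8_t)((a0 * 32 + 16 + (a1 - a0) * shift) >> 5);
}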
@@ -1534,140 +1794,76 @@ const uint8x16_t left_0 = vld1q_u8(left); const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14); const uint8x16x2_t left_vals = { { left_m2, left_14 } }; +#define LEFT left_vals +#else // !AOM_ARCH_AARCH64 +#define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < N; r++) { - uint16x8_t res, shift; - uint8x8_t resx, resy; - uint16x4x2_t v_shift; - v_shift.val[1] = vdup_n_u16(0); int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; - int base_shift = 0; - if (base_x < (min_base_x - 1)) { - base_shift = (min_base_x - base_x - 1) >> upsample_above; - } - int base_min_diff = - (min_base_x - base_x + upsample_above) >> upsample_above; - if (base_min_diff > 4) { - base_min_diff = 4; - } else { - if (base_min_diff < 0) base_min_diff = 0; - } + const int base_min_diff = + (min_base_x - ((-y * dx) >> frac_bits_x) + upsample_above) >> + upsample_above; + + if (base_min_diff <= 0) { + uint8x8_t a0_x_u8, a1_x_u8; + uint16x4_t shift0; + dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, + &a0_x_u8, &a1_x_u8, &shift0); + uint8x8_t a0_x = a0_x_u8; + uint8x8_t a1_x = a1_x_u8; + + uint16x8_t diff = vsubl_u8(a1_x, a0_x); // a[x+1] - a[x] + uint16x8_t a32 = + vmlal_u8(vdupq_n_u16(16), a0_x, vdup_n_u8(32)); // a[x] * 32 + 16 + uint16x8_t res = + vmlaq_u16(a32, diff, vcombine_u16(shift0, vdup_n_u16(0))); + uint8x8_t resx = vshrn_n_u16(res, 5); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resx), 0); + } else if (base_min_diff < 4) { + uint8x8_t a0_x_u8, a1_x_u8; + uint16x4_t shift0; + dr_prediction_z2_Nx4_above_neon(above, upsample_above, dx, base_x, y, + &a0_x_u8, &a1_x_u8, &shift0); + uint16x8_t a0_x = vmovl_u8(a0_x_u8); + uint16x8_t a1_x = vmovl_u8(a1_x_u8); + + uint16x4_t a0_y; + uint16x4_t a1_y; + uint16x4_t shift1; + dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, + frac_bits_y, &a0_y, &a1_y, &shift1); + a0_x = vcombine_u16(vget_low_u16(a0_x), a0_y); + a1_x = vcombine_u16(vget_low_u16(a1_x), a1_y); + + uint16x8_t shift = vcombine_u16(shift0, shift1); + uint16x8_t diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x] + uint16x8_t a32 = + vmlaq_n_u16(vdupq_n_u16(16), a0_x, 32); // a[x] * 32 + 16 + uint16x8_t res = vmlaq_u16(a32, diff, shift); + uint8x8_t resx = vshrn_n_u16(res, 5); + uint8x8_t resy = vext_u8(resx, vdup_n_u8(0), 4); - if (base_shift > 3) { - a0_x = v_zero; - a1_x = v_zero; - v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8); - v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8); + uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); + uint8x8_t v_resxy = vbsl_u8(mask, resy, resx); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0); } else { - uint16x4_t ydx = vdup_n_u16(y * dx); - - if (upsample_above) { - uint8x8x2_t v_tmp; - v_tmp.val[0] = vld1_u8(above + base_x + base_shift); - v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8); - uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]); - uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8); - a0_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_low)); - a1_x = vmovl_u8(vtbl2_u8(v_tmp, v_index_high)); - v_shift.val[0] = vshr_n_u16( - vand_u16(vshl_u16(vsub_u16(r6, ydx), v_upsample_above), v_c3f), 1); - } else { - uint8x8_t v_a0_x64 = vld1_u8(above + base_x + base_shift); - vector_shift_x4(&v_a0_x64, &v_zero_u8, base_shift); - uint8x8_t v_a1_x64 = vext_u8(v_a0_x64, v_zero_u8, 1); - v_shift.val[0] = vshr_n_u16(vand_u16(vsub_u16(r6, ydx), v_c3f), 1); - a0_x = vmovl_u8(v_a0_x64); - a1_x = vmovl_u8(v_a1_x64); - } - } - - // y calc - if (base_x < 
min_base_x) { - int16x4_t v_r6 = vdup_n_s16(r << 6); - int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64); - int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y); - uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64); - - // Values in base_y_c64 range from -2 through 14 inclusive. - base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64)); - -#if AOM_ARCH_AARCH64 - uint8x8_t left_idx0 = - vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2))); // [0, 16] - uint8x8_t left_idx1 = - vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3))); // [1, 17] - - uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8); - uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8); -#else // !AOM_ARCH_AARCH64 - DECLARE_ALIGNED(32, int16_t, base_y_c[4]); - - vst1_s16(base_y_c, base_y_c64); - uint8x8_t a0_y = vdup_n_u8(0); - a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0); - a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2); - a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4); - a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6); - - base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1)); - vst1_s16(base_y_c, base_y_c64); - uint8x8_t a1_y = vdup_n_u8(0); - a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0); - a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2); - a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4); - a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6); -#endif // AOM_ARCH_AARCH64 + uint16x4_t a0_y, a1_y; + uint16x4_t shift1; + dr_prediction_z2_Nx4_left_neon(LEFT, upsample_left, dy, r, min_base_y, + frac_bits_y, &a0_y, &a1_y, &shift1); + uint16x4_t diff = vsub_u16(a1_y, a0_y); // a[x+1] - a[x] + uint16x4_t a32 = vmla_n_u16(vdup_n_u16(16), a0_y, 32); // a[x] * 32 + 16 + uint16x4_t res = vmla_u16(a32, diff, shift1); + uint8x8_t resy = vshrn_n_u16(vcombine_u16(res, vdup_n_u16(0)), 5); - if (upsample_left) { - v_shift.val[1] = vshr_n_u16( - vand_u16(vshl_u16(vreinterpret_u16_s16(y_c64), v_upsample_left), - v_c3f), - 1); - } else { - v_shift.val[1] = - vshr_n_u16(vand_u16(vreinterpret_u16_s16(y_c64), v_c3f), 1); - } - - a0_x = vcombine_u16(vget_low_u16(a0_x), vreinterpret_u16_u8(a0_y)); - a1_x = vcombine_u16(vget_low_u16(a1_x), vreinterpret_u16_u8(a1_y)); + vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(resy), 0); } - shift = vcombine_u16(v_shift.val[0], v_shift.val[1]); - diff = vsubq_u16(a1_x, a0_x); // a[x+1] - a[x] - a32 = vmlaq_u16(a16, a0_x, v_32); // a[x] * 32 + 16 - res = vmlaq_u16(a32, diff, shift); - resx = vshrn_n_u16(res, 5); - resy = vext_u8(resx, v_zero_u8, 4); - - uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); - uint8x8_t v_resxy = vbsl_u8(mask, resy, resx); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0); dst += stride; } -} - -static AOM_FORCE_INLINE void vector_shuffle(uint8x16_t *vec, uint8x16_t *vzero, - int shift_value) { - switch (shift_value) { - case 1: *vec = vextq_u8(*vzero, *vec, 15); break; - case 2: *vec = vextq_u8(*vzero, *vec, 14); break; - case 3: *vec = vextq_u8(*vzero, *vec, 13); break; - case 4: *vec = vextq_u8(*vzero, *vec, 12); break; - case 5: *vec = vextq_u8(*vzero, *vec, 11); break; - case 6: *vec = vextq_u8(*vzero, *vec, 10); break; - case 7: *vec = vextq_u8(*vzero, *vec, 9); break; - case 8: *vec = vextq_u8(*vzero, *vec, 8); break; - case 9: *vec = vextq_u8(*vzero, *vec, 7); break; - case 10: *vec = vextq_u8(*vzero, *vec, 6); break; - case 11: *vec = vextq_u8(*vzero, *vec, 5); break; - case 12: *vec = vextq_u8(*vzero, *vec, 4); break; - case 13: *vec = vextq_u8(*vzero, *vec, 3); break; - case 14: *vec = 
vextq_u8(*vzero, *vec, 2); break; - case 15: *vec = vextq_u8(*vzero, *vec, 1); break; - default: break; - } +#undef LEFT } static void dr_prediction_z2_Nx8_neon(int N, uint8_t *dst, ptrdiff_t stride, @@ -1685,18 +1881,6 @@ // above[x+1] - above[x] // final pixels will be calculated as: // (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5 - uint16x8x2_t diff, a32; - uint8x16_t v_zero = vdupq_n_u8(0); - int16x8_t v_upsample_left = vdupq_n_s16(upsample_left); - int16x8_t v_upsample_above = vdupq_n_s16(upsample_above); - int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); - - uint16x8_t a16 = vdupq_n_u16(16); - uint16x8_t c3f = vdupq_n_u16(0x3f); - int16x8_t min_base_y128 = vdupq_n_s16(min_base_y); - int16x8_t dy128 = vdupq_n_s16(dy); - uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001), - vcreate_u16(0x0008000700060005)); #if AOM_ARCH_AARCH64 // Use ext rather than loading left + 30 directly to avoid over-read. @@ -1706,170 +1890,46 @@ const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14); const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14); const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } }; +#define LEFT left_vals +#else // !AOM_ARCH_AARCH64 +#define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < N; r++) { - uint8x8_t resx, resy, resxy; - uint16x8x2_t res, shift; - shift.val[1] = vdupq_n_u16(0); - int y = r + 1; int base_x = (-y * dx) >> frac_bits_x; - int base_shift = 0; - if (base_x < (min_base_x - 1)) { - base_shift = (min_base_x - base_x - 1) >> upsample_above; - } int base_min_diff = (min_base_x - base_x + upsample_above) >> upsample_above; - if (base_min_diff > 8) { - base_min_diff = 8; - } else { - if (base_min_diff < 0) base_min_diff = 0; - } - - uint8x8_t a0_x0, a1_x0; - if (base_shift > 7) { - a0_x0 = vdup_n_u8(0); - a1_x0 = vdup_n_u8(0); - shift.val[0] = vreinterpretq_u16_u8(v_zero); - shift.val[1] = vreinterpretq_u16_u8(v_zero); - } else { - uint16x8_t ydx = vdupq_n_u16(y * dx); - uint16x8_t r6 = - vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6); - - if (upsample_above) { - uint8x8x2_t v_tmp; - v_tmp.val[0] = vld1_u8(above + base_x + base_shift); - v_tmp.val[1] = vld1_u8(above + base_x + base_shift + 8); - uint8x8_t v_index_low = vld1_u8(EvenOddMaskx[base_shift]); - uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8); - shift.val[0] = vshrq_n_u16( - vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1); - a0_x0 = vtbl2_u8(v_tmp, v_index_low); - a1_x0 = vtbl2_u8(v_tmp, v_index_high); - } else { - uint8x16_t a0_x128, a1_x128; - a0_x128 = vld1q_u8(above + base_x + base_shift); - a1_x128 = vextq_u8(a0_x128, v_zero, 1); - vector_shuffle(&a0_x128, &v_zero, base_shift); - vector_shuffle(&a1_x128, &v_zero, base_shift); - shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1); - a0_x0 = vget_low_u8(a0_x128); - a1_x0 = vget_low_u8(a1_x128); - } - } - - diff.val[0] = vsubl_u8(a1_x0, a0_x0); // a[x+1] - a[x] - a32.val[0] = vmlal_u8(a16, a0_x0, vdup_n_u8(32)); // a[x] * 32 + 16 - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]); - resx = vshrn_n_u16(res.val[0], 5); - - // y calc - if (base_x < min_base_x) { - int16x8_t y_c128, base_y_c128; - uint16x8_t mask128; - int16x8_t v_r6 = vdupq_n_s16(r << 6); - - y_c128 = vmlsq_s16(v_r6, vreinterpretq_s16_u16(c1234), dy128); - base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y); - mask128 = vcgtq_s16(min_base_y128, base_y_c128); - - // Values in base_y_c128 range from -2 through 31 inclusive. 
- base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128)); - -#if AOM_ARCH_AARCH64 - uint8x16_t left_idx0 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c128, vdupq_n_s16(2))); // [0, 33] - uint8x16_t left_idx1 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c128, vdupq_n_s16(3))); // [1, 34] - uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); - - uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01); - uint8x8_t a0_x1 = vget_low_u8(a01_x); - uint8x8_t a1_x1 = vget_high_u8(a01_x); -#else // !AOM_ARCH_AARCH64 - DECLARE_ALIGNED(32, int16_t, base_y_c[16]); - - vst1q_s16(base_y_c, base_y_c128); - uint8x8_t a0_x1 = vdup_n_u8(0); - a0_x1 = vld1_lane_u8(left + base_y_c[0], a0_x1, 0); - a0_x1 = vld1_lane_u8(left + base_y_c[1], a0_x1, 1); - a0_x1 = vld1_lane_u8(left + base_y_c[2], a0_x1, 2); - a0_x1 = vld1_lane_u8(left + base_y_c[3], a0_x1, 3); - a0_x1 = vld1_lane_u8(left + base_y_c[4], a0_x1, 4); - a0_x1 = vld1_lane_u8(left + base_y_c[5], a0_x1, 5); - a0_x1 = vld1_lane_u8(left + base_y_c[6], a0_x1, 6); - a0_x1 = vld1_lane_u8(left + base_y_c[7], a0_x1, 7); - - base_y_c128 = vaddq_s16(base_y_c128, vdupq_n_s16(1)); - vst1q_s16(base_y_c, base_y_c128); - uint8x8_t a1_x1 = vdup_n_u8(0); - a1_x1 = vld1_lane_u8(left + base_y_c[0], a1_x1, 0); - a1_x1 = vld1_lane_u8(left + base_y_c[1], a1_x1, 1); - a1_x1 = vld1_lane_u8(left + base_y_c[2], a1_x1, 2); - a1_x1 = vld1_lane_u8(left + base_y_c[3], a1_x1, 3); - a1_x1 = vld1_lane_u8(left + base_y_c[4], a1_x1, 4); - a1_x1 = vld1_lane_u8(left + base_y_c[5], a1_x1, 5); - a1_x1 = vld1_lane_u8(left + base_y_c[6], a1_x1, 6); - a1_x1 = vld1_lane_u8(left + base_y_c[7], a1_x1, 7); -#endif // AOM_ARCH_AARCH64 - - if (upsample_left) { - shift.val[1] = vshrq_n_u16( - vandq_u16(vshlq_u16(vreinterpretq_u16_s16(y_c128), v_upsample_left), - c3f), - 1); - } else { - shift.val[1] = - vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1); - } - diff.val[1] = vsubl_u8(a1_x1, a0_x1); - a32.val[1] = vmlal_u8(a16, a0_x1, vdup_n_u8(32)); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]); - resy = vshrn_n_u16(res.val[1], 5); + if (base_min_diff <= 0) { + uint8x8_t resx = + dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); + vst1_u8(dst, resx); + } else if (base_min_diff < 8) { + uint8x8_t resx = + dr_prediction_z2_Nx8_above_neon(above, upsample_above, dx, base_x, y); + uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( + LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]); - resxy = vbsl_u8(mask, resy, resx); + uint8x8_t resxy = vbsl_u8(mask, resy, resx); vst1_u8(dst, resxy); } else { - vst1_u8(dst, resx); + uint8x8_t resy = dr_prediction_z2_Nx8_left_neon( + LEFT, upsample_left, dy, r, min_base_y, frac_bits_y); + vst1_u8(dst, resy); } dst += stride; } +#undef LEFT } static void dr_prediction_z2_HxW_neon(int H, int W, uint8_t *dst, ptrdiff_t stride, const uint8_t *above, - const uint8_t *left, int upsample_above, - int upsample_left, int dx, int dy) { + const uint8_t *left, int dx, int dy) { // here upsample_above and upsample_left are 0 by design of // av1_use_intra_edge_upsample const int min_base_x = -1; - const int min_base_y = -1; - (void)upsample_above; - (void)upsample_left; - const int frac_bits_x = 6; - const int frac_bits_y = 6; - - uint16x8x2_t a32, c0123, c1234, diff, shifty; - uint8x16x2_t a0_x, a1_x; - uint16x8_t v_32 = vdupq_n_u16(32); - uint8x16_t v_zero = vdupq_n_u8(0); - int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y); - - uint16x8_t a16 = vdupq_n_u16(16); - 
uint16x8_t c1 = vshrq_n_u16(a16, 4); - int16x8_t min_base_y256 = vdupq_n_s16(min_base_y); - uint16x8_t c3f = vdupq_n_u16(0x3f); - int16x8_t dy256 = vdupq_n_s16(dy); - c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000), - vcreate_u16(0x0007000600050004)); - c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008), - vcreate_u16(0x000F000E000D000C)); - c1234.val[0] = vaddq_u16(c0123.val[0], c1); - c1234.val[1] = vaddq_u16(c0123.val[1], c1); #if AOM_ARCH_AARCH64 const uint8x16_t left_m1 = vld1q_u8(left - 1); @@ -1882,241 +1942,36 @@ const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15); const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } }; const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } }; +#define LEFT left_vals0, left_vals1 +#else // !AOM_ARCH_AARCH64 +#define LEFT left #endif // AOM_ARCH_AARCH64 for (int r = 0; r < H; r++) { - uint16x8x2_t res, r6, shift; - uint16x8_t j256; - uint8x16_t resx, resy, resxy; int y = r + 1; - uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx)); - - int base_x = (-y * dx) >> frac_bits_x; + int base_x = (-y * dx) >> 6; for (int j = 0; j < W; j += 16) { - j256 = vdupq_n_u16(j); - - int base_shift = 0; - if ((base_x + j) < (min_base_x - 1)) { - base_shift = (min_base_x - (base_x + j) - 1); - } - int base_min_diff = (min_base_x - base_x - j); - if (base_min_diff > 16) { - base_min_diff = 16; - } else { - if (base_min_diff < 0) base_min_diff = 0; - } - - if (base_shift < 16) { - uint8x16_t a0_x128, a1_x128; - a0_x128 = vld1q_u8(above + base_x + base_shift + j); - a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j); - vector_shuffle(&a0_x128, &v_zero, base_shift); - vector_shuffle(&a1_x128, &v_zero, base_shift); - a0_x = vzipq_u8(a0_x128, v_zero); - a1_x = vzipq_u8(a1_x128, v_zero); - r6.val[0] = vshlq_n_u16(vaddq_u16(c0123.val[0], j256), 6); - r6.val[1] = vshlq_n_u16(vaddq_u16(c0123.val[1], j256), 6); - shift.val[0] = - vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[0], ydx), c3f), 1); - shift.val[1] = - vshrq_n_u16(vandq_u16(vsubq_u16(r6.val[1], ydx), c3f), 1); - diff.val[0] = - vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]), - vreinterpretq_u16_u8(a0_x.val[0])); // a[x+1] - a[x] - diff.val[1] = - vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]), - vreinterpretq_u16_u8(a0_x.val[1])); // a[x+1] - a[x] - a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]), - v_32); // a[x] * 32 + 16 - a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]), - v_32); // a[x] * 32 + 16 - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]); - resx = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); - } else { - resx = v_zero; - } - - // y calc - if (base_x < min_base_x) { - uint16x8x2_t mask256; - int16x8x2_t c256, y_c256, base_y_c256, mul16; - int16x8_t v_r6 = vdupq_n_s16(r << 6); - - c256.val[0] = vaddq_s16(vreinterpretq_s16_u16(j256), - vreinterpretq_s16_u16(c1234.val[0])); - c256.val[1] = vaddq_s16(vreinterpretq_s16_u16(j256), - vreinterpretq_s16_u16(c1234.val[1])); - mul16.val[0] = vreinterpretq_s16_u16( - vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256.val[0], dy256)), - vshrq_n_u16(vreinterpretq_u16_s16(min_base_y256), 1))); - mul16.val[1] = vreinterpretq_s16_u16( - vminq_u16(vreinterpretq_u16_s16(vmulq_s16(c256.val[1], dy256)), - vshrq_n_u16(vreinterpretq_u16_s16(min_base_y256), 1))); - y_c256.val[0] = vsubq_s16(v_r6, mul16.val[0]); - y_c256.val[1] = vsubq_s16(v_r6, mul16.val[1]); - - base_y_c256.val[0] = 
vshlq_s16(y_c256.val[0], v_frac_bits_y); - base_y_c256.val[1] = vshlq_s16(y_c256.val[1], v_frac_bits_y); - mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]); - mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]); - - base_y_c256.val[0] = - vbslq_s16(mask256.val[0], min_base_y256, base_y_c256.val[0]); - base_y_c256.val[1] = - vbslq_s16(mask256.val[1], min_base_y256, base_y_c256.val[1]); - - int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7); - int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0); - int16_t offset_diff = max_y - min_y; - - uint8x8_t a0_y0; - uint8x8_t a0_y1; - uint8x8_t a1_y0; - uint8x8_t a1_y1; - - if (offset_diff < 16) { - assert(offset_diff >= 0); - int16x8_t min_y256 = - vdupq_lane_s16(vget_high_s16(base_y_c256.val[1]), 3); - - int16x8x2_t base_y_offset; - base_y_offset.val[0] = vsubq_s16(base_y_c256.val[0], min_y256); - base_y_offset.val[1] = vsubq_s16(base_y_c256.val[1], min_y256); - - int8x16_t base_y_offset128 = - vcombine_s8(vqmovn_s16(base_y_offset.val[0]), - vqmovn_s16(base_y_offset.val[1])); - - uint8x16_t a0_y128, a1_y128; - uint8x16_t v_loadmaskz2 = vld1q_u8(LoadMaskz2[offset_diff / 4]); - a0_y128 = vld1q_u8(left + min_y); - a0_y128 = vandq_u8(a0_y128, v_loadmaskz2); - a1_y128 = vld1q_u8(left + min_y + 1); - a1_y128 = vandq_u8(a1_y128, v_loadmaskz2); -#if AOM_ARCH_AARCH64 - a0_y128 = vqtbl1q_u8(a0_y128, vreinterpretq_u8_s8(base_y_offset128)); - a1_y128 = vqtbl1q_u8(a1_y128, vreinterpretq_u8_s8(base_y_offset128)); -#else - uint8x8x2_t v_tmp; - uint8x8x2_t v_res; - uint8x8_t v_index_low = - vget_low_u8(vreinterpretq_u8_s8(base_y_offset128)); - uint8x8_t v_index_high = - vget_high_u8(vreinterpretq_u8_s8(base_y_offset128)); - v_tmp.val[0] = vget_low_u8(a0_y128); - v_tmp.val[1] = vget_high_u8(a0_y128); - v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); - v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); - a0_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); - v_tmp.val[0] = vget_low_u8(a1_y128); - v_tmp.val[1] = vget_high_u8(a1_y128); - v_res.val[0] = vtbl2_u8(v_tmp, v_index_low); - v_res.val[1] = vtbl2_u8(v_tmp, v_index_high); - a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]); -#endif - a0_y0 = vget_low_u8(a0_y128); - a0_y1 = vget_high_u8(a0_y128); - a1_y0 = vget_low_u8(a1_y128); - a1_y1 = vget_high_u8(a1_y128); - } else { - // Values in base_y_c256 range from -1 through 62 inclusive. - base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0], - vreinterpretq_s16_u16(mask256.val[0])); - base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1], - vreinterpretq_s16_u16(mask256.val[1])); - -#if AOM_ARCH_AARCH64 - // Values in left_idx{0,1} range from 0 through 63 inclusive. 
- uint8x16_t left_idx0 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1))); - uint8x16_t left_idx1 = vreinterpretq_u8_s16( - vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1))); - - uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1); - - uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01); - uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01); - - a0_y0 = vget_low_u8(a0_y01); - a0_y1 = vget_high_u8(a0_y01); - a1_y0 = vget_low_u8(a1_y01); - a1_y1 = vget_high_u8(a1_y01); -#else // !AOM_ARCH_AARCH64 - DECLARE_ALIGNED(32, int16_t, base_y_c[16]); - - vst1q_s16(base_y_c, base_y_c256.val[0]); - vst1q_s16(base_y_c + 8, base_y_c256.val[1]); - a0_y0 = vdup_n_u8(0); - a0_y0 = vld1_lane_u8(left + base_y_c[0], a0_y0, 0); - a0_y0 = vld1_lane_u8(left + base_y_c[1], a0_y0, 1); - a0_y0 = vld1_lane_u8(left + base_y_c[2], a0_y0, 2); - a0_y0 = vld1_lane_u8(left + base_y_c[3], a0_y0, 3); - a0_y0 = vld1_lane_u8(left + base_y_c[4], a0_y0, 4); - a0_y0 = vld1_lane_u8(left + base_y_c[5], a0_y0, 5); - a0_y0 = vld1_lane_u8(left + base_y_c[6], a0_y0, 6); - a0_y0 = vld1_lane_u8(left + base_y_c[7], a0_y0, 7); - a0_y1 = vdup_n_u8(0); - a0_y1 = vld1_lane_u8(left + base_y_c[8], a0_y1, 0); - a0_y1 = vld1_lane_u8(left + base_y_c[9], a0_y1, 1); - a0_y1 = vld1_lane_u8(left + base_y_c[10], a0_y1, 2); - a0_y1 = vld1_lane_u8(left + base_y_c[11], a0_y1, 3); - a0_y1 = vld1_lane_u8(left + base_y_c[12], a0_y1, 4); - a0_y1 = vld1_lane_u8(left + base_y_c[13], a0_y1, 5); - a0_y1 = vld1_lane_u8(left + base_y_c[14], a0_y1, 6); - a0_y1 = vld1_lane_u8(left + base_y_c[15], a0_y1, 7); - - base_y_c256.val[0] = - vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1)); - base_y_c256.val[1] = - vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1)); - - vst1q_s16(base_y_c, base_y_c256.val[0]); - vst1q_s16(base_y_c + 8, base_y_c256.val[1]); - a1_y0 = vdup_n_u8(0); - a1_y0 = vld1_lane_u8(left + base_y_c[0], a1_y0, 0); - a1_y0 = vld1_lane_u8(left + base_y_c[1], a1_y0, 1); - a1_y0 = vld1_lane_u8(left + base_y_c[2], a1_y0, 2); - a1_y0 = vld1_lane_u8(left + base_y_c[3], a1_y0, 3); - a1_y0 = vld1_lane_u8(left + base_y_c[4], a1_y0, 4); - a1_y0 = vld1_lane_u8(left + base_y_c[5], a1_y0, 5); - a1_y0 = vld1_lane_u8(left + base_y_c[6], a1_y0, 6); - a1_y0 = vld1_lane_u8(left + base_y_c[7], a1_y0, 7); - a1_y1 = vdup_n_u8(0); - a1_y1 = vld1_lane_u8(left + base_y_c[8], a1_y1, 0); - a1_y1 = vld1_lane_u8(left + base_y_c[9], a1_y1, 1); - a1_y1 = vld1_lane_u8(left + base_y_c[10], a1_y1, 2); - a1_y1 = vld1_lane_u8(left + base_y_c[11], a1_y1, 3); - a1_y1 = vld1_lane_u8(left + base_y_c[12], a1_y1, 4); - a1_y1 = vld1_lane_u8(left + base_y_c[13], a1_y1, 5); - a1_y1 = vld1_lane_u8(left + base_y_c[14], a1_y1, 6); - a1_y1 = vld1_lane_u8(left + base_y_c[15], a1_y1, 7); -#endif // AOM_ARCH_AARCH64 - } + const int base_min_diff = min_base_x - base_x - j; - shifty.val[0] = vshrq_n_u16( - vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1); - shifty.val[1] = vshrq_n_u16( - vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1); - diff.val[0] = vsubl_u8(a1_y0, a0_y0); // a[x+1] - a[x] - diff.val[1] = vsubl_u8(a1_y1, a0_y1); // a[x+1] - a[x] - a32.val[0] = vmlal_u8(a16, a0_y0, vdup_n_u8(32)); // a[x] * 32 + 16 - a32.val[1] = vmlal_u8(a16, a0_y1, vdup_n_u8(32)); // a[x] * 32 + 16 - res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]); - res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]); - - resy = - vcombine_u8(vshrn_n_u16(res.val[0], 5), vshrn_n_u16(res.val[1], 5)); + if (base_min_diff <= 0) { + uint8x16_t resx = + 
dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); + vst1q_u8(dst + j, resx); + } else if (base_min_diff < 16) { + uint8x16_t resx = + dr_prediction_z2_NxW_above_neon(above, dx, base_x, y, j); + uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); + uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]); + uint8x16_t resxy = vbslq_u8(mask, resy, resx); + vst1q_u8(dst + j, resxy); } else { - resy = v_zero; + uint8x16_t resy = dr_prediction_z2_NxW_left_neon(LEFT, dy, r, j); + vst1q_u8(dst + j, resy); } - uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]); - resxy = vbslq_u8(mask, resy, resx); - vst1q_u8(dst + j, resxy); } // for j dst += stride; } +#undef LEFT } // Directional prediction, zone 2: 90 < angle < 180 @@ -2137,626 +1992,67 @@ upsample_left, dx, dy); break; default: - dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, - upsample_above, upsample_left, dx, dy); + dr_prediction_z2_HxW_neon(bh, bw, dst, stride, above, left, dx, dy); break; } } /* ---------------------P R E D I C T I O N Z 3--------------------------- */ -static AOM_FORCE_INLINE void transpose4x16_neon(uint8x16_t *x, - uint16x8x2_t *d) { - uint8x16x2_t w0, w1; - - w0 = vzipq_u8(x[0], x[1]); - w1 = vzipq_u8(x[2], x[3]); - - d[0] = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - d[1] = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); -} - -static AOM_FORCE_INLINE void transpose4x8_8x4_low_neon(uint8x8_t *x, - uint16x4x2_t *d) { - uint8x8x2_t w0, w1; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - - *d = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); -} - -static AOM_FORCE_INLINE void transpose4x8_8x4_neon(uint8x8_t *x, - uint16x4x2_t *d) { - uint8x8x2_t w0, w1; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - - d[0] = - vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - d[1] = - vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); -} - -static AOM_FORCE_INLINE void transpose8x8_low_neon(uint8x8_t *x, - uint32x2x2_t *d) { - uint8x8x2_t w0, w1, w2, w3; - uint16x4x2_t w4, w5; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - w2 = vzip_u8(x[4], x[5]); - w3 = vzip_u8(x[6], x[7]); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); - - d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); -} - -static AOM_FORCE_INLINE void transpose8x8_neon(uint8x8_t *x, uint32x2x2_t *d) { - uint8x8x2_t w0, w1, w2, w3; - uint16x4x2_t w4, w5, w6, w7; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - w2 = vzip_u8(x[4], x[5]); - w3 = vzip_u8(x[6], x[7]); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); - - d[0] = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - d[1] = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); - - w6 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); - w7 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); - - d[2] = vzip_u32(vreinterpret_u32_u16(w6.val[0]), - vreinterpret_u32_u16(w7.val[0])); - d[3] = vzip_u32(vreinterpret_u32_u16(w6.val[1]), - 
vreinterpret_u32_u16(w7.val[1])); -} - -static AOM_FORCE_INLINE void transpose16x8_8x16_neon(uint8x8_t *x, - uint64x2_t *d) { - uint8x8x2_t w0, w1, w2, w3, w8, w9, w10, w11; - uint16x4x2_t w4, w5, w12, w13; - uint32x2x2_t w6, w7, w14, w15; - - w0 = vzip_u8(x[0], x[1]); - w1 = vzip_u8(x[2], x[3]); - w2 = vzip_u8(x[4], x[5]); - w3 = vzip_u8(x[6], x[7]); - - w8 = vzip_u8(x[8], x[9]); - w9 = vzip_u8(x[10], x[11]); - w10 = vzip_u8(x[12], x[13]); - w11 = vzip_u8(x[14], x[15]); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); - w12 = - vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0])); - w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]), - vreinterpret_u16_u8(w11.val[0])); - - w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); - w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), - vreinterpret_u32_u16(w13.val[0])); - w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), - vreinterpret_u32_u16(w13.val[1])); - - // Store first 4-line result - d[0] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]), - vreinterpret_u64_u32(w14.val[0])); - d[1] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]), - vreinterpret_u64_u32(w14.val[1])); - d[2] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]), - vreinterpret_u64_u32(w15.val[0])); - d[3] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]), - vreinterpret_u64_u32(w15.val[1])); - - w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); - w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); - w12 = - vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1])); - w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]), - vreinterpret_u16_u8(w11.val[1])); - - w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), - vreinterpret_u32_u16(w5.val[0])); - w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), - vreinterpret_u32_u16(w5.val[1])); - w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), - vreinterpret_u32_u16(w13.val[0])); - w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), - vreinterpret_u32_u16(w13.val[1])); - - // Store second 4-line result - d[4] = vcombine_u64(vreinterpret_u64_u32(w6.val[0]), - vreinterpret_u64_u32(w14.val[0])); - d[5] = vcombine_u64(vreinterpret_u64_u32(w6.val[1]), - vreinterpret_u64_u32(w14.val[1])); - d[6] = vcombine_u64(vreinterpret_u64_u32(w7.val[0]), - vreinterpret_u64_u32(w15.val[0])); - d[7] = vcombine_u64(vreinterpret_u64_u32(w7.val[1]), - vreinterpret_u64_u32(w15.val[1])); -} - -static AOM_FORCE_INLINE void transpose8x16_16x8_neon(uint8x16_t *x, - uint64x2_t *d) { - uint8x16x2_t w0, w1, w2, w3; - uint16x8x2_t w4, w5, w6, w7; - uint32x4x2_t w8, w9, w10, w11; - - w0 = vzipq_u8(x[0], x[1]); - w1 = vzipq_u8(x[2], x[3]); - w2 = vzipq_u8(x[4], x[5]); - w3 = vzipq_u8(x[6], x[7]); - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - - w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]), - vreinterpretq_u32_u16(w7.val[0])); - w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - 
vreinterpretq_u32_u16(w5.val[1])); - w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]), - vreinterpretq_u32_u16(w7.val[1])); +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_16x4(const uint8x16_t *x, + uint8x16x2_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); -#if AOM_ARCH_AARCH64 - d[0] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[0]), - vreinterpretq_u64_u32(w9.val[0])); - d[1] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[0]), - vreinterpretq_u64_u32(w9.val[0])); - d[2] = vzip1q_u64(vreinterpretq_u64_u32(w8.val[1]), - vreinterpretq_u64_u32(w9.val[1])); - d[3] = vzip2q_u64(vreinterpretq_u64_u32(w8.val[1]), - vreinterpretq_u64_u32(w9.val[1])); - d[4] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[0]), - vreinterpretq_u64_u32(w11.val[0])); - d[5] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[0]), - vreinterpretq_u64_u32(w11.val[0])); - d[6] = vzip1q_u64(vreinterpretq_u64_u32(w10.val[1]), - vreinterpretq_u64_u32(w11.val[1])); - d[7] = vzip2q_u64(vreinterpretq_u64_u32(w10.val[1]), - vreinterpretq_u64_u32(w11.val[1])); -#else - d[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w8.val[0]), vget_low_u32(w9.val[0]))); - d[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w8.val[0]), vget_high_u32(w9.val[0]))); - d[2] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w8.val[1]), vget_low_u32(w9.val[1]))); - d[3] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w8.val[1]), vget_high_u32(w9.val[1]))); - d[4] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w10.val[0]), vget_low_u32(w11.val[0]))); - d[5] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w10.val[0]), vget_high_u32(w11.val[0]))); - d[6] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w10.val[1]), vget_low_u32(w11.val[1]))); - d[7] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w10.val[1]), vget_high_u32(w11.val[1]))); -#endif + d[0] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0]))); + d[1] = aom_reinterpretq_u8_u16_x2(vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1]))); } -static AOM_FORCE_INLINE void transpose16x16_neon(uint8x16_t *x, uint64x2_t *d) { - uint8x16x2_t w0, w1, w2, w3, w4, w5, w6, w7; - uint16x8x2_t w8, w9, w10, w11; - uint32x4x2_t w12, w13, w14, w15; - - w0 = vzipq_u8(x[0], x[1]); - w1 = vzipq_u8(x[2], x[3]); - w2 = vzipq_u8(x[4], x[5]); - w3 = vzipq_u8(x[6], x[7]); - - w4 = vzipq_u8(x[8], x[9]); - w5 = vzipq_u8(x[10], x[11]); - w6 = vzipq_u8(x[12], x[13]); - w7 = vzipq_u8(x[14], x[15]); - - w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]), - vreinterpretq_u16_u8(w5.val[0])); - w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]), - vreinterpretq_u16_u8(w7.val[0])); - - w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), - vreinterpretq_u32_u16(w9.val[0])); - w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), - vreinterpretq_u32_u16(w11.val[0])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), - vreinterpretq_u32_u16(w9.val[1])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), - vreinterpretq_u32_u16(w11.val[1])); - -#if AOM_ARCH_AARCH64 - d[0] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[1] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[2] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]), - 
vreinterpretq_u64_u32(w13.val[1])); - d[3] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[4] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[5] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[6] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[7] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0]))); - d[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0]))); - d[2] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1]))); - d[3] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1]))); - d[4] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0]))); - d[5] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0]))); - d[6] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1]))); - d[7] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1]))); -#endif +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_4x4(const uint8x8_t *x, + uint8x8x2_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); - // upper half - w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]), - vreinterpretq_u16_u8(w5.val[1])); - w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]), - vreinterpretq_u16_u8(w7.val[1])); - - w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), - vreinterpretq_u32_u16(w9.val[0])); - w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), - vreinterpretq_u32_u16(w11.val[0])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), - vreinterpretq_u32_u16(w9.val[1])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), - vreinterpretq_u32_u16(w11.val[1])); - -#if AOM_ARCH_AARCH64 - d[8] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[9] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[0]), - vreinterpretq_u64_u32(w13.val[0])); - d[10] = vzip1q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[11] = vzip2q_u64(vreinterpretq_u64_u32(w12.val[1]), - vreinterpretq_u64_u32(w13.val[1])); - d[12] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[13] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[14] = vzip1q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[15] = vzip2q_u64(vreinterpretq_u64_u32(w14.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[8] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[0]), vget_low_u32(w13.val[0]))); - d[9] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[0]), vget_high_u32(w13.val[0]))); - d[10] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w12.val[1]), vget_low_u32(w13.val[1]))); - d[11] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w12.val[1]), vget_high_u32(w13.val[1]))); - d[12] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[0]), vget_low_u32(w15.val[0]))); - d[13] = 
vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[0]), vget_high_u32(w15.val[0]))); - d[14] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w14.val[1]), vget_low_u32(w15.val[1]))); - d[15] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w14.val[1]), vget_high_u32(w15.val[1]))); -#endif + *d = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); } -static AOM_FORCE_INLINE void transpose16x32_neon(uint8x16x2_t *x, - uint64x2x2_t *d) { - uint8x16x2_t w0, w1, w2, w3, w8, w9, w10, w11; - uint16x8x2_t w4, w5, w12, w13; - uint32x4x2_t w6, w7, w14, w15; - - w0 = vzipq_u8(x[0].val[0], x[1].val[0]); - w1 = vzipq_u8(x[2].val[0], x[3].val[0]); - w2 = vzipq_u8(x[4].val[0], x[5].val[0]); - w3 = vzipq_u8(x[6].val[0], x[7].val[0]); - - w8 = vzipq_u8(x[8].val[0], x[9].val[0]); - w9 = vzipq_u8(x[10].val[0], x[11].val[0]); - w10 = vzipq_u8(x[12].val[0], x[13].val[0]); - w11 = vzipq_u8(x[14].val[0], x[15].val[0]); - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]), - vreinterpretq_u16_u8(w9.val[0])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]), - vreinterpretq_u16_u8(w11.val[0])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store first 4-line result - -#if AOM_ARCH_AARCH64 - d[0].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[0].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[1].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[1].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[2].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[2].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[3].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[3].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[0].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[0].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[1].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[1].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[2].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[2].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[3].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[3].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w5 = 
vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]), - vreinterpretq_u16_u8(w9.val[1])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]), - vreinterpretq_u16_u8(w11.val[1])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store second 4-line result - -#if AOM_ARCH_AARCH64 - d[4].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[4].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[5].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[5].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[6].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[6].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[7].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[7].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[4].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[4].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[5].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[5].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[6].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[6].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[7].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[7].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif - - // upper half - w0 = vzipq_u8(x[0].val[1], x[1].val[1]); - w1 = vzipq_u8(x[2].val[1], x[3].val[1]); - w2 = vzipq_u8(x[4].val[1], x[5].val[1]); - w3 = vzipq_u8(x[6].val[1], x[7].val[1]); - - w8 = vzipq_u8(x[8].val[1], x[9].val[1]); - w9 = vzipq_u8(x[10].val[1], x[11].val[1]); - w10 = vzipq_u8(x[12].val[1], x[13].val[1]); - w11 = vzipq_u8(x[14].val[1], x[15].val[1]); - - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), - vreinterpretq_u16_u8(w1.val[0])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), - vreinterpretq_u16_u8(w3.val[0])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[0]), - vreinterpretq_u16_u8(w9.val[0])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[0]), - vreinterpretq_u16_u8(w11.val[0])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store first 4-line result - -#if AOM_ARCH_AARCH64 - d[8].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - 
vreinterpretq_u64_u32(w14.val[0])); - d[8].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[9].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[9].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[10].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[10].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[11].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[11].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[8].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[8].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), vget_high_u32(w14.val[0]))); - d[9].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[9].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[10].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[10].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[11].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[11].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif +static AOM_FORCE_INLINE void z3_transpose_arrays_u8_8x4(const uint8x8_t *x, + uint8x8x2_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); - w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), - vreinterpretq_u16_u8(w1.val[1])); - w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), - vreinterpretq_u16_u8(w3.val[1])); - w12 = vzipq_u16(vreinterpretq_u16_u8(w8.val[1]), - vreinterpretq_u16_u8(w9.val[1])); - w13 = vzipq_u16(vreinterpretq_u16_u8(w10.val[1]), - vreinterpretq_u16_u8(w11.val[1])); - - w6 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), - vreinterpretq_u32_u16(w5.val[0])); - w7 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), - vreinterpretq_u32_u16(w5.val[1])); - w14 = vzipq_u32(vreinterpretq_u32_u16(w12.val[0]), - vreinterpretq_u32_u16(w13.val[0])); - w15 = vzipq_u32(vreinterpretq_u32_u16(w12.val[1]), - vreinterpretq_u32_u16(w13.val[1])); - - // Store second 4-line result - -#if AOM_ARCH_AARCH64 - d[12].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[12].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[0]), - vreinterpretq_u64_u32(w14.val[0])); - d[13].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[13].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w6.val[1]), - vreinterpretq_u64_u32(w14.val[1])); - d[14].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[14].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[0]), - vreinterpretq_u64_u32(w15.val[0])); - d[15].val[0] = vzip1q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); - d[15].val[1] = vzip2q_u64(vreinterpretq_u64_u32(w7.val[1]), - vreinterpretq_u64_u32(w15.val[1])); -#else - d[12].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[0]), vget_low_u32(w14.val[0]))); - d[12].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[0]), 
vget_high_u32(w14.val[0]))); - d[13].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w6.val[1]), vget_low_u32(w14.val[1]))); - d[13].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w6.val[1]), vget_high_u32(w14.val[1]))); - d[14].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[0]), vget_low_u32(w15.val[0]))); - d[14].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[0]), vget_high_u32(w15.val[0]))); - d[15].val[0] = vreinterpretq_u64_u32( - vcombine_u32(vget_low_u32(w7.val[1]), vget_low_u32(w15.val[1]))); - d[15].val[1] = vreinterpretq_u64_u32( - vcombine_u32(vget_high_u32(w7.val[1]), vget_high_u32(w15.val[1]))); -#endif + d[0] = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0]))); + d[1] = aom_reinterpret_u8_u16_x2( + vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1]))); } -static void transpose_TX_16X16(const uint8_t *src, ptrdiff_t pitchSrc, - uint8_t *dst, ptrdiff_t pitchDst) { +static void z3_transpose_arrays_u8_16x16(const uint8_t *src, ptrdiff_t pitchSrc, + uint8_t *dst, ptrdiff_t pitchDst) { + // The same as the normal transposes in transpose_neon.h, but with a stride + // between consecutive vectors of elements. uint8x16_t r[16]; - uint64x2_t d[16]; + uint8x16_t d[16]; for (int i = 0; i < 16; i++) { r[i] = vld1q_u8(src + i * pitchSrc); } - transpose16x16_neon(r, d); + transpose_arrays_u8_16x16(r, d); for (int i = 0; i < 16; i++) { - vst1q_u8(dst + i * pitchDst, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * pitchDst, d[i]); } } -static void transpose(const uint8_t *src, ptrdiff_t pitchSrc, uint8_t *dst, - ptrdiff_t pitchDst, int width, int height) { +static void z3_transpose_arrays_u8_16nx16n(const uint8_t *src, + ptrdiff_t pitchSrc, uint8_t *dst, + ptrdiff_t pitchDst, int width, + int height) { for (int j = 0; j < height; j += 16) { for (int i = 0; i < width; i += 16) { - transpose_TX_16X16(src + i * pitchSrc + j, pitchSrc, - dst + j * pitchDst + i, pitchDst); + z3_transpose_arrays_u8_16x16(src + i * pitchSrc + j, pitchSrc, + dst + j * pitchDst + i, pitchDst); } } } @@ -2765,89 +2061,60 @@ const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[4]; - uint16x4x2_t dest; + uint8x8x2_t dest; dr_prediction_z1_HxW_internal_neon_64(4, 4, dstvec, left, upsample_left, dy); - transpose4x8_8x4_low_neon(dstvec, &dest); - vst1_lane_u32((uint32_t *)(dst + stride * 0), - vreinterpret_u32_u16(dest.val[0]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 1), - vreinterpret_u32_u16(dest.val[0]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 2), - vreinterpret_u32_u16(dest.val[1]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 3), - vreinterpret_u32_u16(dest.val[1]), 1); + z3_transpose_arrays_u8_4x4(dstvec, &dest); + store_u8x4_strided_x2(dst + stride * 0, stride, dest.val[0]); + store_u8x4_strided_x2(dst + stride * 2, stride, dest.val[1]); } static void dr_prediction_z3_8x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[8]; - uint32x2x2_t d[4]; + uint8x8_t d[8]; dr_prediction_z1_HxW_internal_neon_64(8, 8, dstvec, left, upsample_left, dy); - transpose8x8_neon(dstvec, d); - vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]); - vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]); - vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]); - vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]); - vst1_u32((uint32_t *)(dst + 4 * stride), d[2].val[0]); - vst1_u32((uint32_t *)(dst + 5 * stride), 
d[2].val[1]); - vst1_u32((uint32_t *)(dst + 6 * stride), d[3].val[0]); - vst1_u32((uint32_t *)(dst + 7 * stride), d[3].val[1]); + transpose_arrays_u8_8x8(dstvec, d); + store_u8_8x8(dst, stride, d[0], d[1], d[2], d[3], d[4], d[5], d[6], d[7]); } static void dr_prediction_z3_4x8_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[4]; - uint16x4x2_t d[2]; + uint8x8x2_t d[2]; dr_prediction_z1_HxW_internal_neon_64(8, 4, dstvec, left, upsample_left, dy); - transpose4x8_8x4_neon(dstvec, d); - vst1_lane_u32((uint32_t *)(dst + stride * 0), - vreinterpret_u32_u16(d[0].val[0]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 1), - vreinterpret_u32_u16(d[0].val[0]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 2), - vreinterpret_u32_u16(d[0].val[1]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 3), - vreinterpret_u32_u16(d[0].val[1]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 4), - vreinterpret_u32_u16(d[1].val[0]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 5), - vreinterpret_u32_u16(d[1].val[0]), 1); - vst1_lane_u32((uint32_t *)(dst + stride * 6), - vreinterpret_u32_u16(d[1].val[1]), 0); - vst1_lane_u32((uint32_t *)(dst + stride * 7), - vreinterpret_u32_u16(d[1].val[1]), 1); + z3_transpose_arrays_u8_8x4(dstvec, d); + store_u8x4_strided_x2(dst + stride * 0, stride, d[0].val[0]); + store_u8x4_strided_x2(dst + stride * 2, stride, d[0].val[1]); + store_u8x4_strided_x2(dst + stride * 4, stride, d[1].val[0]); + store_u8x4_strided_x2(dst + stride * 6, stride, d[1].val[1]); } static void dr_prediction_z3_8x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[8]; - uint32x2x2_t d[2]; + uint8x8_t d[8]; dr_prediction_z1_HxW_internal_neon_64(4, 8, dstvec, left, upsample_left, dy); - transpose8x8_low_neon(dstvec, d); - vst1_u32((uint32_t *)(dst + 0 * stride), d[0].val[0]); - vst1_u32((uint32_t *)(dst + 1 * stride), d[0].val[1]); - vst1_u32((uint32_t *)(dst + 2 * stride), d[1].val[0]); - vst1_u32((uint32_t *)(dst + 3 * stride), d[1].val[1]); + transpose_arrays_u8_8x8(dstvec, d); + store_u8_8x4(dst, stride, d[0], d[1], d[2], d[3]); } static void dr_prediction_z3_8x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[8]; - uint64x2_t d[8]; + uint8x8_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 8, dstvec, left, upsample_left, dy); - transpose8x16_16x8_neon(dstvec, d); - for (int i = 0; i < 8; i++) { - vst1_u8(dst + i * stride, vreinterpret_u8_u64(vget_low_u64(d[i]))); - vst1_u8(dst + (i + 8) * stride, vreinterpret_u8_u64(vget_high_u64(d[i]))); + transpose_arrays_u8_16x8(dstvec, d); + for (int i = 0; i < 16; i++) { + vst1_u8(dst + i * stride, d[i]); } } @@ -2855,12 +2122,12 @@ const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[16]; - uint64x2_t d[8]; + uint8x16_t d[8]; dr_prediction_z1_HxW_internal_neon_64(8, 16, dstvec, left, upsample_left, dy); - transpose16x8_8x16_neon(dstvec, d); + transpose_arrays_u8_8x16(dstvec, d); for (int i = 0; i < 8; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * stride, d[i]); } } @@ -2868,78 +2135,45 @@ const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[4]; - uint16x8x2_t d[2]; + uint8x16x2_t d[2]; dr_prediction_z1_HxW_internal_neon(16, 4, dstvec, left, upsample_left, dy); - transpose4x16_neon(dstvec, d); - vst1q_lane_u32((uint32_t *)(dst + stride * 0), - vreinterpretq_u32_u16(d[0].val[0]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 
1), - vreinterpretq_u32_u16(d[0].val[0]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 2), - vreinterpretq_u32_u16(d[0].val[0]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 3), - vreinterpretq_u32_u16(d[0].val[0]), 3); - - vst1q_lane_u32((uint32_t *)(dst + stride * 4), - vreinterpretq_u32_u16(d[0].val[1]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 5), - vreinterpretq_u32_u16(d[0].val[1]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 6), - vreinterpretq_u32_u16(d[0].val[1]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 7), - vreinterpretq_u32_u16(d[0].val[1]), 3); - - vst1q_lane_u32((uint32_t *)(dst + stride * 8), - vreinterpretq_u32_u16(d[1].val[0]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 9), - vreinterpretq_u32_u16(d[1].val[0]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 10), - vreinterpretq_u32_u16(d[1].val[0]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 11), - vreinterpretq_u32_u16(d[1].val[0]), 3); - - vst1q_lane_u32((uint32_t *)(dst + stride * 12), - vreinterpretq_u32_u16(d[1].val[1]), 0); - vst1q_lane_u32((uint32_t *)(dst + stride * 13), - vreinterpretq_u32_u16(d[1].val[1]), 1); - vst1q_lane_u32((uint32_t *)(dst + stride * 14), - vreinterpretq_u32_u16(d[1].val[1]), 2); - vst1q_lane_u32((uint32_t *)(dst + stride * 15), - vreinterpretq_u32_u16(d[1].val[1]), 3); + z3_transpose_arrays_u8_16x4(dstvec, d); + store_u8x4_strided_x4(dst + stride * 0, stride, d[0].val[0]); + store_u8x4_strided_x4(dst + stride * 4, stride, d[0].val[1]); + store_u8x4_strided_x4(dst + stride * 8, stride, d[1].val[0]); + store_u8x4_strided_x4(dst + stride * 12, stride, d[1].val[1]); } static void dr_prediction_z3_16x4_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[16]; - uint64x2_t d[8]; + uint8x16_t d[8]; dr_prediction_z1_HxW_internal_neon_64(4, 16, dstvec, left, upsample_left, dy); - transpose16x8_8x16_neon(dstvec, d); + transpose_arrays_u8_8x16(dstvec, d); for (int i = 0; i < 4; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * stride, d[i]); } } static void dr_prediction_z3_8x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8x16x2_t dstvec[16]; - uint64x2x2_t d[16]; + uint8x16_t d[32]; uint8x16_t v_zero = vdupq_n_u8(0); - dr_prediction_z1_32xN_internal_neon(8, dstvec, left, upsample_left, dy); + dr_prediction_z1_32xN_internal_neon(8, dstvec, left, dy); for (int i = 8; i < 16; i++) { dstvec[i].val[0] = v_zero; dstvec[i].val[1] = v_zero; } - transpose16x32_neon(dstvec, d); - for (int i = 0; i < 16; i++) { - vst1_u8(dst + 2 * i * stride, - vreinterpret_u8_u64(vget_low_u64(d[i].val[0]))); - vst1_u8(dst + (2 * i + 1) * stride, - vreinterpret_u8_u64(vget_low_u64(d[i].val[1]))); + transpose_arrays_u8_32x16(dstvec, d); + for (int i = 0; i < 32; i++) { + vst1_u8(dst + i * stride, vget_low_u8(d[i])); } } @@ -2947,14 +2181,14 @@ const uint8_t *left, int upsample_left, int dy) { uint8x8_t dstvec[32]; - uint64x2_t d[16]; + uint8x16_t d[16]; dr_prediction_z1_HxW_internal_neon_64(8, 32, dstvec, left, upsample_left, dy); - transpose16x8_8x16_neon(dstvec, d); - transpose16x8_8x16_neon(dstvec + 16, d + 8); + transpose_arrays_u8_8x16(dstvec, d); + transpose_arrays_u8_8x16(dstvec + 16, d + 8); for (int i = 0; i < 8; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); - vst1q_u8(dst + i * stride + 16, vreinterpretq_u8_u64(d[i + 8])); + vst1q_u8(dst + i * stride, d[i]); + vst1q_u8(dst + i * stride + 16, d[i + 8]); } 
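[Editorial aside, not part of the patch: every z3 kernel in this hunk follows the same pattern -- predict along `left` with a zone-1 helper into a temporary block, then transpose into `dst`. A minimal scalar sketch of that structure; predict_z1_from_left() is a hypothetical stand-in for the dr_prediction_z1_* helpers and is only declared here:]

#include <stddef.h>
#include <stdint.h>

// Hypothetical scalar stand-in for the zone-1 helpers (declaration only).
void predict_z1_from_left(uint8_t *out, int out_stride, int width, int height,
                          const uint8_t *left, int dy);

// Zone-3 output is stored as the transpose of a zone-1 prediction computed
// from the left edge: a bh-wide, bw-tall block is predicted, then flipped.
static void predict_z3_model(uint8_t *dst, ptrdiff_t stride, int bw, int bh,
                             const uint8_t *left, int dy) {
  uint8_t tmp[64 * 64];
  predict_z1_from_left(tmp, /*out_stride=*/bh, /*width=*/bh, /*height=*/bw,
                       left, dy);
  for (int r = 0; r < bh; ++r) {
    for (int c = 0; c < bw; ++c) dst[r * stride + c] = tmp[c * bh + r];
  }
}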
} @@ -2962,53 +2196,53 @@ const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[16]; - uint64x2_t d[16]; + uint8x16_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 16, dstvec, left, upsample_left, dy); - transpose16x16_neon(dstvec, d); + transpose_arrays_u8_16x16(dstvec, d); for (int i = 0; i < 16; i++) { - vst1q_u8(dst + i * stride, vreinterpretq_u8_u64(d[i])); + vst1q_u8(dst + i * stride, d[i]); } } static void dr_prediction_z3_32x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8x16x2_t dstvec[32]; - uint64x2x2_t d[32]; + uint8x16_t d[64]; - dr_prediction_z1_32xN_internal_neon(32, dstvec, left, upsample_left, dy); - transpose16x32_neon(dstvec, d); - transpose16x32_neon(dstvec + 16, d + 16); - for (int i = 0; i < 16; i++) { - vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0])); - vst1q_u8(dst + 2 * i * stride + 16, vreinterpretq_u8_u64(d[i + 16].val[0])); - vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1])); - vst1q_u8(dst + (2 * i + 1) * stride + 16, - vreinterpretq_u8_u64(d[i + 16].val[1])); + dr_prediction_z1_32xN_internal_neon(32, dstvec, left, dy); + transpose_arrays_u8_32x16(dstvec, d); + transpose_arrays_u8_32x16(dstvec + 16, d + 32); + for (int i = 0; i < 32; i++) { + vst1q_u8(dst + i * stride, d[i]); + vst1q_u8(dst + i * stride + 16, d[i + 32]); } } static void dr_prediction_z3_64x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; DECLARE_ALIGNED(16, uint8_t, dstT[64 * 64]); - dr_prediction_z1_64xN_neon(64, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 64, 64); + dr_prediction_z1_64xN_neon(64, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 64, 64); } static void dr_prediction_z3_16x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8x16x2_t dstvec[16]; - uint64x2x2_t d[16]; + uint8x16_t d[32]; - dr_prediction_z1_32xN_internal_neon(16, dstvec, left, upsample_left, dy); - transpose16x32_neon(dstvec, d); + dr_prediction_z1_32xN_internal_neon(16, dstvec, left, dy); + transpose_arrays_u8_32x16(dstvec, d); for (int i = 0; i < 16; i++) { - vst1q_u8(dst + 2 * i * stride, vreinterpretq_u8_u64(d[i].val[0])); - vst1q_u8(dst + (2 * i + 1) * stride, vreinterpretq_u8_u64(d[i].val[1])); + vst1q_u8(dst + 2 * i * stride, d[2 * i + 0]); + vst1q_u8(dst + (2 * i + 1) * stride, d[2 * i + 1]); } } @@ -3016,13 +2250,13 @@ const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[32]; - uint64x2_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 32, dstvec, left, upsample_left, dy); for (int i = 0; i < 32; i += 16) { - transpose16x16_neon((dstvec + i), d); + uint8x16_t d[16]; + transpose_arrays_u8_16x16(dstvec + i, d); for (int j = 0; j < 16; j++) { - vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j])); + vst1q_u8(dst + j * stride + i, d[j]); } } } @@ -3030,45 +2264,68 @@ static void dr_prediction_z3_32x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8_t dstT[64 * 32]; - dr_prediction_z1_64xN_neon(32, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 32, 64); + dr_prediction_z1_64xN_neon(32, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 32, 64); } static void dr_prediction_z3_64x32_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + 
(void)upsample_left; uint8_t dstT[32 * 64]; - dr_prediction_z1_32xN_neon(64, dstT, 32, left, upsample_left, dy); - transpose(dstT, 32, dst, stride, 64, 32); + dr_prediction_z1_32xN_neon(64, dstT, 32, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 32, dst, stride, 64, 32); } static void dr_prediction_z3_16x64_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { + (void)upsample_left; uint8_t dstT[64 * 16]; - dr_prediction_z1_64xN_neon(16, dstT, 64, left, upsample_left, dy); - transpose(dstT, 64, dst, stride, 16, 64); + dr_prediction_z1_64xN_neon(16, dstT, 64, left, dy); + z3_transpose_arrays_u8_16nx16n(dstT, 64, dst, stride, 16, 64); } static void dr_prediction_z3_64x16_neon(uint8_t *dst, ptrdiff_t stride, const uint8_t *left, int upsample_left, int dy) { uint8x16_t dstvec[64]; - uint64x2_t d[16]; dr_prediction_z1_HxW_internal_neon(16, 64, dstvec, left, upsample_left, dy); for (int i = 0; i < 64; i += 16) { - transpose16x16_neon((dstvec + i), d); - for (int j = 0; j < 16; j++) { - vst1q_u8(dst + j * stride + i, vreinterpretq_u8_u64(d[j])); + uint8x16_t d[16]; + transpose_arrays_u8_16x16(dstvec + i, d); + for (int j = 0; j < 16; ++j) { + vst1q_u8(dst + j * stride + i, d[j]); } } } +typedef void (*dr_prediction_z3_fn)(uint8_t *dst, ptrdiff_t stride, + const uint8_t *left, int upsample_left, + int dy); + +static dr_prediction_z3_fn dr_prediction_z3_arr[7][7] = { + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, NULL, NULL, NULL, NULL, NULL }, + { NULL, NULL, dr_prediction_z3_4x4_neon, dr_prediction_z3_4x8_neon, + dr_prediction_z3_4x16_neon, NULL, NULL }, + { NULL, NULL, dr_prediction_z3_8x4_neon, dr_prediction_z3_8x8_neon, + dr_prediction_z3_8x16_neon, dr_prediction_z3_8x32_neon, NULL }, + { NULL, NULL, dr_prediction_z3_16x4_neon, dr_prediction_z3_16x8_neon, + dr_prediction_z3_16x16_neon, dr_prediction_z3_16x32_neon, + dr_prediction_z3_16x64_neon }, + { NULL, NULL, NULL, dr_prediction_z3_32x8_neon, dr_prediction_z3_32x16_neon, + dr_prediction_z3_32x32_neon, dr_prediction_z3_32x64_neon }, + { NULL, NULL, NULL, NULL, dr_prediction_z3_64x16_neon, + dr_prediction_z3_64x32_neon, dr_prediction_z3_64x64_neon }, +}; + void av1_dr_prediction_z3_neon(uint8_t *dst, ptrdiff_t stride, int bw, int bh, const uint8_t *above, const uint8_t *left, int upsample_left, int dx, int dy) { @@ -3077,85 +2334,9 @@ assert(dx == 1); assert(dy > 0); - if (bw == bh) { - switch (bw) { - case 4: - dr_prediction_z3_4x4_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_8x8_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_16x16_neon(dst, stride, left, upsample_left, dy); - break; - case 32: - dr_prediction_z3_32x32_neon(dst, stride, left, upsample_left, dy); - break; - case 64: - dr_prediction_z3_64x64_neon(dst, stride, left, upsample_left, dy); - break; - } - } else { - if (bw < bh) { - if (bw + bw == bh) { - switch (bw) { - case 4: - dr_prediction_z3_4x8_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_8x16_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_16x32_neon(dst, stride, left, upsample_left, dy); - break; - case 32: - dr_prediction_z3_32x64_neon(dst, stride, left, upsample_left, dy); - break; - } - } else { - switch (bw) { - case 4: - dr_prediction_z3_4x16_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_8x32_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - 
dr_prediction_z3_16x64_neon(dst, stride, left, upsample_left, dy); - break; - } - } - } else { - if (bh + bh == bw) { - switch (bh) { - case 4: - dr_prediction_z3_8x4_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_16x8_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_32x16_neon(dst, stride, left, upsample_left, dy); - break; - case 32: - dr_prediction_z3_64x32_neon(dst, stride, left, upsample_left, dy); - break; - } - } else { - switch (bh) { - case 4: - dr_prediction_z3_16x4_neon(dst, stride, left, upsample_left, dy); - break; - case 8: - dr_prediction_z3_32x8_neon(dst, stride, left, upsample_left, dy); - break; - case 16: - dr_prediction_z3_64x16_neon(dst, stride, left, upsample_left, dy); - break; - } - } - } - } + dr_prediction_z3_fn f = dr_prediction_z3_arr[get_msb(bw)][get_msb(bh)]; + assert(f != NULL); + f(dst, stride, left, upsample_left, dy); } // ----------------------------------------------------------------------------- @@ -3759,7 +2940,7 @@ result = vbsl_u8(left_or_top_mask, result, top_left); if (width == 4) { - store_unaligned_u8_4x1(dest, result, 0); + store_u8_4x1(dest, result); } else { // width == 8 vst1_u8(dest, result); } diff -Nru aom-3.8.2/aom_dsp/arm/loopfilter_neon.c aom-3.9.0/aom_dsp/arm/loopfilter_neon.c --- aom-3.8.2/aom_dsp/arm/loopfilter_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/loopfilter_neon.c 2024-05-07 19:57:02.480000000 +0000 @@ -692,7 +692,7 @@ row2 = vcombine_u8(p5p1, q2q6); row3 = vcombine_u8(p4p0, q3qy); - store_u8_8x16(src - 8, stride, row0, row1, row2, row3); + store_u8_16x4(src - 8, stride, row0, row1, row2, row3); } void aom_lpf_vertical_14_dual_neon( @@ -862,10 +862,8 @@ transpose_elems_inplace_u8_4x4(&p1p0, &q0q1); - store_unaligned_u8_4x1(src - 2, p1p0, 0); - store_unaligned_u8_4x1((src - 2) + 1 * stride, q0q1, 0); - store_unaligned_u8_4x1((src - 2) + 2 * stride, p1p0, 1); - store_unaligned_u8_4x1((src - 2) + 3 * stride, q0q1, 1); + store_u8x4_strided_x2(src - 2, 2 * stride, p1p0); + store_u8x4_strided_x2(src + stride - 2, 2 * stride, q0q1); } void aom_lpf_vertical_4_dual_neon(uint8_t *s, int pitch, const uint8_t *blimit0, @@ -897,18 +895,12 @@ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh); - store_u8_4x1(src - 6 * stride, p5q5, 0); - store_u8_4x1(src - 5 * stride, p4q4, 0); - store_u8_4x1(src - 4 * stride, p3q3, 0); - store_u8_4x1(src - 3 * stride, p2q2, 0); - store_u8_4x1(src - 2 * stride, p1q1, 0); - store_u8_4x1(src - 1 * stride, p0q0, 0); - store_u8_4x1(src + 0 * stride, p0q0, 1); - store_u8_4x1(src + 1 * stride, p1q1, 1); - store_u8_4x1(src + 2 * stride, p2q2, 1); - store_u8_4x1(src + 3 * stride, p3q3, 1); - store_u8_4x1(src + 4 * stride, p4q4, 1); - store_u8_4x1(src + 5 * stride, p5q5, 1); + store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); + store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); + store_u8x4_strided_x2(src - 3 * stride, 5 * stride, p2q2); + store_u8x4_strided_x2(src - 4 * stride, 7 * stride, p3q3); + store_u8x4_strided_x2(src - 5 * stride, 9 * stride, p4q4); + store_u8x4_strided_x2(src - 6 * stride, 11 * stride, p5q5); } void aom_lpf_horizontal_14_dual_neon( @@ -1029,10 +1021,8 @@ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh); - store_u8_4x1(src - 2 * stride, p1q1, 0); - store_u8_4x1(src - 1 * stride, p0q0, 0); - store_u8_4x1(src + 0 * stride, p0q0, 1); - store_u8_4x1(src + 1 * stride, p1q1, 1); + store_u8x4_strided_x2(src - 1 * stride, 1 * stride, p0q0); + 
store_u8x4_strided_x2(src - 2 * stride, 3 * stride, p1q1); } void aom_lpf_horizontal_4_dual_neon( diff -Nru aom-3.8.2/aom_dsp/arm/mem_neon.h aom-3.9.0/aom_dsp/arm/mem_neon.h --- aom-3.8.2/aom_dsp/arm/mem_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/mem_neon.h 2024-05-07 19:57:02.482000000 +0000 @@ -459,17 +459,15 @@ *s3 = vld1_s16(s); } -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define store_u8_2x1(s, s0, lane) \ - do { \ - vst1_lane_u16((uint16_t *)(s), vreinterpret_u16_u8(s0), lane); \ - } while (0) - -#define store_u8_4x1(s, s0, lane) \ - do { \ - vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u8(s0), lane); \ - } while (0) +static INLINE void load_s16_4x3(const int16_t *s, ptrdiff_t p, + int16x4_t *const s0, int16x4_t *const s1, + int16x4_t *const s2) { + *s0 = vld1_s16(s); + s += p; + *s1 = vld1_s16(s); + s += p; + *s2 = vld1_s16(s); +} static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0, const uint8x8_t s1, const uint8x8_t s2, @@ -505,7 +503,7 @@ vst1_u8(s, s3); } -static INLINE void store_u8_8x16(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, +static INLINE void store_u8_16x4(uint8_t *s, ptrdiff_t p, const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2, const uint8x16_t s3) { vst1q_u8(s, s0); @@ -604,21 +602,6 @@ vst1_s16(s, s3); } -/* These intrinsics require immediate values, so we must use #defines - to enforce that. */ -#define store_s16_2x1(s, s0, lane) \ - do { \ - vst1_lane_s32((int32_t *)(s), vreinterpret_s32_s16(s0), lane); \ - } while (0) -#define store_u16_2x1(s, s0, lane) \ - do { \ - vst1_lane_u32((uint32_t *)(s), vreinterpret_u32_u16(s0), lane); \ - } while (0) -#define store_u16q_2x1(s, s0, lane) \ - do { \ - vst1q_lane_u32((uint32_t *)(s), vreinterpretq_u32_u16(s0), lane); \ - } while (0) - static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride, const int16x8_t s0, const int16x8_t s1, const int16x8_t s2, const int16x8_t s3) { @@ -886,6 +869,16 @@ *s3 = vld1q_s16(s); } +static INLINE void load_s16_8x3(const int16_t *s, ptrdiff_t p, + int16x8_t *const s0, int16x8_t *const s1, + int16x8_t *const s2) { + *s0 = vld1q_s16(s); + s += p; + *s1 = vld1q_s16(s); + s += p; + *s2 = vld1q_s16(s); +} + // Load 2 sets of 4 bytes when alignment is not guaranteed. static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { uint32_t a; @@ -991,36 +984,6 @@ load_unaligned_u8_4x4(buf, stride, tu2, tu3); } -/* These intrinsics require immediate values, so we must use #defines - to enforce that. 
*/ -#define store_unaligned_u8_4x1(dst, src, lane) \ - do { \ - uint32_t a; \ - a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ - memcpy(dst, &a, 4); \ - } while (0) - -#define store_unaligned_u8_2x1(dst, src, lane) \ - do { \ - uint16_t a; \ - a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ - memcpy(dst, &a, 2); \ - } while (0) - -#define store_unaligned_u16_2x1(dst, src, lane) \ - do { \ - uint32_t a; \ - a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ - memcpy(dst, &a, 4); \ - } while (0) - -#define store_unaligned_u16_4x1(dst, src, lane) \ - do { \ - uint64_t a; \ - a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ - memcpy(dst, &a, 8); \ - } while (0) - static INLINE void load_u8_16x8(const uint8_t *s, ptrdiff_t p, uint8x16_t *const s0, uint8x16_t *const s1, uint8x16_t *const s2, uint8x16_t *const s3, @@ -1202,32 +1165,111 @@ vst1q_s32(buf, v0); } -static INLINE void store_unaligned_u8_2x2(uint8_t *dst, uint32_t dst_stride, - uint8x8_t src) { - store_unaligned_u8_2x1(dst, src, 0); - dst += dst_stride; - store_unaligned_u8_2x1(dst, src, 1); +static INLINE uint8x8_t load_u8_gather_s16_x8(const uint8_t *src, + int16x8_t indices) { + // Recent Clang and GCC versions correctly identify that this zero-broadcast + // is redundant. Alternatively we could load and broadcast the zeroth element + // and then replace the other lanes, however this is slower than loading a + // single element without broadcast on some micro-architectures. + uint8x8_t ret = vdup_n_u8(0); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 0), ret, 0); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 1), ret, 1); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 2), ret, 2); + ret = vld1_lane_u8(src + vget_lane_s16(vget_low_s16(indices), 3), ret, 3); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 0), ret, 4); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 1), ret, 5); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 2), ret, 6); + ret = vld1_lane_u8(src + vget_lane_s16(vget_high_s16(indices), 3), ret, 7); + return ret; +} + +// The `lane` parameter here must be an immediate. +#define store_u8_2x1_lane(dst, src, lane) \ + do { \ + uint16_t a = vget_lane_u16(vreinterpret_u16_u8(src), lane); \ + memcpy(dst, &a, 2); \ + } while (0) + +#define store_u8_4x1_lane(dst, src, lane) \ + do { \ + uint32_t a = vget_lane_u32(vreinterpret_u32_u8(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +#define store_u16_2x1_lane(dst, src, lane) \ + do { \ + uint32_t a = vget_lane_u32(vreinterpret_u32_u16(src), lane); \ + memcpy(dst, &a, 4); \ + } while (0) + +#define store_u16_4x1_lane(dst, src, lane) \ + do { \ + uint64_t a = vgetq_lane_u64(vreinterpretq_u64_u16(src), lane); \ + memcpy(dst, &a, 8); \ + } while (0) + +// Store the low 16-bits from a single vector. +static INLINE void store_u8_2x1(uint8_t *dst, const uint8x8_t src) { + store_u8_2x1_lane(dst, src, 0); } -static INLINE void store_unaligned_u8_4x2(uint8_t *dst, uint32_t dst_stride, - uint8x8_t src) { - store_unaligned_u8_4x1(dst, src, 0); +// Store the low 32-bits from a single vector. +static INLINE void store_u8_4x1(uint8_t *dst, const uint8x8_t src) { + store_u8_4x1_lane(dst, src, 0); +} + +// Store two blocks of 16-bits from a single vector. 
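// (Editorial clarification, not part of the patch: each *_strided_* helper
// below writes successive lanes of one source vector to destinations one
// stride apart -- the _x2 variants write lanes 0 and 1, while
// store_u8x4_strided_x4 writes all four 32-bit lanes of a uint8x16_t.)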
+static INLINE void store_u8x2_strided_x2(uint8_t *dst, uint32_t dst_stride, + uint8x8_t src) { + store_u8_2x1_lane(dst, src, 0); dst += dst_stride; - store_unaligned_u8_4x1(dst, src, 1); + store_u8_2x1_lane(dst, src, 1); } -static INLINE void store_unaligned_u16_2x2(uint16_t *dst, uint32_t dst_stride, - uint16x4_t src) { - store_unaligned_u16_2x1(dst, src, 0); +// Store two blocks of 32-bits from a single vector. +static INLINE void store_u8x4_strided_x2(uint8_t *dst, ptrdiff_t stride, + uint8x8_t src) { + store_u8_4x1_lane(dst, src, 0); + dst += stride; + store_u8_4x1_lane(dst, src, 1); +} + +// Store four blocks of 32-bits from a single vector. +static INLINE void store_u8x4_strided_x4(uint8_t *dst, ptrdiff_t stride, + uint8x16_t src) { + store_u8_4x1_lane(dst, vget_low_u8(src), 0); + dst += stride; + store_u8_4x1_lane(dst, vget_low_u8(src), 1); + dst += stride; + store_u8_4x1_lane(dst, vget_high_u8(src), 0); + dst += stride; + store_u8_4x1_lane(dst, vget_high_u8(src), 1); +} + +// Store the low 32-bits from a single vector. +static INLINE void store_u16_2x1(uint16_t *dst, const uint16x4_t src) { + store_u16_2x1_lane(dst, src, 0); +} + +// Store two blocks of 32-bits from a single vector. +static INLINE void store_u16x2_strided_x2(uint16_t *dst, uint32_t dst_stride, + uint16x4_t src) { + store_u16_2x1_lane(dst, src, 0); dst += dst_stride; - store_unaligned_u16_2x1(dst, src, 1); + store_u16_2x1_lane(dst, src, 1); } -static INLINE void store_unaligned_u16_4x2(uint16_t *dst, uint32_t dst_stride, - uint16x8_t src) { - store_unaligned_u16_4x1(dst, src, 0); +// Store two blocks of 64-bits from a single vector. +static INLINE void store_u16x4_strided_x2(uint16_t *dst, uint32_t dst_stride, + uint16x8_t src) { + store_u16_4x1_lane(dst, src, 0); dst += dst_stride; - store_unaligned_u16_4x1(dst, src, 1); + store_u16_4x1_lane(dst, src, 1); } +#undef store_u8_2x1_lane +#undef store_u8_4x1_lane +#undef store_u16_2x1_lane +#undef store_u16_4x1_lane + #endif // AOM_AOM_DSP_ARM_MEM_NEON_H_ diff -Nru aom-3.8.2/aom_dsp/arm/reinterpret_neon.h aom-3.9.0/aom_dsp/arm/reinterpret_neon.h --- aom-3.8.2/aom_dsp/arm/reinterpret_neon.h 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/reinterpret_neon.h 2024-05-07 19:57:02.484000000 +0000 @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ +#define AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ + +#include + +#include "aom/aom_integer.h" // For AOM_FORCE_INLINE. 
+#include "config/aom_config.h" + +#define REINTERPRET_NEON(u, to_sz, to_count, from_sz, from_count, n, q) \ + static AOM_FORCE_INLINE u##int##to_sz##x##to_count##x##n##_t \ + aom_reinterpret##q##_##u##to_sz##_##u##from_sz##_x##n( \ + const u##int##from_sz##x##from_count##x##n##_t src) { \ + u##int##to_sz##x##to_count##x##n##_t ret; \ + for (int i = 0; i < (n); ++i) { \ + ret.val[i] = vreinterpret##q##_##u##to_sz##_##u##from_sz(src.val[i]); \ + } \ + return ret; \ + } + +REINTERPRET_NEON(u, 8, 8, 16, 4, 2, ) // uint8x8x2_t from uint16x4x2_t +REINTERPRET_NEON(u, 8, 16, 16, 8, 2, q) // uint8x16x2_t from uint16x8x2_t + +#endif // AOM_AOM_DSP_ARM_REINTERPRET_NEON_H_ diff -Nru aom-3.8.2/aom_dsp/arm/sum_neon.h aom-3.9.0/aom_dsp/arm/sum_neon.h --- aom-3.8.2/aom_dsp/arm/sum_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/sum_neon.h 2024-05-07 19:57:02.490000000 +0000 @@ -17,6 +17,16 @@ #include "aom/aom_integer.h" #include "aom_ports/mem.h" +static INLINE int horizontal_add_u8x8(const uint8x8_t a) { +#if AOM_ARCH_AARCH64 + return vaddlv_u8(a); +#else + uint16x4_t b = vpaddl_u8(a); + uint32x2_t c = vpaddl_u16(b); + return vget_lane_u32(c, 0) + vget_lane_u32(c, 1); +#endif +} + static INLINE int horizontal_add_s16x8(const int16x8_t a) { #if AOM_ARCH_AARCH64 return vaddlvq_s16(a); @@ -186,6 +196,23 @@ #endif } +static INLINE int32x4_t horizontal_add_4d_s16x8(const int16x8_t sum[4]) { +#if AOM_ARCH_AARCH64 + const int16x8_t a0 = vpaddq_s16(sum[0], sum[1]); + const int16x8_t a1 = vpaddq_s16(sum[2], sum[3]); + const int16x8_t b0 = vpaddq_s16(a0, a1); + return vpaddlq_s16(b0); +#else + const int16x4_t a0 = vadd_s16(vget_low_s16(sum[0]), vget_high_s16(sum[0])); + const int16x4_t a1 = vadd_s16(vget_low_s16(sum[1]), vget_high_s16(sum[1])); + const int16x4_t a2 = vadd_s16(vget_low_s16(sum[2]), vget_high_s16(sum[2])); + const int16x4_t a3 = vadd_s16(vget_low_s16(sum[3]), vget_high_s16(sum[3])); + const int16x4_t b0 = vpadd_s16(a0, a1); + const int16x4_t b1 = vpadd_s16(a2, a3); + return vpaddlq_s16(vcombine_s16(b0, b1)); +#endif +} + static INLINE uint32_t horizontal_add_u32x2(const uint32x2_t a) { #if AOM_ARCH_AARCH64 return vaddv_u32(a); diff -Nru aom-3.8.2/aom_dsp/arm/sum_squares_sve.c aom-3.9.0/aom_dsp/arm/sum_squares_sve.c --- aom-3.8.2/aom_dsp/arm/sum_squares_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/sum_squares_sve.c 2024-05-07 19:57:02.491000000 +0000 @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "config/aom_dsp_rtcd.h" + +static INLINE uint64_t aom_sum_squares_2d_i16_4xh_sve(const int16_t *src, + int stride, int height) { + int64x2_t sum_squares = vdupq_n_s64(0); + + do { + int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); + + sum_squares = aom_sdotq_s16(sum_squares, s, s); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + return (uint64_t)vaddvq_s64(sum_squares); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_8xh_sve(const int16_t *src, + int stride, int height) { + int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + int16x8_t s0 = vld1q_s16(src + 0 * stride); + int16x8_t s1 = vld1q_s16(src + 1 * stride); + + sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); + sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); + return (uint64_t)vaddvq_s64(sum_squares[0]); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_large_sve(const int16_t *src, + int stride, int width, + int height) { + int64x2_t sum_squares[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const int16_t *src_ptr = src; + int w = width; + do { + int16x8_t s0 = vld1q_s16(src_ptr); + int16x8_t s1 = vld1q_s16(src_ptr + 8); + + sum_squares[0] = aom_sdotq_s16(sum_squares[0], s0, s0); + sum_squares[1] = aom_sdotq_s16(sum_squares[1], s1, s1); + + src_ptr += 16; + w -= 16; + } while (w != 0); + + src += stride; + } while (--height != 0); + + sum_squares[0] = vaddq_s64(sum_squares[0], sum_squares[1]); + return (uint64_t)vaddvq_s64(sum_squares[0]); +} + +static INLINE uint64_t aom_sum_squares_2d_i16_wxh_sve(const int16_t *src, + int stride, int width, + int height) { + svint64_t sum_squares = svdup_n_s64(0); + uint64_t step = svcnth(); + + do { + const int16_t *src_ptr = src; + int w = 0; + do { + svbool_t pred = svwhilelt_b16_u32(w, width); + svint16_t s0 = svld1_s16(pred, src_ptr); + + sum_squares = svdot_s64(sum_squares, s0, s0); + + src_ptr += step; + w += step; + } while (w < width); + + src += stride; + } while (--height != 0); + + return (uint64_t)svaddv_s64(svptrue_b64(), sum_squares); +} + +uint64_t aom_sum_squares_2d_i16_sve(const int16_t *src, int stride, int width, + int height) { + if (width == 4) { + return aom_sum_squares_2d_i16_4xh_sve(src, stride, height); + } + if (width == 8) { + return aom_sum_squares_2d_i16_8xh_sve(src, stride, height); + } + if (width % 16 == 0) { + return aom_sum_squares_2d_i16_large_sve(src, stride, width, height); + } + return aom_sum_squares_2d_i16_wxh_sve(src, stride, width, height); +} + +uint64_t aom_sum_squares_i16_sve(const int16_t *src, uint32_t n) { + // This function seems to be called only for values of N >= 64. See + // av1/encoder/compound_type.c. Additionally, because N = width x height for + // width and height between the standard block sizes, N will also be a + // multiple of 64. 
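[Editor's aside, not part of the patch: the quantity each kernel above accumulates through aom_sdotq_s16() is simply the sum of squared samples over the block, so the assumed reference behaviour of the 2-D variants can be written in plain C as below; only the vectorization strategy differs between the 4xh, 8xh, 16-wide and predicated wxh paths.]

#include <stdint.h>

// Assumed reference semantics for the aom_sum_squares_2d_i16_*_sve()
// kernels above: accumulate src[r * stride + c]^2 over a width x height
// block into a 64-bit total. The SVE code only changes how the
// multiply-accumulates are grouped, not this definition.
static uint64_t sum_squares_2d_i16_ref(const int16_t *src, int stride,
                                       int width, int height) {
  uint64_t ss = 0;
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      const int64_t v = src[r * stride + c];
      ss += (uint64_t)(v * v);
    }
  }
  return ss;
}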
+ if (LIKELY(n % 64 == 0)) { + int64x2_t sum[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + int16x8_t s0 = vld1q_s16(src); + int16x8_t s1 = vld1q_s16(src + 8); + int16x8_t s2 = vld1q_s16(src + 16); + int16x8_t s3 = vld1q_s16(src + 24); + + sum[0] = aom_sdotq_s16(sum[0], s0, s0); + sum[1] = aom_sdotq_s16(sum[1], s1, s1); + sum[2] = aom_sdotq_s16(sum[2], s2, s2); + sum[3] = aom_sdotq_s16(sum[3], s3, s3); + + src += 32; + n -= 32; + } while (n != 0); + + sum[0] = vaddq_s64(sum[0], sum[1]); + sum[2] = vaddq_s64(sum[2], sum[3]); + sum[0] = vaddq_s64(sum[0], sum[2]); + return vaddvq_s64(sum[0]); + } + return aom_sum_squares_i16_c(src, n); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_4xh_sve(const int16_t *src, + int stride, int height, + int *sum) { + int64x2_t sse = vdupq_n_s64(0); + int32x4_t sum_s32 = vdupq_n_s32(0); + + do { + int16x8_t s = vcombine_s16(vld1_s16(src), vld1_s16(src + stride)); + + sse = aom_sdotq_s16(sse, s, s); + + sum_s32 = vpadalq_s16(sum_s32, s); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + *sum += vaddvq_s32(sum_s32); + return vaddvq_s64(sse); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_8xh_sve(const int16_t *src, + int stride, int height, + int *sum) { + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int16x8_t s0 = vld1q_s16(src); + int16x8_t s1 = vld1q_s16(src + stride); + + sse[0] = aom_sdotq_s16(sse[0], s0, s0); + sse[1] = aom_sdotq_s16(sse[1], s1, s1); + + sum_acc[0] = vpadalq_s16(sum_acc[0], s0); + sum_acc[1] = vpadalq_s16(sum_acc[1], s1); + + src += 2 * stride; + height -= 2; + } while (height != 0); + + *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); + return vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +static INLINE uint64_t aom_sum_sse_2d_i16_16xh_sve(const int16_t *src, + int stride, int width, + int height, int *sum) { + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int32x4_t sum_acc[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + + do { + int w = 0; + do { + int16x8_t s0 = vld1q_s16(src + w); + int16x8_t s1 = vld1q_s16(src + w + 8); + + sse[0] = aom_sdotq_s16(sse[0], s0, s0); + sse[1] = aom_sdotq_s16(sse[1], s1, s1); + + sum_acc[0] = vpadalq_s16(sum_acc[0], s0); + sum_acc[1] = vpadalq_s16(sum_acc[1], s1); + + w += 16; + } while (w < width); + + src += stride; + } while (--height != 0); + + *sum += vaddvq_s32(vaddq_s32(sum_acc[0], sum_acc[1])); + return vaddvq_s64(vaddq_s64(sse[0], sse[1])); +} + +uint64_t aom_sum_sse_2d_i16_sve(const int16_t *src, int stride, int width, + int height, int *sum) { + uint64_t sse; + + if (width == 4) { + sse = aom_sum_sse_2d_i16_4xh_sve(src, stride, height, sum); + } else if (width == 8) { + sse = aom_sum_sse_2d_i16_8xh_sve(src, stride, height, sum); + } else if (width % 16 == 0) { + sse = aom_sum_sse_2d_i16_16xh_sve(src, stride, width, height, sum); + } else { + sse = aom_sum_sse_2d_i16_c(src, stride, width, height, sum); + } + + return sse; +} + +static INLINE uint64_t aom_var_2d_u16_4xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + uint16x8_t s0 = + vcombine_u16(vld1_u16(src_u16), vld1_u16(src_u16 + src_stride)); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + sse_u64 = aom_udotq_u16(sse_u64, s0, s0); + + src_u16 += 2 * src_stride; + h -= 2; + } while (h != 0); + + 
sum += vaddlvq_u32(sum_u32); + sse += vaddvq_u64(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_8xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32 = vdupq_n_u32(0); + uint64x2_t sse_u64 = vdupq_n_u64(0); + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + + sum_u32 = vpadalq_u16(sum_u32, s0); + + sse_u64 = aom_udotq_u16(sse_u64, s0, s0); + + src_ptr += 8; + w -= 8; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum += vaddlvq_u32(sum_u32); + sse += vaddvq_u64(sse_u64); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_16xh_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; + uint64x2_t sse_u64[2] = { vdupq_n_u64(0), vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 8); + + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s1); + + sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); + sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); + + src_ptr += 16; + w -= 16; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); + + sum += vaddlvq_u32(sum_u32[0]); + sse += vaddvq_u64(sse_u64[0]); + + return sse - sum * sum / (width * height); +} + +static INLINE uint64_t aom_var_2d_u16_large_sve(uint8_t *src, int src_stride, + int width, int height) { + uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src); + uint64_t sum = 0; + uint64_t sse = 0; + uint32x4_t sum_u32[4] = { vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), + vdupq_n_u32(0) }; + uint64x2_t sse_u64[4] = { vdupq_n_u64(0), vdupq_n_u64(0), vdupq_n_u64(0), + vdupq_n_u64(0) }; + + int h = height; + do { + int w = width; + uint16_t *src_ptr = src_u16; + do { + uint16x8_t s0 = vld1q_u16(src_ptr); + uint16x8_t s1 = vld1q_u16(src_ptr + 8); + uint16x8_t s2 = vld1q_u16(src_ptr + 16); + uint16x8_t s3 = vld1q_u16(src_ptr + 24); + + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s1); + sum_u32[2] = vpadalq_u16(sum_u32[2], s2); + sum_u32[3] = vpadalq_u16(sum_u32[3], s3); + + sse_u64[0] = aom_udotq_u16(sse_u64[0], s0, s0); + sse_u64[1] = aom_udotq_u16(sse_u64[1], s1, s1); + sse_u64[2] = aom_udotq_u16(sse_u64[2], s2, s2); + sse_u64[3] = aom_udotq_u16(sse_u64[3], s3, s3); + + src_ptr += 32; + w -= 32; + } while (w != 0); + + src_u16 += src_stride; + } while (--h != 0); + + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[1]); + sum_u32[2] = vaddq_u32(sum_u32[2], sum_u32[3]); + sum_u32[0] = vaddq_u32(sum_u32[0], sum_u32[2]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[1]); + sse_u64[2] = vaddq_u64(sse_u64[2], sse_u64[3]); + sse_u64[0] = vaddq_u64(sse_u64[0], sse_u64[2]); + + sum += vaddlvq_u32(sum_u32[0]); + sse += vaddvq_u64(sse_u64[0]); + + return sse - sum * sum / (width * height); +} + +uint64_t aom_var_2d_u16_sve(uint8_t *src, int src_stride, int width, + int height) { + if (width == 4) { + return aom_var_2d_u16_4xh_sve(src, src_stride, width, height); + } + if (width == 8) { + return 
aom_var_2d_u16_8xh_sve(src, src_stride, width, height); + } + if (width == 16) { + return aom_var_2d_u16_16xh_sve(src, src_stride, width, height); + } + if (width % 32 == 0) { + return aom_var_2d_u16_large_sve(src, src_stride, width, height); + } + return aom_var_2d_u16_neon(src, src_stride, width, height); +} diff -Nru aom-3.8.2/aom_dsp/arm/transpose_neon.h aom-3.9.0/aom_dsp/arm/transpose_neon.h --- aom-3.8.2/aom_dsp/arm/transpose_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/arm/transpose_neon.h 2024-05-07 19:57:02.491000000 +0000 @@ -16,11 +16,11 @@ #include "aom/aom_integer.h" // For AOM_FORCE_INLINE. #include "config/aom_config.h" -static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, - uint8x8_t *a2, uint8x8_t *a3, - uint8x8_t *a4, uint8x8_t *a5, - uint8x8_t *a6, - uint8x8_t *a7) { +static INLINE void transpose_elems_u8_8x8( + uint8x8_t a0, uint8x8_t a1, uint8x8_t a2, uint8x8_t a3, uint8x8_t a4, + uint8x8_t a5, uint8x8_t a6, uint8x8_t a7, uint8x8_t *o0, uint8x8_t *o1, + uint8x8_t *o2, uint8x8_t *o3, uint8x8_t *o4, uint8x8_t *o5, uint8x8_t *o6, + uint8x8_t *o7) { // Swap 8 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 10 11 12 13 14 15 16 17 @@ -36,10 +36,8 @@ // b1.val[0]: 20 30 22 32 24 34 26 36 60 70 62 72 64 74 66 76 // b1.val[1]: 21 31 23 33 25 35 27 37 61 71 63 73 65 75 67 77 - const uint8x16x2_t b0 = - vtrnq_u8(vcombine_u8(*a0, *a4), vcombine_u8(*a1, *a5)); - const uint8x16x2_t b1 = - vtrnq_u8(vcombine_u8(*a2, *a6), vcombine_u8(*a3, *a7)); + const uint8x16x2_t b0 = vtrnq_u8(vcombine_u8(a0, a4), vcombine_u8(a1, a5)); + const uint8x16x2_t b1 = vtrnq_u8(vcombine_u8(a2, a6), vcombine_u8(a3, a7)); // Swap 16 bit elements resulting in: // c0.val[0]: 00 10 20 30 04 14 24 34 40 50 60 70 44 54 64 74 @@ -62,14 +60,235 @@ const uint32x4x2_t d1 = vuzpq_u32(vreinterpretq_u32_u16(c0.val[1]), vreinterpretq_u32_u16(c1.val[1])); - *a0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); - *a1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); - *a2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); - *a3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); - *a4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); - *a5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); - *a6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); - *a7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); + *o0 = vreinterpret_u8_u32(vget_low_u32(d0.val[0])); + *o1 = vreinterpret_u8_u32(vget_high_u32(d0.val[0])); + *o2 = vreinterpret_u8_u32(vget_low_u32(d1.val[0])); + *o3 = vreinterpret_u8_u32(vget_high_u32(d1.val[0])); + *o4 = vreinterpret_u8_u32(vget_low_u32(d0.val[1])); + *o5 = vreinterpret_u8_u32(vget_high_u32(d0.val[1])); + *o6 = vreinterpret_u8_u32(vget_low_u32(d1.val[1])); + *o7 = vreinterpret_u8_u32(vget_high_u32(d1.val[1])); +} + +static INLINE void transpose_elems_inplace_u8_8x8(uint8x8_t *a0, uint8x8_t *a1, + uint8x8_t *a2, uint8x8_t *a3, + uint8x8_t *a4, uint8x8_t *a5, + uint8x8_t *a6, + uint8x8_t *a7) { + transpose_elems_u8_8x8(*a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7, a0, a1, a2, a3, + a4, a5, a6, a7); +} + +static INLINE void transpose_arrays_u8_8x8(const uint8x8_t *in, + uint8x8_t *out) { + transpose_elems_u8_8x8(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7], + &out[0], &out[1], &out[2], &out[3], &out[4], &out[5], + &out[6], &out[7]); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_8x16(const uint8x8_t *x, + uint8x16_t *d) { + uint8x8x2_t w0 = vzip_u8(x[0], x[1]); + uint8x8x2_t w1 = vzip_u8(x[2], x[3]); + uint8x8x2_t w2 = vzip_u8(x[4], x[5]); + 
uint8x8x2_t w3 = vzip_u8(x[6], x[7]); + + uint8x8x2_t w8 = vzip_u8(x[8], x[9]); + uint8x8x2_t w9 = vzip_u8(x[10], x[11]); + uint8x8x2_t w10 = vzip_u8(x[12], x[13]); + uint8x8x2_t w11 = vzip_u8(x[14], x[15]); + + uint16x4x2_t w4 = + vzip_u16(vreinterpret_u16_u8(w0.val[0]), vreinterpret_u16_u8(w1.val[0])); + uint16x4x2_t w5 = + vzip_u16(vreinterpret_u16_u8(w2.val[0]), vreinterpret_u16_u8(w3.val[0])); + uint16x4x2_t w12 = + vzip_u16(vreinterpret_u16_u8(w8.val[0]), vreinterpret_u16_u8(w9.val[0])); + uint16x4x2_t w13 = vzip_u16(vreinterpret_u16_u8(w10.val[0]), + vreinterpret_u16_u8(w11.val[0])); + + uint32x2x2_t w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), + vreinterpret_u32_u16(w5.val[0])); + uint32x2x2_t w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), + vreinterpret_u32_u16(w5.val[1])); + uint32x2x2_t w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), + vreinterpret_u32_u16(w13.val[0])); + uint32x2x2_t w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), + vreinterpret_u32_u16(w13.val[1])); + + // Store first 4-line result + d[0] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); + d[1] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); + d[2] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); + d[3] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); + + w4 = vzip_u16(vreinterpret_u16_u8(w0.val[1]), vreinterpret_u16_u8(w1.val[1])); + w5 = vzip_u16(vreinterpret_u16_u8(w2.val[1]), vreinterpret_u16_u8(w3.val[1])); + w12 = + vzip_u16(vreinterpret_u16_u8(w8.val[1]), vreinterpret_u16_u8(w9.val[1])); + w13 = vzip_u16(vreinterpret_u16_u8(w10.val[1]), + vreinterpret_u16_u8(w11.val[1])); + + w6 = vzip_u32(vreinterpret_u32_u16(w4.val[0]), + vreinterpret_u32_u16(w5.val[0])); + w7 = vzip_u32(vreinterpret_u32_u16(w4.val[1]), + vreinterpret_u32_u16(w5.val[1])); + w14 = vzip_u32(vreinterpret_u32_u16(w12.val[0]), + vreinterpret_u32_u16(w13.val[0])); + w15 = vzip_u32(vreinterpret_u32_u16(w12.val[1]), + vreinterpret_u32_u16(w13.val[1])); + + // Store second 4-line result + d[4] = vreinterpretq_u8_u32(vcombine_u32(w6.val[0], w14.val[0])); + d[5] = vreinterpretq_u8_u32(vcombine_u32(w6.val[1], w14.val[1])); + d[6] = vreinterpretq_u8_u32(vcombine_u32(w7.val[0], w15.val[0])); + d[7] = vreinterpretq_u8_u32(vcombine_u32(w7.val[1], w15.val[1])); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_16x8(const uint8x16_t *x, + uint8x8_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); + uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); + + uint16x8x2_t w4 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0])); + uint16x8x2_t w5 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), + vreinterpretq_u16_u8(w3.val[0])); + uint16x8x2_t w6 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1])); + uint16x8x2_t w7 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), + vreinterpretq_u16_u8(w3.val[1])); + + uint32x4x2_t w8 = vzipq_u32(vreinterpretq_u32_u16(w4.val[0]), + vreinterpretq_u32_u16(w5.val[0])); + uint32x4x2_t w9 = vzipq_u32(vreinterpretq_u32_u16(w6.val[0]), + vreinterpretq_u32_u16(w7.val[0])); + uint32x4x2_t w10 = vzipq_u32(vreinterpretq_u32_u16(w4.val[1]), + vreinterpretq_u32_u16(w5.val[1])); + uint32x4x2_t w11 = vzipq_u32(vreinterpretq_u32_u16(w6.val[1]), + vreinterpretq_u32_u16(w7.val[1])); + + d[0] = vreinterpret_u8_u32(vget_low_u32(w8.val[0])); + d[1] = vreinterpret_u8_u32(vget_high_u32(w8.val[0])); + d[2] = vreinterpret_u8_u32(vget_low_u32(w8.val[1])); + d[3] = 
vreinterpret_u8_u32(vget_high_u32(w8.val[1])); + d[4] = vreinterpret_u8_u32(vget_low_u32(w10.val[0])); + d[5] = vreinterpret_u8_u32(vget_high_u32(w10.val[0])); + d[6] = vreinterpret_u8_u32(vget_low_u32(w10.val[1])); + d[7] = vreinterpret_u8_u32(vget_high_u32(w10.val[1])); + d[8] = vreinterpret_u8_u32(vget_low_u32(w9.val[0])); + d[9] = vreinterpret_u8_u32(vget_high_u32(w9.val[0])); + d[10] = vreinterpret_u8_u32(vget_low_u32(w9.val[1])); + d[11] = vreinterpret_u8_u32(vget_high_u32(w9.val[1])); + d[12] = vreinterpret_u8_u32(vget_low_u32(w11.val[0])); + d[13] = vreinterpret_u8_u32(vget_high_u32(w11.val[0])); + d[14] = vreinterpret_u8_u32(vget_low_u32(w11.val[1])); + d[15] = vreinterpret_u8_u32(vget_high_u32(w11.val[1])); +} + +static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { + uint16x8x2_t b0; +#if AOM_ARCH_AARCH64 + b0.val[0] = vreinterpretq_u16_u64( + vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); + b0.val[1] = vreinterpretq_u16_u64( + vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); +#else + b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), + vreinterpret_u16_u32(vget_low_u32(a1))); + b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), + vreinterpret_u16_u32(vget_high_u32(a1))); +#endif + return b0; +} + +static INLINE void transpose_arrays_u8_16x16(const uint8x16_t *x, + uint8x16_t *d) { + uint8x16x2_t w0 = vzipq_u8(x[0], x[1]); + uint8x16x2_t w1 = vzipq_u8(x[2], x[3]); + uint8x16x2_t w2 = vzipq_u8(x[4], x[5]); + uint8x16x2_t w3 = vzipq_u8(x[6], x[7]); + + uint8x16x2_t w4 = vzipq_u8(x[8], x[9]); + uint8x16x2_t w5 = vzipq_u8(x[10], x[11]); + uint8x16x2_t w6 = vzipq_u8(x[12], x[13]); + uint8x16x2_t w7 = vzipq_u8(x[14], x[15]); + + uint16x8x2_t w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[0]), + vreinterpretq_u16_u8(w1.val[0])); + uint16x8x2_t w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[0]), + vreinterpretq_u16_u8(w3.val[0])); + uint16x8x2_t w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[0]), + vreinterpretq_u16_u8(w5.val[0])); + uint16x8x2_t w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[0]), + vreinterpretq_u16_u8(w7.val[0])); + + uint32x4x2_t w12 = vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), + vreinterpretq_u32_u16(w9.val[0])); + uint32x4x2_t w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), + vreinterpretq_u32_u16(w11.val[0])); + uint32x4x2_t w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), + vreinterpretq_u32_u16(w9.val[1])); + uint32x4x2_t w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), + vreinterpretq_u32_u16(w11.val[1])); + + uint16x8x2_t d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); + d[0] = vreinterpretq_u8_u16(d01.val[0]); + d[1] = vreinterpretq_u8_u16(d01.val[1]); + uint16x8x2_t d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); + d[2] = vreinterpretq_u8_u16(d23.val[0]); + d[3] = vreinterpretq_u8_u16(d23.val[1]); + uint16x8x2_t d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); + d[4] = vreinterpretq_u8_u16(d45.val[0]); + d[5] = vreinterpretq_u8_u16(d45.val[1]); + uint16x8x2_t d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); + d[6] = vreinterpretq_u8_u16(d67.val[0]); + d[7] = vreinterpretq_u8_u16(d67.val[1]); + + // upper half + w8 = vzipq_u16(vreinterpretq_u16_u8(w0.val[1]), + vreinterpretq_u16_u8(w1.val[1])); + w9 = vzipq_u16(vreinterpretq_u16_u8(w2.val[1]), + vreinterpretq_u16_u8(w3.val[1])); + w10 = vzipq_u16(vreinterpretq_u16_u8(w4.val[1]), + vreinterpretq_u16_u8(w5.val[1])); + w11 = vzipq_u16(vreinterpretq_u16_u8(w6.val[1]), + vreinterpretq_u16_u8(w7.val[1])); + + w12 = 
vzipq_u32(vreinterpretq_u32_u16(w8.val[0]), + vreinterpretq_u32_u16(w9.val[0])); + w13 = vzipq_u32(vreinterpretq_u32_u16(w10.val[0]), + vreinterpretq_u32_u16(w11.val[0])); + w14 = vzipq_u32(vreinterpretq_u32_u16(w8.val[1]), + vreinterpretq_u32_u16(w9.val[1])); + w15 = vzipq_u32(vreinterpretq_u32_u16(w10.val[1]), + vreinterpretq_u32_u16(w11.val[1])); + + d01 = aom_vtrnq_u64_to_u16(w12.val[0], w13.val[0]); + d[8] = vreinterpretq_u8_u16(d01.val[0]); + d[9] = vreinterpretq_u8_u16(d01.val[1]); + d23 = aom_vtrnq_u64_to_u16(w12.val[1], w13.val[1]); + d[10] = vreinterpretq_u8_u16(d23.val[0]); + d[11] = vreinterpretq_u8_u16(d23.val[1]); + d45 = aom_vtrnq_u64_to_u16(w14.val[0], w15.val[0]); + d[12] = vreinterpretq_u8_u16(d45.val[0]); + d[13] = vreinterpretq_u8_u16(d45.val[1]); + d67 = aom_vtrnq_u64_to_u16(w14.val[1], w15.val[1]); + d[14] = vreinterpretq_u8_u16(d67.val[0]); + d[15] = vreinterpretq_u8_u16(d67.val[1]); +} + +static AOM_FORCE_INLINE void transpose_arrays_u8_32x16(const uint8x16x2_t *x, + uint8x16_t *d) { + uint8x16_t x2[32]; + for (int i = 0; i < 16; ++i) { + x2[i] = x[i].val[0]; + x2[i + 16] = x[i].val[1]; + } + transpose_arrays_u8_16x16(x2, d); + transpose_arrays_u8_16x16(x2 + 16, d + 16); } static INLINE void transpose_elems_inplace_u8_8x4(uint8x8_t *a0, uint8x8_t *a1, @@ -265,22 +484,6 @@ a[3] = vreinterpretq_u16_u32(c1.val[1]); } -static INLINE uint16x8x2_t aom_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) { - uint16x8x2_t b0; -#if AOM_ARCH_AARCH64 - b0.val[0] = vreinterpretq_u16_u64( - vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); - b0.val[1] = vreinterpretq_u16_u64( - vtrn2q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1))); -#else - b0.val[0] = vcombine_u16(vreinterpret_u16_u32(vget_low_u32(a0)), - vreinterpret_u16_u32(vget_low_u32(a1))); - b0.val[1] = vcombine_u16(vreinterpret_u16_u32(vget_high_u32(a0)), - vreinterpret_u16_u32(vget_high_u32(a1))); -#endif - return b0; -} - // Special transpose for loop filter. // 4x8 Input: // p_q: p3 p2 p1 p0 q0 q1 q2 q3 diff -Nru aom-3.8.2/aom_dsp/bitwriter.c aom-3.9.0/aom_dsp/bitwriter.c --- aom-3.8.2/aom_dsp/bitwriter.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/bitwriter.c 2024-05-07 19:57:02.496000000 +0000 @@ -23,6 +23,10 @@ uint32_t bytes; unsigned char *data; data = od_ec_enc_done(&w->ec, &bytes); + if (!data) { + od_ec_enc_clear(&w->ec); + return -1; + } nb_bits = od_ec_enc_tell(&w->ec); memcpy(w->buffer, data, bytes); w->pos = bytes; diff -Nru aom-3.8.2/aom_dsp/bitwriter.h aom-3.9.0/aom_dsp/bitwriter.h --- aom-3.8.2/aom_dsp/bitwriter.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/bitwriter.h 2024-05-07 19:57:02.497000000 +0000 @@ -62,6 +62,8 @@ void aom_start_encode(aom_writer *w, uint8_t *buffer); +// Returns a negative number on error. Caller must check the return value and +// handle error. 
int aom_stop_encode(aom_writer *w); int aom_tell_size(aom_writer *w); diff -Nru aom-3.8.2/aom_dsp/entenc.c aom-3.9.0/aom_dsp/entenc.c --- aom-3.8.2/aom_dsp/entenc.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/entenc.c 2024-05-07 19:57:02.500000000 +0000 @@ -58,6 +58,7 @@ int d; int c; int s; + if (enc->error) return; c = enc->cnt; assert(rng <= 65535U); /*The number of leading zeros in the 16-bit binary representation of rng.*/ @@ -83,7 +84,6 @@ out = (unsigned char *)realloc(out, sizeof(*out) * storage); if (out == NULL) { enc->error = -1; - enc->offs = 0; return; } enc->buf = out; @@ -372,28 +372,3 @@ uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) { return od_ec_tell_frac(od_ec_enc_tell(enc), enc->rng); } - -/*Saves a entropy coder checkpoint to dst. - This allows an encoder to reverse a series of entropy coder - decisions if it decides that the information would have been - better coded some other way.*/ -void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src) { - OD_COPY(dst, src, 1); -} - -/*Restores an entropy coder checkpoint saved by od_ec_enc_checkpoint. - This can only be used to restore from checkpoints earlier in the target - state's history: you can not switch backwards and forwards or otherwise - switch to a state which isn't a casual ancestor of the current state. - Restore is also incompatible with patching the initial bits, as the - changes will remain in the restored version.*/ -void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src) { - unsigned char *buf; - uint32_t storage; - assert(dst->storage >= src->storage); - buf = dst->buf; - storage = dst->storage; - OD_COPY(dst, src, 1); - dst->buf = buf; - dst->storage = storage; -} diff -Nru aom-3.8.2/aom_dsp/entenc.h aom-3.9.0/aom_dsp/entenc.h --- aom-3.8.2/aom_dsp/entenc.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/entenc.h 2024-05-07 19:57:02.501000000 +0000 @@ -74,9 +74,6 @@ OD_WARN_UNUSED_RESULT uint32_t od_ec_enc_tell_frac(const od_ec_enc *enc) OD_ARG_NONNULL(1); -void od_ec_enc_checkpoint(od_ec_enc *dst, const od_ec_enc *src); -void od_ec_enc_rollback(od_ec_enc *dst, const od_ec_enc *src); - // buf is the frame bitbuffer, offs is where carry to be added static AOM_INLINE void propagate_carry_bwd(unsigned char *buf, uint32_t offs) { uint16_t sum, carry = 1; diff -Nru aom-3.8.2/aom_dsp/flow_estimation/arm/disflow_neon.c aom-3.9.0/aom_dsp/flow_estimation/arm/disflow_neon.c --- aom-3.8.2/aom_dsp/flow_estimation/arm/disflow_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/arm/disflow_neon.c 2024-05-07 19:57:02.506000000 +0000 @@ -19,10 +19,10 @@ #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" -static INLINE void get_cubic_kernel_dbl(double x, double *kernel) { +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. This is fine, // and we still interpolate correctly if we allow x = 1. 
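[Editor's aside on the bitwriter change a few hunks above: aom_stop_encode() can now report failure (od_ec_enc_done() returning NULL), so callers must check its result before trusting w->pos or the output buffer. A hedged caller-side sketch; pack_payload() is a hypothetical function and the aom_write_* payload calls are elided.]

#include <stdint.h>

#include "aom_dsp/bitwriter.h"

// Hypothetical caller (not libaom code) illustrating the new error contract:
// a negative return from aom_stop_encode() means the entropy coder failed
// (e.g. an allocation error) and the buffer contents must not be used.
static int pack_payload(aom_writer *w, uint8_t *buffer) {
  aom_start_encode(w, buffer);
  // ... aom_write()/aom_write_symbol() calls for the payload go here ...
  if (aom_stop_encode(w) < 0) {
    return -1;  // propagate the failure instead of emitting a bad bitstream
  }
  return (int)w->pos;  // number of bytes written, set by aom_stop_encode()
}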
@@ -36,7 +36,7 @@ kernel[3] = -0.5 * x2 + 0.5 * x3; } -static INLINE void get_cubic_kernel_int(double x, int *kernel) { +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); diff -Nru aom-3.8.2/aom_dsp/flow_estimation/corner_detect.c aom-3.9.0/aom_dsp/flow_estimation/corner_detect.c --- aom-3.8.2/aom_dsp/flow_estimation/corner_detect.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/corner_detect.c 2024-05-07 19:57:02.507000000 +0000 @@ -20,6 +20,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_mem/aom_mem.h" +#include "aom_util/aom_pthread.h" #include "av1/common/common.h" #define FAST_BARRIER 18 @@ -39,11 +40,24 @@ return corners; } -static bool compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { - const uint8_t *buf = pyr->layers[0].buffer; - int width = pyr->layers[0].width; - int height = pyr->layers[0].height; - int stride = pyr->layers[0].stride; +static bool compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { + ImagePyramid *pyr = frame->y_pyramid; + const int layers = + aom_compute_pyramid(frame, bit_depth, downsample_level + 1, pyr); + + if (layers < 0) { + return false; + } + + // Clamp downsampling ratio base on max number of layers allowed + // for this frame size + downsample_level = layers - 1; + + const uint8_t *buf = pyr->layers[downsample_level].buffer; + int width = pyr->layers[downsample_level].width; + int height = pyr->layers[downsample_level].height; + int stride = pyr->layers[downsample_level].stride; int *scores = NULL; int num_corners; @@ -53,9 +67,11 @@ if (num_corners <= MAX_CORNERS) { // Use all detected corners - if (num_corners != 0) { - memcpy(corners->corners, frame_corners_xy, - sizeof(*frame_corners_xy) * num_corners); + for (int i = 0; i < num_corners; i++) { + corners->corners[2 * i + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * i + 1] = + frame_corners_xy[i].y * (1 << downsample_level); } corners->num_corners = num_corners; } else { @@ -85,8 +101,10 @@ for (int i = 0; i < num_corners; i++) { if (scores[i] > threshold) { assert(copied_corners < MAX_CORNERS); - corners->corners[2 * copied_corners + 0] = frame_corners_xy[i].x; - corners->corners[2 * copied_corners + 1] = frame_corners_xy[i].y; + corners->corners[2 * copied_corners + 0] = + frame_corners_xy[i].x * (1 << downsample_level); + corners->corners[2 * copied_corners + 1] = + frame_corners_xy[i].y * (1 << downsample_level); copied_corners += 1; } } @@ -99,7 +117,8 @@ return true; } -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners) { +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners) { assert(corners); #if CONFIG_MULTITHREAD @@ -107,7 +126,8 @@ #endif // CONFIG_MULTITHREAD if (!corners->valid) { - corners->valid = compute_corner_list(pyr, corners); + corners->valid = + compute_corner_list(frame, bit_depth, downsample_level, corners); } bool valid = corners->valid; diff -Nru aom-3.8.2/aom_dsp/flow_estimation/corner_detect.h aom-3.9.0/aom_dsp/flow_estimation/corner_detect.h --- aom-3.8.2/aom_dsp/flow_estimation/corner_detect.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/corner_detect.h 2024-05-07 19:57:02.507000000 +0000 @@ -18,7 +18,7 @@ #include #include "aom_dsp/pyramid.h" -#include "aom_util/aom_thread.h" 
+#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,7 +57,8 @@ CornerList *av1_alloc_corner_list(void); -bool av1_compute_corner_list(const ImagePyramid *pyr, CornerList *corners); +bool av1_compute_corner_list(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int downsample_level, CornerList *corners); #ifndef NDEBUG // Check if a corner list has already been computed. diff -Nru aom-3.8.2/aom_dsp/flow_estimation/corner_match.c aom-3.9.0/aom_dsp/flow_estimation/corner_match.c --- aom-3.8.2/aom_dsp/flow_estimation/corner_match.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/corner_match.c 2024-05-07 19:57:02.507000000 +0000 @@ -17,62 +17,84 @@ #include "aom_dsp/flow_estimation/corner_detect.h" #include "aom_dsp/flow_estimation/corner_match.h" +#include "aom_dsp/flow_estimation/disflow.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_dsp/flow_estimation/ransac.h" #include "aom_dsp/pyramid.h" #include "aom_scale/yv12config.h" -#define SEARCH_SZ 9 -#define SEARCH_SZ_BY2 ((SEARCH_SZ - 1) / 2) - #define THRESHOLD_NCC 0.75 -/* Compute var(frame) * MATCH_SZ_SQ over a MATCH_SZ by MATCH_SZ window of frame, - centered at (x, y). +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * and + *one_over_stddev = 1 / (MATCH_SZ * ) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. */ -static double compute_variance(const unsigned char *frame, int stride, int x, - int y) { +bool aom_compute_mean_stddev_c(const unsigned char *frame, int stride, int x, + int y, double *mean, double *one_over_stddev) { int sum = 0; int sumsq = 0; - int var; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { sum += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; sumsq += frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)] * frame[(i + y - MATCH_SZ_BY2) * stride + (j + x - MATCH_SZ_BY2)]; } - var = sumsq * MATCH_SZ_SQ - sum * sum; - return (double)var; + } + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; } -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
*/ -double av1_compute_cross_correlation_c(const unsigned char *frame1, int stride1, - int x1, int y1, - const unsigned char *frame2, int stride2, - int x2, int y2) { +double aom_compute_correlation_c(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { int v1, v2; - int sum1 = 0; - int sum2 = 0; - int sumsq2 = 0; int cross = 0; - int var2, cov; - int i, j; - for (i = 0; i < MATCH_SZ; ++i) - for (j = 0; j < MATCH_SZ; ++j) { + for (int i = 0; i < MATCH_SZ; ++i) { + for (int j = 0; j < MATCH_SZ; ++j) { v1 = frame1[(i + y1 - MATCH_SZ_BY2) * stride1 + (j + x1 - MATCH_SZ_BY2)]; v2 = frame2[(i + y2 - MATCH_SZ_BY2) * stride2 + (j + x2 - MATCH_SZ_BY2)]; - sum1 += v1; - sum2 += v2; - sumsq2 += v2 * v2; cross += v1 * v2; } - var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + } + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + double covariance = cross - mean1 * mean2; + double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } static int is_eligible_point(int pointx, int pointy, int width, int height) { @@ -87,65 +109,14 @@ (point1y - point2y) * (point1y - point2y)) <= thresh * thresh; } -static void improve_correspondence(const unsigned char *src, - const unsigned char *ref, int width, - int height, int src_stride, int ref_stride, - Correspondence *correspondences, - int num_correspondences) { - int i; - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - // For this algorithm, all points have integer coordinates. 
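[Editor's aside: the scaling argument in the comment above can be verified numerically. The standalone program below is an illustration, not library code; any MATCH_SZ value works (16 matches the new setting further down). It computes the normalized cross-correlation both the textbook way and via the scaled mean / reciprocal-stddev form that aom_compute_mean_stddev() and aom_compute_correlation() use, and the two results agree up to floating-point rounding.]

#include <math.h>
#include <stdio.h>

#define MATCH_SZ 16
#define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ)

int main(void) {
  // Two arbitrary synthetic patches.
  unsigned char p1[MATCH_SZ_SQ], p2[MATCH_SZ_SQ];
  for (int i = 0; i < MATCH_SZ_SQ; i++) {
    p1[i] = (unsigned char)(i * 7 % 251);
    p2[i] = (unsigned char)(i * 13 % 241);
  }
  double sum1 = 0, sum2 = 0, sumsq1 = 0, sumsq2 = 0, cross = 0;
  for (int i = 0; i < MATCH_SZ_SQ; i++) {
    sum1 += p1[i];
    sum2 += p2[i];
    sumsq1 += (double)p1[i] * p1[i];
    sumsq2 += (double)p2[i] * p2[i];
    cross += (double)p1[i] * p2[i];
  }
  const double n = MATCH_SZ_SQ;
  // Textbook form: one division and two square roots per pair of patches.
  const double cov = cross / n - (sum1 / n) * (sum2 / n);
  const double sd1 = sqrt(sumsq1 / n - (sum1 / n) * (sum1 / n));
  const double sd2 = sqrt(sumsq2 / n - (sum2 / n) * (sum2 / n));
  const double corr_ref = cov / (sd1 * sd2);
  // Scaled form from above: mean * MATCH_SZ and 1 / (stddev * MATCH_SZ) are
  // precomputed once per patch, so the per-pair work needs no division.
  const double mean1 = sum1 / MATCH_SZ, mean2 = sum2 / MATCH_SZ;
  const double inv_sd1 = 1.0 / sqrt(sumsq1 - mean1 * mean1);
  const double inv_sd2 = 1.0 / sqrt(sumsq2 - mean2 * mean2);
  const double corr_scaled = (cross - mean1 * mean2) * inv_sd1 * inv_sd2;
  printf("textbook %.12f  scaled %.12f\n", corr_ref, corr_scaled);
  return 0;
}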
- // It's a little more efficient to convert them to ints once, - // before the inner loops - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) { - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(rx0 + x, ry0 + y, width, height)) continue; - if (!is_eligible_distance(x0, y0, rx0 + x, ry0 + y, width, height)) - continue; - match_ncc = av1_compute_cross_correlation(src, src_stride, x0, y0, ref, - ref_stride, rx0 + x, ry0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - } - correspondences[i].rx += best_x; - correspondences[i].ry += best_y; - } - for (i = 0; i < num_correspondences; ++i) { - int x, y, best_x = 0, best_y = 0; - double best_match_ncc = 0.0; - int x0 = (int)correspondences[i].x; - int y0 = (int)correspondences[i].y; - int rx0 = (int)correspondences[i].rx; - int ry0 = (int)correspondences[i].ry; - for (y = -SEARCH_SZ_BY2; y <= SEARCH_SZ_BY2; ++y) - for (x = -SEARCH_SZ_BY2; x <= SEARCH_SZ_BY2; ++x) { - double match_ncc; - if (!is_eligible_point(x0 + x, y0 + y, width, height)) continue; - if (!is_eligible_distance(x0 + x, y0 + y, rx0, ry0, width, height)) - continue; - match_ncc = av1_compute_cross_correlation( - ref, ref_stride, rx0, ry0, src, src_stride, x0 + x, y0 + y); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_y = y; - best_x = x; - } - } - correspondences[i].x += best_x; - correspondences[i].y += best_y; - } -} +typedef struct { + int x; + int y; + double mean; + double one_over_stddev; + int best_match_idx; + double best_match_corr; +} PointInfo; static int determine_correspondence(const unsigned char *src, const int *src_corners, int num_src_corners, @@ -154,56 +125,136 @@ int width, int height, int src_stride, int ref_stride, Correspondence *correspondences) { - // TODO(sarahparker) Improve this to include 2-way match - int i, j; + PointInfo *src_point_info = NULL; + PointInfo *ref_point_info = NULL; int num_correspondences = 0; - for (i = 0; i < num_src_corners; ++i) { - double best_match_ncc = 0.0; - double template_norm; - int best_match_j = -1; - if (!is_eligible_point(src_corners[2 * i], src_corners[2 * i + 1], width, - height)) + + src_point_info = + (PointInfo *)aom_calloc(num_src_corners, sizeof(*src_point_info)); + if (!src_point_info) { + goto finished; + } + + ref_point_info = + (PointInfo *)aom_calloc(num_ref_corners, sizeof(*ref_point_info)); + if (!ref_point_info) { + goto finished; + } + + // First pass (linear): + // Filter corner lists and compute per-patch means and standard deviations, + // for the src and ref frames independently + int src_point_count = 0; + for (int i = 0; i < num_src_corners; i++) { + int src_x = src_corners[2 * i]; + int src_y = src_corners[2 * i + 1]; + if (!is_eligible_point(src_x, src_y, width, height)) continue; + + PointInfo *point = &src_point_info[src_point_count]; + point->x = src_x; + point->y = src_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(src, src_stride, src_x, src_y, &point->mean, + &point->one_over_stddev)) continue; - for (j = 0; j < num_ref_corners; ++j) { - double match_ncc; - if (!is_eligible_point(ref_corners[2 * j], ref_corners[2 * j + 1], width, - height)) - continue; - if (!is_eligible_distance(src_corners[2 * i], src_corners[2 * i + 1], - ref_corners[2 * j], ref_corners[2 * j + 1], - 
width, height)) + src_point_count++; + } + if (src_point_count == 0) { + goto finished; + } + + int ref_point_count = 0; + for (int j = 0; j < num_ref_corners; j++) { + int ref_x = ref_corners[2 * j]; + int ref_y = ref_corners[2 * j + 1]; + if (!is_eligible_point(ref_x, ref_y, width, height)) continue; + + PointInfo *point = &ref_point_info[ref_point_count]; + point->x = ref_x; + point->y = ref_y; + point->best_match_corr = THRESHOLD_NCC; + if (!aom_compute_mean_stddev(ref, ref_stride, ref_x, ref_y, &point->mean, + &point->one_over_stddev)) + continue; + ref_point_count++; + } + if (ref_point_count == 0) { + goto finished; + } + + // Second pass (quadratic): + // For each pair of points, compute correlation, and use this to determine + // the best match of each corner, in both directions + for (int i = 0; i < src_point_count; ++i) { + PointInfo *src_point = &src_point_info[i]; + for (int j = 0; j < ref_point_count; ++j) { + PointInfo *ref_point = &ref_point_info[j]; + if (!is_eligible_distance(src_point->x, src_point->y, ref_point->x, + ref_point->y, width, height)) continue; - match_ncc = av1_compute_cross_correlation( - src, src_stride, src_corners[2 * i], src_corners[2 * i + 1], ref, - ref_stride, ref_corners[2 * j], ref_corners[2 * j + 1]); - if (match_ncc > best_match_ncc) { - best_match_ncc = match_ncc; - best_match_j = j; + + double corr = aom_compute_correlation( + src, src_stride, src_point->x, src_point->y, src_point->mean, + src_point->one_over_stddev, ref, ref_stride, ref_point->x, + ref_point->y, ref_point->mean, ref_point->one_over_stddev); + + if (corr > src_point->best_match_corr) { + src_point->best_match_idx = j; + src_point->best_match_corr = corr; + } + if (corr > ref_point->best_match_corr) { + ref_point->best_match_idx = i; + ref_point->best_match_corr = corr; } } - // Note: We want to test if the best correlation is >= THRESHOLD_NCC, - // but need to account for the normalization in - // av1_compute_cross_correlation. 
- template_norm = compute_variance(src, src_stride, src_corners[2 * i], - src_corners[2 * i + 1]); - if (best_match_ncc > THRESHOLD_NCC * sqrt(template_norm)) { - correspondences[num_correspondences].x = src_corners[2 * i]; - correspondences[num_correspondences].y = src_corners[2 * i + 1]; - correspondences[num_correspondences].rx = ref_corners[2 * best_match_j]; - correspondences[num_correspondences].ry = - ref_corners[2 * best_match_j + 1]; + } + + // Third pass (linear): + // Scan through source corners, generating a correspondence for each corner + // iff ref_best_match[src_best_match[i]] == i + // Then refine the generated correspondences using optical flow + for (int i = 0; i < src_point_count; i++) { + PointInfo *point = &src_point_info[i]; + + // Skip corners which were not matched, or which didn't find + // a good enough match + if (point->best_match_corr < THRESHOLD_NCC) continue; + + PointInfo *match_point = &ref_point_info[point->best_match_idx]; + if (match_point->best_match_idx == i) { + // Refine match using optical flow and store + const int sx = point->x; + const int sy = point->y; + const int rx = match_point->x; + const int ry = match_point->y; + double u = (double)(rx - sx); + double v = (double)(ry - sy); + + const int patch_tl_x = sx - DISFLOW_PATCH_CENTER; + const int patch_tl_y = sy - DISFLOW_PATCH_CENTER; + + aom_compute_flow_at_point(src, ref, patch_tl_x, patch_tl_y, width, height, + src_stride, &u, &v); + + Correspondence *correspondence = &correspondences[num_correspondences]; + correspondence->x = (double)sx; + correspondence->y = (double)sy; + correspondence->rx = (double)sx + u; + correspondence->ry = (double)sy + v; num_correspondences++; } } - improve_correspondence(src, ref, width, height, src_stride, ref_stride, - correspondences, num_correspondences); + +finished: + aom_free(src_point_info); + aom_free(ref_point_info); return num_correspondences; } bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed) { + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { int num_correspondences; Correspondence *correspondences; ImagePyramid *src_pyramid = src->y_pyramid; @@ -212,19 +263,19 @@ CornerList *ref_corners = ref->corners; // Precompute information we will need about each frame - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { + if (aom_compute_pyramid(src, bit_depth, 1, src_pyramid) < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + if (aom_compute_pyramid(ref, bit_depth, 1, ref_pyramid) < 0) { *mem_alloc_failed = true; return false; } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, ref_corners)) { *mem_alloc_failed = true; return false; } diff -Nru aom-3.8.2/aom_dsp/flow_estimation/corner_match.h aom-3.9.0/aom_dsp/flow_estimation/corner_match.h --- aom-3.8.2/aom_dsp/flow_estimation/corner_match.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/corner_match.h 2024-05-07 19:57:02.508000000 +0000 @@ -25,14 +25,20 @@ extern "C" { #endif -#define MATCH_SZ 13 +#define MATCH_SZ 16 #define 
MATCH_SZ_BY2 ((MATCH_SZ - 1) / 2) #define MATCH_SZ_SQ (MATCH_SZ * MATCH_SZ) +// Minimum threshold for the variance of a patch, in order for it to be +// considered useful for matching. +// This is evaluated against the scaled variance MATCH_SZ_SQ * sigma^2, +// so a setting of 1 * MATCH_SZ_SQ corresponds to an unscaled variance of 1 +#define MIN_FEATURE_VARIANCE (1 * MATCH_SZ_SQ) + bool av1_compute_global_motion_feature_match( TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, - int bit_depth, MotionModel *motion_models, int num_motion_models, - bool *mem_alloc_failed); + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } diff -Nru aom-3.8.2/aom_dsp/flow_estimation/disflow.c aom-3.9.0/aom_dsp/flow_estimation/disflow.c --- aom-3.8.2/aom_dsp/flow_estimation/disflow.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/disflow.c 2024-05-07 19:57:02.508000000 +0000 @@ -24,24 +24,29 @@ #include "config/aom_dsp_rtcd.h" -// TODO(rachelbarker): -// Implement specialized functions for upscaling flow fields, -// replacing av1_upscale_plane_double_prec(). -// Then we can avoid needing to include code from av1/ -#include "av1/common/resize.h" - // Amount to downsample the flow field by. -// eg. DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate +// e.g., DOWNSAMPLE_SHIFT = 2 (DOWNSAMPLE_FACTOR == 4) means we calculate // one flow point for each 4x4 pixel region of the frame // Must be a power of 2 #define DOWNSAMPLE_SHIFT 3 #define DOWNSAMPLE_FACTOR (1 << DOWNSAMPLE_SHIFT) + +// Filters used when upscaling the flow field from one pyramid level +// to another. See upscale_flow_component for details on kernel selection +#define FLOW_UPSCALE_TAPS 4 + // Number of outermost flow field entries (on each edge) which can't be // computed, because the patch they correspond to extends outside of the // frame // The border is (DISFLOW_PATCH_SIZE >> 1) pixels, which is // (DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT many flow field entries -#define FLOW_BORDER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT) +#define FLOW_BORDER_INNER ((DISFLOW_PATCH_SIZE >> 1) >> DOWNSAMPLE_SHIFT) + +// Number of extra padding entries on each side of the flow field. +// These samples are added so that we do not need to apply clamping when +// interpolating or upsampling the flow field +#define FLOW_BORDER_OUTER (FLOW_UPSCALE_TAPS / 2) + // When downsampling the flow field, each flow field entry covers a square // region of pixels in the image pyramid. This value is equal to the position // of the center of that region, as an offset from the top/left edge. @@ -52,10 +57,16 @@ // this gives the correct offset of 0 instead of -1. #define UPSAMPLE_CENTER_OFFSET ((DOWNSAMPLE_FACTOR - 1) / 2) -static INLINE void get_cubic_kernel_dbl(double x, double *kernel) { +static double flow_upscale_filter[2][FLOW_UPSCALE_TAPS] = { + // Cubic interpolation kernels for phase=0.75 and phase=0.25, respectively + { -3 / 128., 29 / 128., 111 / 128., -9 / 128. }, + { -9 / 128., 111 / 128., 29 / 128., -3 / 128. } +}; + +static INLINE void get_cubic_kernel_dbl(double x, double kernel[4]) { // Check that the fractional position is in range. // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. + // Note: x is calculated from, e.g., `u_frac = u - floor(u)`. // Mathematically, this implies that 0 <= x < 1. However, in practice it is // possible to have x == 1 due to floating point rounding. 
This is fine, // and we still interpolate correctly if we allow x = 1. @@ -69,7 +80,7 @@ kernel[3] = -0.5 * x2 + 0.5 * x3; } -static INLINE void get_cubic_kernel_int(double x, int *kernel) { +static INLINE void get_cubic_kernel_int(double x, int kernel[4]) { double kernel_dbl[4]; get_cubic_kernel_dbl(x, kernel_dbl); @@ -80,18 +91,19 @@ } static INLINE double get_cubic_value_dbl(const double *p, - const double *kernel) { + const double kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } -static INLINE int get_cubic_value_int(const int *p, const int *kernel) { +static INLINE int get_cubic_value_int(const int *p, const int kernel[4]) { return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + kernel[3] * p[3]; } static INLINE double bicubic_interp_one(const double *arr, int stride, - double *h_kernel, double *v_kernel) { + const double h_kernel[4], + const double v_kernel[4]) { double tmp[1 * 4]; // Horizontal convolution @@ -103,7 +115,9 @@ return get_cubic_value_dbl(tmp, v_kernel); } -static int determine_disflow_correspondence(CornerList *corners, +static int determine_disflow_correspondence(const ImagePyramid *src_pyr, + const ImagePyramid *ref_pyr, + CornerList *corners, const FlowField *flow, Correspondence *correspondences) { const int width = flow->width; @@ -132,7 +146,15 @@ const double flow_sub_y = (y & (DOWNSAMPLE_FACTOR - 1)) / (double)DOWNSAMPLE_FACTOR; - // Make sure that bicubic interpolation won't read outside of the flow field + // Exclude points which would sample from the outer border of the flow + // field, as this would give lower-quality results. + // + // Note: As we never read from the border region at pyramid level 0, we + // can skip filling it in. If the conditions here are removed, or any + // other logic is added which reads from this border region, then + // compute_flow_field() will need to be modified to call + // fill_flow_field_borders() at pyramid level 0 to set up the correct + // border data. if (flow_x < 1 || (flow_x + 2) >= width) continue; if (flow_y < 1 || (flow_y + 2) >= height) continue; @@ -141,10 +163,18 @@ get_cubic_kernel_dbl(flow_sub_x, h_kernel); get_cubic_kernel_dbl(flow_sub_y, v_kernel); - const double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x], - stride, h_kernel, v_kernel); - const double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x], - stride, h_kernel, v_kernel); + double flow_u = bicubic_interp_one(&flow->u[flow_y * stride + flow_x], + stride, h_kernel, v_kernel); + double flow_v = bicubic_interp_one(&flow->v[flow_y * stride + flow_x], + stride, h_kernel, v_kernel); + + // Refine the interpolated flow vector one last time + const int patch_tl_x = x0 - DISFLOW_PATCH_CENTER; + const int patch_tl_y = y0 - DISFLOW_PATCH_CENTER; + aom_compute_flow_at_point( + src_pyr->layers[0].buffer, ref_pyr->layers[0].buffer, patch_tl_x, + patch_tl_y, src_pyr->layers[0].width, src_pyr->layers[0].height, + src_pyr->layers[0].stride, &flow_u, &flow_v); // Use original points (without offsets) when filling in correspondence // array @@ -420,16 +450,16 @@ // Calculate the bounds of the rectangle which was filled in by // compute_flow_field() before calling this function. // These indices are inclusive on both ends. 
- const int left_index = FLOW_BORDER; - const int right_index = (width - FLOW_BORDER - 1); - const int top_index = FLOW_BORDER; - const int bottom_index = (height - FLOW_BORDER - 1); + const int left_index = FLOW_BORDER_INNER; + const int right_index = (width - FLOW_BORDER_INNER - 1); + const int top_index = FLOW_BORDER_INNER; + const int bottom_index = (height - FLOW_BORDER_INNER - 1); // Left area for (int i = top_index; i <= bottom_index; i += 1) { double *row = flow + i * stride; const double left = row[left_index]; - for (int j = 0; j < left_index; j++) { + for (int j = -FLOW_BORDER_OUTER; j < left_index; j++) { row[j] = left; } } @@ -438,45 +468,178 @@ for (int i = top_index; i <= bottom_index; i += 1) { double *row = flow + i * stride; const double right = row[right_index]; - for (int j = right_index + 1; j < width; j++) { + for (int j = right_index + 1; j < width + FLOW_BORDER_OUTER; j++) { row[j] = right; } } // Top area - const double *top_row = flow + top_index * stride; - for (int i = 0; i < top_index; i++) { - double *row = flow + i * stride; - memcpy(row, top_row, width * sizeof(*row)); + const double *top_row = flow + top_index * stride - FLOW_BORDER_OUTER; + for (int i = -FLOW_BORDER_OUTER; i < top_index; i++) { + double *row = flow + i * stride - FLOW_BORDER_OUTER; + size_t length = width + 2 * FLOW_BORDER_OUTER; + memcpy(row, top_row, length * sizeof(*row)); } // Bottom area - const double *bottom_row = flow + bottom_index * stride; - for (int i = bottom_index + 1; i < height; i++) { - double *row = flow + i * stride; - memcpy(row, bottom_row, width * sizeof(*row)); + const double *bottom_row = flow + bottom_index * stride - FLOW_BORDER_OUTER; + for (int i = bottom_index + 1; i < height + FLOW_BORDER_OUTER; i++) { + double *row = flow + i * stride - FLOW_BORDER_OUTER; + size_t length = width + 2 * FLOW_BORDER_OUTER; + memcpy(row, bottom_row, length * sizeof(*row)); + } +} + +// Upscale one component of the flow field, from a size of +// cur_width x cur_height to a size of (2*cur_width) x (2*cur_height), storing +// the result back into the same buffer. This function also scales the flow +// vector by 2, so that when we move to the next pyramid level down, the implied +// motion vector is the same. +// +// The temporary buffer tmpbuf must be large enough to hold an intermediate +// array of size stride * cur_height, *plus* FLOW_BORDER_OUTER rows above and +// below. In other words, indices from -FLOW_BORDER_OUTER * stride to +// (cur_height + FLOW_BORDER_OUTER) * stride - 1 must be valid. +// +// Note that the same stride is used for u before and after upscaling +// and for the temporary buffer, for simplicity. +// +// A note on phasing: +// +// The flow fields at two adjacent pyramid levels are offset from each other, +// and we need to account for this in the construction of the interpolation +// kernels. +// +// Consider an 8x8 pixel patch at pyramid level n. This is split into four +// patches at pyramid level n-1. Bringing these patches back up to pyramid level +// n, each sub-patch covers 4x4 pixels, and between them they cover the same +// 8x8 region. 
+// +// Therefore, at pyramid level n, two adjacent patches look like this: +// +// + - - - - - - - + - - - - - - - + +// | | | +// | x x | x x | +// | | | +// | # | # | +// | | | +// | x x | x x | +// | | | +// + - - - - - - - + - - - - - - - + +// +// where # marks the center of a patch at pyramid level n (the input to this +// function), and x marks the center of a patch at pyramid level n-1 (the output +// of this function). +// +// By counting pixels (marked by +, -, and |), we can see that the flow vectors +// at pyramid level n-1 are offset relative to the flow vectors at pyramid +// level n, by 1/4 of the larger (input) patch size. Therefore, our +// interpolation kernels need to have phases of 0.25 and 0.75. +// +// In addition, in order to handle the frame edges correctly, we need to +// generate one output vector to the left and one to the right of each input +// vector, even though these must be interpolated using different source points. +static void upscale_flow_component(double *flow, int cur_width, int cur_height, + int stride, double *tmpbuf) { + const int half_len = FLOW_UPSCALE_TAPS / 2; + + // Check that the outer border is large enough to avoid needing to clamp + // the source locations + assert(half_len <= FLOW_BORDER_OUTER); + + // Horizontal upscale and multiply by 2 + for (int i = 0; i < cur_height; i++) { + for (int j = 0; j < cur_width; j++) { + double left = 0; + for (int k = -half_len; k < half_len; k++) { + left += + flow[i * stride + (j + k)] * flow_upscale_filter[0][k + half_len]; + } + tmpbuf[i * stride + (2 * j + 0)] = 2.0 * left; + + // Right output pixel is 0.25 units to the right of the input pixel + double right = 0; + for (int k = -(half_len - 1); k < (half_len + 1); k++) { + right += flow[i * stride + (j + k)] * + flow_upscale_filter[1][k + (half_len - 1)]; + } + tmpbuf[i * stride + (2 * j + 1)] = 2.0 * right; + } + } + + // Fill in top and bottom borders of tmpbuf + const double *top_row = &tmpbuf[0]; + for (int i = -FLOW_BORDER_OUTER; i < 0; i++) { + double *row = &tmpbuf[i * stride]; + memcpy(row, top_row, 2 * cur_width * sizeof(*row)); + } + + const double *bottom_row = &tmpbuf[(cur_height - 1) * stride]; + for (int i = cur_height; i < cur_height + FLOW_BORDER_OUTER; i++) { + double *row = &tmpbuf[i * stride]; + memcpy(row, bottom_row, 2 * cur_width * sizeof(*row)); + } + + // Vertical upscale + int upscaled_width = cur_width * 2; + for (int i = 0; i < cur_height; i++) { + for (int j = 0; j < upscaled_width; j++) { + double top = 0; + for (int k = -half_len; k < half_len; k++) { + top += + tmpbuf[(i + k) * stride + j] * flow_upscale_filter[0][k + half_len]; + } + flow[(2 * i) * stride + j] = top; + + double bottom = 0; + for (int k = -(half_len - 1); k < (half_len + 1); k++) { + bottom += tmpbuf[(i + k) * stride + j] * + flow_upscale_filter[1][k + (half_len - 1)]; + } + flow[(2 * i + 1) * stride + j] = bottom; + } } } // make sure flow_u and flow_v start at 0 static bool compute_flow_field(const ImagePyramid *src_pyr, - const ImagePyramid *ref_pyr, FlowField *flow) { + const ImagePyramid *ref_pyr, int n_levels, + FlowField *flow) { bool mem_status = true; - assert(src_pyr->n_levels == ref_pyr->n_levels); double *flow_u = flow->u; double *flow_v = flow->v; - const size_t flow_size = flow->stride * (size_t)flow->height; - double *u_upscale = aom_malloc(flow_size * sizeof(*u_upscale)); - double *v_upscale = aom_malloc(flow_size * sizeof(*v_upscale)); - if (!u_upscale || !v_upscale) { - mem_status = false; - goto free_uvscale; + double *tmpbuf0; + 
double *tmpbuf; + + if (n_levels < 2) { + // tmpbuf not needed + tmpbuf0 = NULL; + tmpbuf = NULL; + } else { + // This line must match the calculation of cur_flow_height below + const int layer1_height = src_pyr->layers[1].height >> DOWNSAMPLE_SHIFT; + + const size_t tmpbuf_size = + (layer1_height + 2 * FLOW_BORDER_OUTER) * flow->stride; + tmpbuf0 = aom_malloc(tmpbuf_size * sizeof(*tmpbuf0)); + if (!tmpbuf0) { + mem_status = false; + goto free_tmpbuf; + } + tmpbuf = tmpbuf0 + FLOW_BORDER_OUTER * flow->stride; } // Compute flow field from coarsest to finest level of the pyramid - for (int level = src_pyr->n_levels - 1; level >= 0; --level) { + // + // Note: We stop after refining pyramid level 1 and interpolating it to + // generate an initial flow field at level 0. We do *not* refine the dense + // flow field at level 0. Instead, we wait until we have generated + // correspondences by interpolating this flow field, and then refine the + // correspondences themselves. This is both faster and gives better output + // compared to refining the flow field at level 0 and then interpolating. + for (int level = n_levels - 1; level >= 1; --level) { const PyramidLayer *cur_layer = &src_pyr->layers[level]; const int cur_width = cur_layer->width; const int cur_height = cur_layer->height; @@ -489,8 +652,10 @@ const int cur_flow_height = cur_height >> DOWNSAMPLE_SHIFT; const int cur_flow_stride = flow->stride; - for (int i = FLOW_BORDER; i < cur_flow_height - FLOW_BORDER; i += 1) { - for (int j = FLOW_BORDER; j < cur_flow_width - FLOW_BORDER; j += 1) { + for (int i = FLOW_BORDER_INNER; i < cur_flow_height - FLOW_BORDER_INNER; + i += 1) { + for (int j = FLOW_BORDER_INNER; j < cur_flow_width - FLOW_BORDER_INNER; + j += 1) { const int flow_field_idx = i * cur_flow_stride + j; // Calculate the position of a patch of size DISFLOW_PATCH_SIZE pixels, @@ -523,28 +688,10 @@ const int upscale_flow_height = cur_flow_height << 1; const int upscale_stride = flow->stride; - bool upscale_u_plane = av1_upscale_plane_double_prec( - flow_u, cur_flow_height, cur_flow_width, cur_flow_stride, u_upscale, - upscale_flow_height, upscale_flow_width, upscale_stride); - bool upscale_v_plane = av1_upscale_plane_double_prec( - flow_v, cur_flow_height, cur_flow_width, cur_flow_stride, v_upscale, - upscale_flow_height, upscale_flow_width, upscale_stride); - if (!upscale_u_plane || !upscale_v_plane) { - mem_status = false; - goto free_uvscale; - } - - // Multiply all flow vectors by 2. - // When we move down a pyramid level, the image resolution doubles. 
- // Thus we need to double all vectors in order for them to represent - // the same translation at the next level down - for (int i = 0; i < upscale_flow_height; i++) { - for (int j = 0; j < upscale_flow_width; j++) { - const int index = i * upscale_stride + j; - flow_u[index] = u_upscale[index] * 2.0; - flow_v[index] = v_upscale[index] * 2.0; - } - } + upscale_flow_component(flow_u, cur_flow_width, cur_flow_height, + cur_flow_stride, tmpbuf); + upscale_flow_component(flow_v, cur_flow_width, cur_flow_height, + cur_flow_stride, tmpbuf); // If we didn't fill in the rightmost column or bottommost row during // upsampling (in order to keep the ratio to exactly 2), fill them @@ -574,9 +721,9 @@ } } } -free_uvscale: - aom_free(u_upscale); - aom_free(v_upscale); + +free_tmpbuf: + aom_free(tmpbuf0); return mem_status; } @@ -587,25 +734,25 @@ // Calculate the size of the bottom (largest) layer of the flow pyramid flow->width = frame_width >> DOWNSAMPLE_SHIFT; flow->height = frame_height >> DOWNSAMPLE_SHIFT; - flow->stride = flow->width; + flow->stride = flow->width + 2 * FLOW_BORDER_OUTER; - const size_t flow_size = flow->stride * (size_t)flow->height; - flow->u = aom_calloc(flow_size, sizeof(*flow->u)); - flow->v = aom_calloc(flow_size, sizeof(*flow->v)); - - if (flow->u == NULL || flow->v == NULL) { - aom_free(flow->u); - aom_free(flow->v); + const size_t flow_size = + flow->stride * (size_t)(flow->height + 2 * FLOW_BORDER_OUTER); + + flow->buf0 = aom_calloc(2 * flow_size, sizeof(*flow->buf0)); + if (!flow->buf0) { aom_free(flow); return NULL; } + flow->u = flow->buf0 + FLOW_BORDER_OUTER * flow->stride + FLOW_BORDER_OUTER; + flow->v = flow->u + flow_size; + return flow; } static void free_flow_field(FlowField *flow) { - aom_free(flow->u); - aom_free(flow->v); + aom_free(flow->buf0); aom_free(flow); } @@ -615,29 +762,31 @@ // Following the convention in flow_estimation.h, the flow vectors are computed // at fixed points in `src` and point to the corresponding locations in `ref`, // regardless of the temporal ordering of the frames. 
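The convention restated in the comment above fixes how the dense flow field becomes point correspondences for RANSAC: a flow vector (u, v) stored at source position (x, y) always maps to (x + u, y + v) in the reference frame, regardless of which frame is earlier in time. A minimal sketch, assuming the Correspondence fields (x, y, rx, ry) from flow_estimation.h; the helper name is illustrative, not from the patch:

// Sketch: convert one refined flow sample into a Correspondence, following
// the convention that flow is anchored in `src` and points into `ref`.
static void flow_to_correspondence(double x, double y, double u, double v,
                                   Correspondence *corr) {
  corr->x = x;       // position in the source frame
  corr->y = y;
  corr->rx = x + u;  // matching position in the reference frame
  corr->ry = y + v;
}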
-bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed) { +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed) { // Precompute information we will need about each frame ImagePyramid *src_pyramid = src->y_pyramid; CornerList *src_corners = src->corners; ImagePyramid *ref_pyramid = ref->y_pyramid; - if (!aom_compute_pyramid(src, bit_depth, src_pyramid)) { - *mem_alloc_failed = true; - return false; - } - if (!av1_compute_corner_list(src_pyramid, src_corners)) { + + const int src_layers = + aom_compute_pyramid(src, bit_depth, DISFLOW_PYRAMID_LEVELS, src_pyramid); + const int ref_layers = + aom_compute_pyramid(ref, bit_depth, DISFLOW_PYRAMID_LEVELS, ref_pyramid); + + if (src_layers < 0 || ref_layers < 0) { *mem_alloc_failed = true; return false; } - if (!aom_compute_pyramid(ref, bit_depth, ref_pyramid)) { + if (!av1_compute_corner_list(src, bit_depth, downsample_level, src_corners)) { *mem_alloc_failed = true; return false; } + assert(src_layers == ref_layers); + const int src_width = src_pyramid->layers[0].width; const int src_height = src_pyramid->layers[0].height; assert(ref_pyramid->layers[0].width == src_width); @@ -649,7 +798,7 @@ return false; } - if (!compute_flow_field(src_pyramid, ref_pyramid, flow)) { + if (!compute_flow_field(src_pyramid, ref_pyramid, src_layers, flow)) { *mem_alloc_failed = true; free_flow_field(flow); return false; @@ -664,8 +813,8 @@ return false; } - const int num_correspondences = - determine_disflow_correspondence(src_corners, flow, correspondences); + const int num_correspondences = determine_disflow_correspondence( + src_pyramid, ref_pyramid, src_corners, flow, correspondences); bool result = ransac(correspondences, num_correspondences, type, motion_models, num_motion_models, mem_alloc_failed); diff -Nru aom-3.8.2/aom_dsp/flow_estimation/disflow.h aom-3.9.0/aom_dsp/flow_estimation/disflow.h --- aom-3.8.2/aom_dsp/flow_estimation/disflow.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/disflow.h 2024-05-07 19:57:02.510000000 +0000 @@ -15,7 +15,6 @@ #include #include "aom_dsp/flow_estimation/flow_estimation.h" -#include "aom_dsp/rect.h" #include "aom_scale/yv12config.h" #ifdef __cplusplus @@ -79,6 +78,9 @@ #define DISFLOW_INTERP_BITS 14 typedef struct { + // Start of allocation for u and v buffers + double *buf0; + // x and y directions of flow, per patch double *u; double *v; @@ -89,12 +91,10 @@ int stride; } FlowField; -bool av1_compute_global_motion_disflow(TransformationType type, - YV12_BUFFER_CONFIG *src, - YV12_BUFFER_CONFIG *ref, int bit_depth, - MotionModel *motion_models, - int num_motion_models, - bool *mem_alloc_failed); +bool av1_compute_global_motion_disflow( + TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, + int bit_depth, int downsample_level, MotionModel *motion_models, + int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus } diff -Nru aom-3.8.2/aom_dsp/flow_estimation/flow_estimation.c aom-3.9.0/aom_dsp/flow_estimation/flow_estimation.c --- aom-3.8.2/aom_dsp/flow_estimation/flow_estimation.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/flow_estimation.c 2024-05-07 19:57:02.510000000 +0000 @@ -18,14 +18,6 @@ 
#include "aom_ports/mem.h" #include "aom_scale/yv12config.h" -// For each global motion method, how many pyramid levels should we allocate? -// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS] = { - 1, // GLOBAL_MOTION_METHOD_FEATURE_MATCH - 16, // GLOBAL_MOTION_METHOD_DISFLOW -}; - // clang-format off const double kIdentityParams[MAX_PARAMDIM] = { 0.0, 0.0, 1.0, 0.0, 0.0, 1.0 @@ -43,17 +35,17 @@ bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed) { switch (gm_method) { case GLOBAL_MOTION_METHOD_FEATURE_MATCH: return av1_compute_global_motion_feature_match( - type, src, ref, bit_depth, motion_models, num_motion_models, - mem_alloc_failed); + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); case GLOBAL_MOTION_METHOD_DISFLOW: - return av1_compute_global_motion_disflow(type, src, ref, bit_depth, - motion_models, num_motion_models, - mem_alloc_failed); + return av1_compute_global_motion_disflow( + type, src, ref, bit_depth, downsample_level, motion_models, + num_motion_models, mem_alloc_failed); default: assert(0 && "Unknown global motion estimation type"); } return false; diff -Nru aom-3.8.2/aom_dsp/flow_estimation/flow_estimation.h aom-3.9.0/aom_dsp/flow_estimation/flow_estimation.h --- aom-3.8.2/aom_dsp/flow_estimation/flow_estimation.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/flow_estimation.h 2024-05-07 19:57:02.511000000 +0000 @@ -61,11 +61,6 @@ double rx, ry; } Correspondence; -// For each global motion method, how many pyramid levels should we allocate? -// Note that this is a maximum, and fewer levels will be allocated if the frame -// is not large enough to need all of the specified levels -extern const int global_motion_pyr_levels[GLOBAL_MOTION_METHODS]; - // Which global motion method should we use in practice? // Disflow is both faster and gives better results than feature matching in // practically all cases, so we use disflow by default @@ -85,7 +80,7 @@ bool aom_compute_global_motion(TransformationType type, YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *ref, int bit_depth, GlobalMotionMethod gm_method, - MotionModel *motion_models, + int downsample_level, MotionModel *motion_models, int num_motion_models, bool *mem_alloc_failed); #ifdef __cplusplus diff -Nru aom-3.8.2/aom_dsp/flow_estimation/ransac.c aom-3.9.0/aom_dsp/flow_estimation/ransac.c --- aom-3.8.2/aom_dsp/flow_estimation/ransac.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/ransac.c 2024-05-07 19:57:02.511000000 +0000 @@ -29,8 +29,13 @@ #define INLIER_THRESHOLD 1.25 #define INLIER_THRESHOLD_SQUARED (INLIER_THRESHOLD * INLIER_THRESHOLD) + +// Number of initial models to generate #define NUM_TRIALS 20 +// Number of times to refine the best model found +#define NUM_REFINES 5 + // Flag to enable functions for finding TRANSLATION type models. // // These modes are not considered currently due to a spec bug (see comments @@ -39,63 +44,110 @@ // but disabled, for completeness. 
#define ALLOW_TRANSLATION_MODELS 0 +typedef struct { + int num_inliers; + double sse; // Sum of squared errors of inliers + int *inlier_indices; +} RANSAC_MOTION; + //////////////////////////////////////////////////////////////////////////////// // ransac -typedef bool (*IsDegenerateFunc)(double *p); -typedef bool (*FindTransformationFunc)(int points, const double *points1, - const double *points2, double *params); -typedef void (*ProjectPointsFunc)(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj); +typedef bool (*FindTransformationFunc)(const Correspondence *points, + const int *indices, int num_indices, + double *params); +typedef void (*ScoreModelFunc)(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model); // vtable-like structure which stores all of the information needed by RANSAC // for a particular model type typedef struct { - IsDegenerateFunc is_degenerate; FindTransformationFunc find_transformation; - ProjectPointsFunc project_points; + ScoreModelFunc score_model; + + // The minimum number of points which can be passed to find_transformation + // to generate a model. + // + // This should be set as small as possible. This is due to an observation + // from section 4 of "Optimal Ransac" by A. Hast, J. Nysjö and + // A. Marchetti (https://dspace5.zcu.cz/bitstream/11025/6869/1/Hast.pdf): + // using the minimum possible number of points in the initial model maximizes + // the chance that all of the selected points are inliers. + // + // That paper proposes a method which can deal with models which are + // contaminated by outliers, which helps in cases where the inlier fraction + // is low. However, for our purposes, global motion only gives significant + // gains when the inlier fraction is high. + // + // So we do not use the method from this paper, but we do find that + // minimizing the number of points used for initial model fitting helps + // make the best use of the limited number of models we consider. 
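To put a number on the observation above: if a fraction p of the correspondences are inliers, a random k-point sample is uncontaminated with probability roughly p^k (ignoring sampling without replacement). With p = 0.7, a 2-point ROTZOOM sample is all-inlier with probability 0.7^2 = 0.49, versus 0.7^3 ≈ 0.34 for a 3-point sample, so across NUM_TRIALS = 20 trials the smaller sample size raises the expected number of clean initial models from about 6.9 to about 9.8.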
int minpts; } RansacModelInfo; #if ALLOW_TRANSLATION_MODELS -static void project_points_translation(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = x + mat[0]; - *(proj++) = y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_translation(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = x1 + mat[0]; + const double proj_y = y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #endif // ALLOW_TRANSLATION_MODELS -static void project_points_affine(const double *mat, const double *points, - double *proj, int n, int stride_points, - int stride_proj) { - int i; - for (i = 0; i < n; ++i) { - const double x = *(points++), y = *(points++); - *(proj++) = mat[2] * x + mat[3] * y + mat[0]; - *(proj++) = mat[4] * x + mat[5] * y + mat[1]; - points += stride_points - 2; - proj += stride_proj - 2; +static void score_affine(const double *mat, const Correspondence *points, + int num_points, RANSAC_MOTION *model) { + model->num_inliers = 0; + model->sse = 0.0; + + for (int i = 0; i < num_points; ++i) { + const double x1 = points[i].x; + const double y1 = points[i].y; + const double x2 = points[i].rx; + const double y2 = points[i].ry; + + const double proj_x = mat[2] * x1 + mat[3] * y1 + mat[0]; + const double proj_y = mat[4] * x1 + mat[5] * y1 + mat[1]; + + const double dx = proj_x - x2; + const double dy = proj_y - y2; + const double sse = dx * dx + dy * dy; + + if (sse < INLIER_THRESHOLD_SQUARED) { + model->inlier_indices[model->num_inliers++] = i; + model->sse += sse; + } } } #if ALLOW_TRANSLATION_MODELS -static bool find_translation(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_translation(const Correspondence *points, const int *indices, + int num_indices, double *params) { double sumx = 0; double sumy = 0; - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; sumx += dx - sx; sumy += dy - sy; @@ -111,8 +163,8 @@ } #endif // ALLOW_TRANSLATION_MODELS -static bool find_rotzoom(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_rotzoom(const Correspondence *points, const int *indices, + int num_indices, double *params) { const int n = 4; // Size of least-squares problem double mat[4 * 4]; // Accumulator for A'A double y[4]; // Accumulator for A'b @@ -120,11 +172,12 @@ double b; // Single element of b least_squares_init(mat, y, n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy 
= points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0] = 1; a[1] = 0; @@ -153,8 +206,8 @@ return true; } -static bool find_affine(int np, const double *pts1, const double *pts2, - double *params) { +static bool find_affine(const Correspondence *points, const int *indices, + int num_indices, double *params) { // Note: The least squares problem for affine models is 6-dimensional, // but it splits into two independent 3-dimensional subproblems. // Solving these two subproblems separately and recombining at the end @@ -174,11 +227,12 @@ least_squares_init(mat[0], y[0], n); least_squares_init(mat[1], y[1], n); - for (int i = 0; i < np; ++i) { - double dx = *(pts2++); - double dy = *(pts2++); - double sx = *(pts1++); - double sy = *(pts1++); + for (int i = 0; i < num_indices; ++i) { + int index = indices[i]; + const double sx = points[index].x; + const double sy = points[index].y; + const double dx = points[index].rx; + const double dy = points[index].ry; a[0][0] = 1; a[0][1] = sx; @@ -211,12 +265,6 @@ return true; } -typedef struct { - int num_inliers; - double sse; // Sum of squared errors of inliers - int *inlier_indices; -} RANSAC_MOTION; - // Return -1 if 'a' is a better motion, 1 if 'b' is better, 0 otherwise. static int compare_motions(const void *arg_a, const void *arg_b) { const RANSAC_MOTION *motion_a = (RANSAC_MOTION *)arg_a; @@ -234,15 +282,6 @@ return compare_motions(motion_a, motion_b) < 0; } -static void copy_points_at_indices(double *dest, const double *src, - const int *indices, int num_points) { - for (int i = 0; i < num_points; ++i) { - const int index = indices[i]; - dest[i * 2] = src[index * 2]; - dest[i * 2 + 1] = src[index * 2 + 1]; - } -} - // Returns true on success, false on error static bool ransac_internal(const Correspondence *matched_points, int npoints, MotionModel *motion_models, int num_desired_motions, @@ -257,10 +296,6 @@ int indices[MAX_MINPTS] = { 0 }; - double *points1, *points2; - double *corners1, *corners2; - double *projected_corners; - // Store information for the num_desired_motions best transformations found // and the worst motion among them, as well as the motion currently under // consideration. @@ -271,18 +306,19 @@ // currently under consideration. 
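The ranking used for the kept models (compare_motions / is_better_motion above) is not fully visible in this hunk. Consistent with the surrounding code, the ordering prefers a larger inlier count and breaks ties by a lower inlier SSE; the sketch below shows that ordering, and should be treated as an approximation rather than the exact upstream body.

// Sketch of the model ordering: more inliers wins, ties go to lower SSE.
static int compare_motions_sketch(const RANSAC_MOTION *a,
                                  const RANSAC_MOTION *b) {
  if (a->num_inliers != b->num_inliers)
    return (a->num_inliers > b->num_inliers) ? -1 : 1;
  if (a->sse != b->sse) return (a->sse < b->sse) ? -1 : 1;
  return 0;
}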
double params_this_motion[MAX_PARAMDIM]; + // Initialize output models, as a fallback in case we can't find a model + for (i = 0; i < num_desired_motions; i++) { + memcpy(motion_models[i].params, kIdentityParams, + MAX_PARAMDIM * sizeof(*(motion_models[i].params))); + motion_models[i].num_inliers = 0; + } + if (npoints < minpts * MINPTS_MULTIPLIER || npoints == 0) { return false; } int min_inliers = AOMMAX((int)(MIN_INLIER_PROB * npoints), minpts); - points1 = (double *)aom_malloc(sizeof(*points1) * npoints * 2); - points2 = (double *)aom_malloc(sizeof(*points2) * npoints * 2); - corners1 = (double *)aom_malloc(sizeof(*corners1) * npoints * 2); - corners2 = (double *)aom_malloc(sizeof(*corners2) * npoints * 2); - projected_corners = - (double *)aom_malloc(sizeof(*projected_corners) * npoints * 2); motions = (RANSAC_MOTION *)aom_calloc(num_desired_motions, sizeof(RANSAC_MOTION)); @@ -295,8 +331,7 @@ int *inlier_buffer = (int *)aom_malloc(sizeof(*inlier_buffer) * npoints * (num_desired_motions + 1)); - if (!(points1 && points2 && corners1 && corners2 && projected_corners && - motions && inlier_buffer)) { + if (!(motions && inlier_buffer)) { ret_val = false; *mem_alloc_failed = true; goto finish_ransac; @@ -311,50 +346,22 @@ memset(¤t_motion, 0, sizeof(current_motion)); current_motion.inlier_indices = inlier_buffer + num_desired_motions * npoints; - for (i = 0; i < npoints; ++i) { - corners1[2 * i + 0] = matched_points[i].x; - corners1[2 * i + 1] = matched_points[i].y; - corners2[2 * i + 0] = matched_points[i].rx; - corners2[2 * i + 1] = matched_points[i].ry; - } - for (int trial_count = 0; trial_count < NUM_TRIALS; trial_count++) { lcg_pick(npoints, minpts, indices, &seed); - copy_points_at_indices(points1, corners1, indices, minpts); - copy_points_at_indices(points2, corners2, indices, minpts); - - if (model_info->is_degenerate(points1)) { - continue; - } - - if (!model_info->find_transformation(minpts, points1, points2, + if (!model_info->find_transformation(matched_points, indices, minpts, params_this_motion)) { continue; } - model_info->project_points(params_this_motion, corners1, projected_corners, - npoints, 2, 2); - - current_motion.num_inliers = 0; - double sse = 0.0; - for (i = 0; i < npoints; ++i) { - double dx = projected_corners[i * 2] - corners2[i * 2]; - double dy = projected_corners[i * 2 + 1] - corners2[i * 2 + 1]; - double squared_error = dx * dx + dy * dy; - - if (squared_error < INLIER_THRESHOLD_SQUARED) { - current_motion.inlier_indices[current_motion.num_inliers++] = i; - sse += squared_error; - } - } + model_info->score_model(params_this_motion, matched_points, npoints, + ¤t_motion); if (current_motion.num_inliers < min_inliers) { // Reject models with too few inliers continue; } - current_motion.sse = sse; if (is_better_motion(¤t_motion, worst_kept_motion)) { // This motion is better than the worst currently kept motion. Remember // the inlier points and sse. The parameters for each kept motion @@ -386,86 +393,98 @@ // Sort the motions, best first. qsort(motions, num_desired_motions, sizeof(RANSAC_MOTION), compare_motions); - // Recompute the motions using only the inliers. + // Refine each of the best N models using iterative estimation. + // + // The idea here is loosely based on the iterative method from + // "Locally Optimized RANSAC" by O. Chum, J. 
Matas and Josef Kittler: + // https://cmp.felk.cvut.cz/ftp/articles/matas/chum-dagm03.pdf + // + // However, we implement a simpler version than their proposal, and simply + // refit the model repeatedly until the number of inliers stops increasing, + // with a cap on the number of iterations to defend against edge cases which + // only improve very slowly. for (i = 0; i < num_desired_motions; ++i) { - int num_inliers = motions[i].num_inliers; - if (num_inliers > 0) { - assert(num_inliers >= minpts); - - copy_points_at_indices(points1, corners1, motions[i].inlier_indices, - num_inliers); - copy_points_at_indices(points2, corners2, motions[i].inlier_indices, - num_inliers); - - if (!model_info->find_transformation(num_inliers, points1, points2, - motion_models[i].params)) { - // In the unlikely event that this model fitting fails, - // we don't have a good fallback. So just clear the output - // model and move on - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; - continue; + if (motions[i].num_inliers <= 0) { + // Output model has already been initialized to the identity model, + // so just skip setup + continue; + } + + bool bad_model = false; + for (int refine_count = 0; refine_count < NUM_REFINES; refine_count++) { + int num_inliers = motions[i].num_inliers; + assert(num_inliers >= min_inliers); + + if (!model_info->find_transformation(matched_points, + motions[i].inlier_indices, + num_inliers, params_this_motion)) { + // In the unlikely event that this model fitting fails, we don't have a + // good fallback. So leave this model set to the identity model + bad_model = true; + break; } - // Populate inliers array - for (int j = 0; j < num_inliers; j++) { - int index = motions[i].inlier_indices[j]; - const Correspondence *corr = &matched_points[index]; - motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); - motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + // Score the newly generated model + model_info->score_model(params_this_motion, matched_points, npoints, + ¤t_motion); + + // At this point, there are three possibilities: + // 1) If we found more inliers, keep refining. + // 2) If we found the same number of inliers but a lower SSE, we want to + // keep the new model, but further refinement is unlikely to gain much. + // So commit to this new model + // 3) It is possible, but very unlikely, that the new model will have + // fewer inliers. If it does happen, we probably just lost a few + // borderline inliers. So treat the same as case (2). + if (current_motion.num_inliers > motions[i].num_inliers) { + motions[i].num_inliers = current_motion.num_inliers; + motions[i].sse = current_motion.sse; + int *tmp = motions[i].inlier_indices; + motions[i].inlier_indices = current_motion.inlier_indices; + current_motion.inlier_indices = tmp; + } else { + // Refined model is no better, so stop + // This shouldn't be significantly worse than the previous model, + // so it's fine to use the parameters in params_this_motion. + // This saves us from having to cache the previous iteration's params. 
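Condensed, the refinement strategy described in the comments above looks like the sketch below. find_transformation and score_model stand for the RansacModelInfo callbacks; best, trial, points, npoints, params and swap_inlier_sets are illustrative names rather than identifiers from the patch, and the real loop in ransac_internal() additionally handles the identity-model fallback per output slot.

// Outline of the per-model refinement: refit on the current inlier set,
// re-score, and keep iterating only while the inlier count strictly grows.
for (int refine_count = 0; refine_count < NUM_REFINES; refine_count++) {
  if (!find_transformation(points, best->inlier_indices, best->num_inliers,
                           params)) {
    break;  // fitting failed; leave this output as the identity model
  }
  score_model(params, points, npoints, &trial);  // recount inliers and SSE
  if (trial.num_inliers > best->num_inliers) {
    swap_inlier_sets(best, &trial);  // adopt the larger inlier set, refit again
  } else {
    break;  // same or fewer inliers: keep `params` and stop refining
  }
}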
+ break; } - motion_models[i].num_inliers = num_inliers; - } else { - memcpy(motion_models[i].params, kIdentityParams, - MAX_PARAMDIM * sizeof(*(motion_models[i].params))); - motion_models[i].num_inliers = 0; } + + if (bad_model) continue; + + // Fill in output struct + memcpy(motion_models[i].params, params_this_motion, + MAX_PARAMDIM * sizeof(*motion_models[i].params)); + for (int j = 0; j < motions[i].num_inliers; j++) { + int index = motions[i].inlier_indices[j]; + const Correspondence *corr = &matched_points[index]; + motion_models[i].inliers[2 * j + 0] = (int)rint(corr->x); + motion_models[i].inliers[2 * j + 1] = (int)rint(corr->y); + } + motion_models[i].num_inliers = motions[i].num_inliers; } finish_ransac: aom_free(inlier_buffer); aom_free(motions); - aom_free(projected_corners); - aom_free(corners2); - aom_free(corners1); - aom_free(points2); - aom_free(points1); return ret_val; } -static bool is_collinear3(double *p1, double *p2, double *p3) { - static const double collinear_eps = 1e-3; - const double v = - (p2[0] - p1[0]) * (p3[1] - p1[1]) - (p2[1] - p1[1]) * (p3[0] - p1[0]); - return fabs(v) < collinear_eps; -} - -#if ALLOW_TRANSLATION_MODELS -static bool is_degenerate_translation(double *p) { - return (p[0] - p[2]) * (p[0] - p[2]) + (p[1] - p[3]) * (p[1] - p[3]) <= 2; -} -#endif // ALLOW_TRANSLATION_MODELS - -static bool is_degenerate_affine(double *p) { - return is_collinear3(p, p + 2, p + 4); -} - static const RansacModelInfo ransac_model_info[TRANS_TYPES] = { // IDENTITY - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, // TRANSLATION #if ALLOW_TRANSLATION_MODELS - { is_degenerate_translation, find_translation, project_points_translation, - 3 }, + { find_translation, score_translation, 1 }, #else - { NULL, NULL, NULL, 0 }, + { NULL, NULL, 0 }, #endif // ROTZOOM - { is_degenerate_affine, find_rotzoom, project_points_affine, 3 }, + { find_rotzoom, score_affine, 2 }, // AFFINE - { is_degenerate_affine, find_affine, project_points_affine, 3 }, + { find_affine, score_affine, 3 }, }; // Returns true on success, false on error diff -Nru aom-3.8.2/aom_dsp/flow_estimation/x86/corner_match_avx2.c aom-3.9.0/aom_dsp/flow_estimation/x86/corner_match_avx2.c --- aom-3.8.2/aom_dsp/flow_estimation/x86/corner_match_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/x86/corner_match_avx2.c 2024-05-07 19:57:02.512000000 +0000 @@ -17,64 +17,112 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(32, static const uint16_t, ones_array[16]) = { 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_avx2.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the -correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows -of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). 
+ Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * and + *one_over_stddev = 1 / (MATCH_SZ * ) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. */ -double av1_compute_cross_correlation_avx2(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i, stride1_i = 0, stride2_i = 0; - __m256i temp1, sum_vec, sumsq2_vec, cross_vec, v, v1_1, v2_1; - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m256i zero = _mm256_setzero_si256(); - __m128i v1, v2; - - sum_vec = zero; - sumsq2_vec = zero; - cross_vec = zero; +bool aom_compute_mean_stddev_avx2(const unsigned char *frame, int stride, int x, + int y, double *mean, + double *one_over_stddev) { + __m256i sum_vec = _mm256_setzero_si256(); + __m256i sumsq_vec = _mm256_setzero_si256(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame)); + + sum_vec = _mm256_add_epi16(sum_vec, v); + sumsq_vec = _mm256_add_epi32(sumsq_vec, _mm256_madd_epi16(v, v)); + + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 8x32-bit values, hadd() to perform 8 + // additions, sum vertically to do 4 more, then the last 2 in scalar code. + const __m256i ones = _mm256_load_si256((__m256i *)ones_array); + const __m256i partial_sum = _mm256_madd_epi16(sum_vec, ones); + const __m256i tmp_8x32 = _mm256_hadd_epi32(partial_sum, sumsq_vec); + const __m128i tmp_4x32 = _mm_add_epi32(_mm256_extracti128_si256(tmp_8x32, 0), + _mm256_extracti128_si256(tmp_8x32, 1)); + const int sum = + _mm_extract_epi32(tmp_4x32, 0) + _mm_extract_epi32(tmp_4x32, 1); + const int sumsq = + _mm_extract_epi32(tmp_4x32, 2) + _mm_extract_epi32(tmp_4x32, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
+*/ +double aom_compute_correlation_avx2(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + __m256i cross_vec = _mm256_setzero_si256(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - v1 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[stride1_i]), mask); - v1_1 = _mm256_cvtepu8_epi16(v1); - v2 = _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[stride2_i]), mask); - v2_1 = _mm256_cvtepu8_epi16(v2); - - v = _mm256_insertf128_si256(_mm256_castsi128_si256(v1), v2, 1); - sumsq2_vec = _mm256_add_epi32(sumsq2_vec, _mm256_madd_epi16(v2_1, v2_1)); - - sum_vec = _mm256_add_epi16(sum_vec, _mm256_sad_epu8(v, zero)); - cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1_1, v2_1)); - stride1_i += stride1; - stride2_i += stride2; + for (int i = 0; i < MATCH_SZ; ++i) { + const __m256i v1 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame1)); + const __m256i v2 = _mm256_cvtepu8_epi16(_mm_loadu_si128((__m128i *)frame2)); + + cross_vec = _mm256_add_epi32(cross_vec, _mm256_madd_epi16(v1, v2)); + + frame1 += stride1; + frame2 += stride2; } - __m256i sum_vec1 = _mm256_srli_si256(sum_vec, 8); - sum_vec = _mm256_add_epi32(sum_vec, sum_vec1); - int sum1_acc = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_vec)); - int sum2_acc = _mm256_extract_epi32(sum_vec, 4); - - __m256i unp_low = _mm256_unpacklo_epi64(sumsq2_vec, cross_vec); - __m256i unp_hig = _mm256_unpackhi_epi64(sumsq2_vec, cross_vec); - temp1 = _mm256_add_epi32(unp_low, unp_hig); - - __m128i low_sumsq = _mm256_castsi256_si128(temp1); - low_sumsq = _mm_add_epi32(low_sumsq, _mm256_extractf128_si256(temp1, 1)); - low_sumsq = _mm_add_epi32(low_sumsq, _mm_srli_epi64(low_sumsq, 32)); - int sumsq2_acc = _mm_cvtsi128_si32(low_sumsq); - int cross_acc = _mm_extract_epi32(low_sumsq, 2); - - int var2 = sumsq2_acc * MATCH_SZ_SQ - sum2_acc * sum2_acc; - int cov = cross_acc * MATCH_SZ_SQ - sum1_acc * sum2_acc; - return cov / sqrt((double)var2); + + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(_mm256_extracti128_si256(cross_vec, 0), + _mm256_extracti128_si256(cross_vec, 1)); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } diff -Nru aom-3.8.2/aom_dsp/flow_estimation/x86/corner_match_sse4.c aom-3.9.0/aom_dsp/flow_estimation/x86/corner_match_sse4.c --- aom-3.8.2/aom_dsp/flow_estimation/x86/corner_match_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/x86/corner_match_sse4.c 2024-05-07 19:57:02.513000000 +0000 @@ -21,84 +21,125 @@ #include "aom_ports/mem.h" #include "aom_dsp/flow_estimation/corner_match.h" -DECLARE_ALIGNED(16, static const uint8_t, - byte_mask[16]) = { 255, 255, 255, 255, 255, 255, 255, 255, - 255, 255, 255, 255, 255, 0, 0, 0 }; -#if MATCH_SZ != 13 -#error "Need to change byte_mask in corner_match_sse4.c if MATCH_SZ != 13" +DECLARE_ALIGNED(16, static const uint16_t, ones_array[8]) = { 1, 1, 1, 1, + 1, 1, 1, 1 }; + +#if MATCH_SZ != 16 +#error "Need to apply pixel mask in corner_match_sse4.c if MATCH_SZ != 16" #endif -/* Compute corr(frame1, frame2) * MATCH_SZ * stddev(frame1), where the - correlation/standard deviation are taken over MATCH_SZ by MATCH_SZ windows - of each image, centered at (x1, y1) and (x2, y2) respectively. +/* Compute mean and standard deviation of pixels in a window of size + MATCH_SZ by MATCH_SZ centered at (x, y). + Store results into *mean and *one_over_stddev + + Note: The output of this function is scaled by MATCH_SZ, as in + *mean = MATCH_SZ * and + *one_over_stddev = 1 / (MATCH_SZ * ) + + Combined with the fact that we return 1/stddev rather than the standard + deviation itself, this allows us to completely avoid divisions in + aom_compute_correlation, which is much hotter than this function is. + + Returns true if this feature point is usable, false otherwise. */ -double av1_compute_cross_correlation_sse4_1(const unsigned char *frame1, - int stride1, int x1, int y1, - const unsigned char *frame2, - int stride2, int x2, int y2) { - int i; - // 2 16-bit partial sums in lanes 0, 4 (== 2 32-bit partial sums in lanes 0, - // 2) - __m128i sum1_vec = _mm_setzero_si128(); - __m128i sum2_vec = _mm_setzero_si128(); - // 4 32-bit partial sums of squares - __m128i sumsq2_vec = _mm_setzero_si128(); - __m128i cross_vec = _mm_setzero_si128(); +bool aom_compute_mean_stddev_sse4_1(const unsigned char *frame, int stride, + int x, int y, double *mean, + double *one_over_stddev) { + // 8 16-bit partial sums of pixels + // Each lane sums at most 2*MATCH_SZ pixels, which can have values up to 255, + // and is therefore at most 2*MATCH_SZ*255, which is > 2^8 but < 2^16. + // Thus this value is safe to store in 16 bits. 
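The scaling trick described in the comments above is easier to follow in scalar form. The sketch below (a hypothetical helper, assuming <math.h>) shows the arithmetic that both the SSE4.1 and AVX2 paths implement, given the raw window sums: sum and sumsq over each MATCH_SZ x MATCH_SZ window, and cross = sum of p1 * p2 over the pair of windows.

// Scalar sketch of the division-free correlation.
static double correlation_sketch(int sum1, int sumsq1, int sum2, int sumsq2,
                                 int cross) {
  const double mean1 = (double)sum1 / MATCH_SZ;  // = MATCH_SZ * true mean
  const double mean2 = (double)sum2 / MATCH_SZ;
  const double var1 = sumsq1 - mean1 * mean1;    // = MATCH_SZ^2 * true variance
  const double var2 = sumsq2 - mean2 * mean2;
  if (var1 < MIN_FEATURE_VARIANCE || var2 < MIN_FEATURE_VARIANCE) return 0.0;
  const double inv_sd1 = 1.0 / sqrt(var1);       // = 1 / (MATCH_SZ * stddev)
  const double inv_sd2 = 1.0 / sqrt(var2);
  // cross - mean1 * mean2 equals MATCH_SZ^2 * covariance, and that factor
  // cancels against the two scaled 1/stddev terms, so no further division is
  // needed and the result stays in [-1, +1].
  return (cross - mean1 * mean2) * inv_sd1 * inv_sd2;
}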
+ __m128i sum_vec = _mm_setzero_si128(); + + // 8 32-bit partial sums of squares + __m128i sumsq_vec_l = _mm_setzero_si128(); + __m128i sumsq_vec_r = _mm_setzero_si128(); + + frame += (y - MATCH_SZ_BY2) * stride + (x - MATCH_SZ_BY2); + + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v = _mm_loadu_si128((__m128i *)frame); + const __m128i v_l = _mm_cvtepu8_epi16(v); + const __m128i v_r = _mm_cvtepu8_epi16(_mm_srli_si128(v, 8)); + + sum_vec = _mm_add_epi16(sum_vec, _mm_add_epi16(v_l, v_r)); + sumsq_vec_l = _mm_add_epi32(sumsq_vec_l, _mm_madd_epi16(v_l, v_l)); + sumsq_vec_r = _mm_add_epi32(sumsq_vec_r, _mm_madd_epi16(v_r, v_r)); - const __m128i mask = _mm_load_si128((__m128i *)byte_mask); - const __m128i zero = _mm_setzero_si128(); + frame += stride; + } + + // Reduce sum_vec and sumsq_vec into single values + // Start by reducing each vector to 4x32-bit values, hadd() to perform four + // additions, then perform the last two additions in scalar code. + const __m128i ones = _mm_load_si128((__m128i *)ones_array); + const __m128i partial_sum = _mm_madd_epi16(sum_vec, ones); + const __m128i partial_sumsq = _mm_add_epi32(sumsq_vec_l, sumsq_vec_r); + const __m128i tmp = _mm_hadd_epi32(partial_sum, partial_sumsq); + const int sum = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1); + const int sumsq = _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + *mean = (double)sum / MATCH_SZ; + const double variance = sumsq - (*mean) * (*mean); + if (variance < MIN_FEATURE_VARIANCE) { + *one_over_stddev = 0.0; + return false; + } + *one_over_stddev = 1.0 / sqrt(variance); + return true; +} + +/* Compute corr(frame1, frame2) over a window of size MATCH_SZ by MATCH_SZ. + To save on computation, the mean and (1 divided by the) standard deviation + of the window in each frame are precomputed and passed into this function + as arguments. 
+*/ +double aom_compute_correlation_sse4_1(const unsigned char *frame1, int stride1, + int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2) { + // 8 32-bit partial sums of products + __m128i cross_vec_l = _mm_setzero_si128(); + __m128i cross_vec_r = _mm_setzero_si128(); frame1 += (y1 - MATCH_SZ_BY2) * stride1 + (x1 - MATCH_SZ_BY2); frame2 += (y2 - MATCH_SZ_BY2) * stride2 + (x2 - MATCH_SZ_BY2); - for (i = 0; i < MATCH_SZ; ++i) { - const __m128i v1 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame1[i * stride1]), mask); - const __m128i v2 = - _mm_and_si128(_mm_loadu_si128((__m128i *)&frame2[i * stride2]), mask); - - // Using the 'sad' intrinsic here is a bit faster than adding - // v1_l + v1_r and v2_l + v2_r, plus it avoids the need for a 16->32 bit - // conversion step later, for a net speedup of ~10% - sum1_vec = _mm_add_epi16(sum1_vec, _mm_sad_epu8(v1, zero)); - sum2_vec = _mm_add_epi16(sum2_vec, _mm_sad_epu8(v2, zero)); + for (int i = 0; i < MATCH_SZ; ++i) { + const __m128i v1 = _mm_loadu_si128((__m128i *)frame1); + const __m128i v2 = _mm_loadu_si128((__m128i *)frame2); const __m128i v1_l = _mm_cvtepu8_epi16(v1); const __m128i v1_r = _mm_cvtepu8_epi16(_mm_srli_si128(v1, 8)); const __m128i v2_l = _mm_cvtepu8_epi16(v2); const __m128i v2_r = _mm_cvtepu8_epi16(_mm_srli_si128(v2, 8)); - sumsq2_vec = _mm_add_epi32( - sumsq2_vec, - _mm_add_epi32(_mm_madd_epi16(v2_l, v2_l), _mm_madd_epi16(v2_r, v2_r))); - cross_vec = _mm_add_epi32( - cross_vec, - _mm_add_epi32(_mm_madd_epi16(v1_l, v2_l), _mm_madd_epi16(v1_r, v2_r))); + cross_vec_l = _mm_add_epi32(cross_vec_l, _mm_madd_epi16(v1_l, v2_l)); + cross_vec_r = _mm_add_epi32(cross_vec_r, _mm_madd_epi16(v1_r, v2_r)); + + frame1 += stride1; + frame2 += stride2; } - // Now we can treat the four registers (sum1_vec, sum2_vec, sumsq2_vec, - // cross_vec) - // as holding 4 32-bit elements each, which we want to sum horizontally. - // We do this by transposing and then summing vertically. - __m128i tmp_0 = _mm_unpacklo_epi32(sum1_vec, sum2_vec); - __m128i tmp_1 = _mm_unpackhi_epi32(sum1_vec, sum2_vec); - __m128i tmp_2 = _mm_unpacklo_epi32(sumsq2_vec, cross_vec); - __m128i tmp_3 = _mm_unpackhi_epi32(sumsq2_vec, cross_vec); - - __m128i tmp_4 = _mm_unpacklo_epi64(tmp_0, tmp_2); - __m128i tmp_5 = _mm_unpackhi_epi64(tmp_0, tmp_2); - __m128i tmp_6 = _mm_unpacklo_epi64(tmp_1, tmp_3); - __m128i tmp_7 = _mm_unpackhi_epi64(tmp_1, tmp_3); - - __m128i res = - _mm_add_epi32(_mm_add_epi32(tmp_4, tmp_5), _mm_add_epi32(tmp_6, tmp_7)); - - int sum1 = _mm_extract_epi32(res, 0); - int sum2 = _mm_extract_epi32(res, 1); - int sumsq2 = _mm_extract_epi32(res, 2); - int cross = _mm_extract_epi32(res, 3); - - int var2 = sumsq2 * MATCH_SZ_SQ - sum2 * sum2; - int cov = cross * MATCH_SZ_SQ - sum1 * sum2; - return cov / sqrt((double)var2); + // Sum cross_vec into a single value + const __m128i tmp = _mm_add_epi32(cross_vec_l, cross_vec_r); + const int cross = _mm_extract_epi32(tmp, 0) + _mm_extract_epi32(tmp, 1) + + _mm_extract_epi32(tmp, 2) + _mm_extract_epi32(tmp, 3); + + // Note: In theory, the calculations here "should" be + // covariance = cross / N^2 - mean1 * mean2 + // correlation = covariance / (stddev1 * stddev2). + // + // However, because of the scaling in aom_compute_mean_stddev, the + // lines below actually calculate + // covariance * N^2 = cross - (mean1 * N) * (mean2 * N) + // correlation = (covariance * N^2) / ((stddev1 * N) * (stddev2 * N)) + // + // ie. 
we have removed the need for a division, and still end up with the + // correct unscaled correlation (ie, in the range [-1, +1]) + const double covariance = cross - mean1 * mean2; + const double correlation = covariance * (one_over_stddev1 * one_over_stddev2); + return correlation; } diff -Nru aom-3.8.2/aom_dsp/flow_estimation/x86/disflow_avx2.c aom-3.9.0/aom_dsp/flow_estimation/x86/disflow_avx2.c --- aom-3.8.2/aom_dsp/flow_estimation/x86/disflow_avx2.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/x86/disflow_avx2.c 2024-05-07 19:57:02.513000000 +0000 @@ -0,0 +1,417 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/flow_estimation/disflow.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +#include "config/aom_dsp_rtcd.h" + +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_avx2.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. 
+ __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); +} + +// Compare two regions of width x height pixels, one rooted at position +// (x, y) in src and the other at (x + u, y + v) in ref. +// This function returns the sum of squared pixel differences between +// the two regions. +// +// TODO(rachelbarker): Test speed/quality impact of using bilinear interpolation +// instad of bicubic interpolation +static INLINE void compute_flow_vector(const uint8_t *src, const uint8_t *ref, + int width, int height, int stride, int x, + int y, double u, double v, + const int16_t *dx, const int16_t *dy, + int *b) { + const __m256i zero = _mm256_setzero_si256(); + + // Accumulate 8 32-bit partial sums for each element of b + // These will be flattened at the end. + __m256i b0_acc = _mm256_setzero_si256(); + __m256i b1_acc = _mm256_setzero_si256(); + + // Split offset into integer and fractional parts, and compute cubic + // interpolation kernels + const int u_int = (int)floor(u); + const int v_int = (int)floor(v); + const double u_frac = u - floor(u); + const double v_frac = v - floor(v); + + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); + + // Storage for intermediate values between the two convolution directions + // In the AVX2 implementation, this needs a dummy row at the end, because + // we generate 2 rows at a time but the total number of rows is odd. + // So we generate one more row than we actually need. + DECLARE_ALIGNED(32, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 4)]); + int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row + + // Clamp coordinates so that all pixels we fetch will remain within the + // allocated border region, but allow them to go far enough out that + // the border pixels' values do not change. + // Since we are calculating an 8x8 block, the bottom-right pixel + // in the block has coordinates (x0 + 7, y0 + 7). Then, the cubic + // interpolation has 4 taps, meaning that the output of pixel + // (x_w, y_w) depends on the pixels in the range + // ([x_w - 1, x_w + 2], [y_w - 1, y_w + 2]). + // + // Thus the most extreme coordinates which will be fetched are + // (x0 - 1, y0 - 1) and (x0 + 9, y0 + 9). 
+ const int x0 = clamp(x + u_int, -9, width); + const int y0 = clamp(y + v_int, -9, height); + + // Horizontal convolution + + // Prepare the kernel vectors + // We split the kernel into two vectors with kernel indices: + // 0, 1, 0, 1, 0, 1, 0, 1, and + // 2, 3, 2, 3, 2, 3, 2, 3 + __m256i h_kernel_01 = _mm256_broadcastd_epi32(kernels); + __m256i h_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 8)); + + __m256i round_const_h = _mm256_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); + + for (int i = -1; i < DISFLOW_PATCH_SIZE + 2; i += 2) { + const int y_w = y0 + i; + const uint8_t *ref_row = &ref[y_w * stride + (x0 - 1)]; + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load this row of pixels. + // For an 8x8 patch, we need to load the 8 image pixels + 3 extras, + // for a total of 11 pixels. Here we load 16 pixels, but only use + // the first 11. + __m256i row = + yy_loadu2_128((__m128i *)(ref_row + stride), (__m128i *)ref_row); + + // Expand pixels to int16s + // We must use unpacks here, as we have one row in each 128-bit lane + // and want to handle each of those independently. + // This is in contrast to _mm256_cvtepu8_epi16(), which takes a single + // 128-bit input and widens it to 256 bits. + __m256i px_0to7_i16 = _mm256_unpacklo_epi8(row, zero); + __m256i px_4to10_i16 = + _mm256_unpacklo_epi8(_mm256_srli_si256(row, 4), zero); + + // Compute first four outputs + // input pixels 0, 1, 1, 2, 2, 3, 3, 4 + // * kernel 0, 1, 0, 1, 0, 1, 0, 1 + __m256i px0 = + _mm256_unpacklo_epi16(px_0to7_i16, _mm256_srli_si256(px_0to7_i16, 2)); + // input pixels 2, 3, 3, 4, 4, 5, 5, 6 + // * kernel 2, 3, 2, 3, 2, 3, 2, 3 + __m256i px1 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_0to7_i16, 4), + _mm256_srli_si256(px_0to7_i16, 6)); + // Convolve with kernel and sum 2x2 boxes to form first 4 outputs + __m256i sum0 = _mm256_add_epi32(_mm256_madd_epi16(px0, h_kernel_01), + _mm256_madd_epi16(px1, h_kernel_23)); + + __m256i out0 = _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_h), + DISFLOW_INTERP_BITS - 6); + + // Compute second four outputs + __m256i px2 = + _mm256_unpacklo_epi16(px_4to10_i16, _mm256_srli_si256(px_4to10_i16, 2)); + __m256i px3 = _mm256_unpacklo_epi16(_mm256_srli_si256(px_4to10_i16, 4), + _mm256_srli_si256(px_4to10_i16, 6)); + __m256i sum1 = _mm256_add_epi32(_mm256_madd_epi16(px2, h_kernel_01), + _mm256_madd_epi16(px3, h_kernel_23)); + + // Round by just enough bits that the result is + // guaranteed to fit into an i16. Then the next stage can use 16 x 16 -> 32 + // bit multiplies, which should be a fair bit faster than 32 x 32 -> 32 + // as it does now + // This means shifting down so we have 6 extra bits, for a maximum value + // of +18360, which can occur if u_frac == 0.5 and the input pixels are + // {0, 255, 255, 0}. 
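The +18360 bound quoted above can be checked directly: at u_frac = 0.5 the cubic kernel is {-0.0625, 0.5625, 0.5625, -0.0625}, so the pixel pattern {0, 255, 255, 0} gives 255 * 1.125 = 286.875, and keeping 6 fractional bits scales this to 286.875 * 64 = 18360. A small self-check (hypothetical test code, not part of the library):

#include <assert.h>
#include <math.h>

// Verify the worst-case intermediate value quoted in the comment above.
int main(void) {
  const double x = 0.5, x2 = x * x, x3 = x2 * x;
  const double k[4] = { -0.5 * x + x2 - 0.5 * x3, 1.0 - 2.5 * x2 + 1.5 * x3,
                        0.5 * x + 2.0 * x2 - 1.5 * x3, -0.5 * x2 + 0.5 * x3 };
  const double px[4] = { 0, 255, 255, 0 };
  double acc = 0;
  for (int i = 0; i < 4; i++) acc += k[i] * px[i];
  // The horizontal output keeps 6 extra fractional bits, i.e. a 2^6 scale.
  assert(fabs(acc * 64 - 18360.0) < 1e-9);
  return 0;
}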
+ __m256i out1 = _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_h), + DISFLOW_INTERP_BITS - 6); + + _mm256_storeu_si256((__m256i *)tmp_row, _mm256_packs_epi32(out0, out1)); + } + + // Vertical convolution + const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; + __m256i round_const_v = _mm256_set1_epi32(1 << (round_bits - 1)); + + __m256i v_kernel_01 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 4)); + __m256i v_kernel_23 = _mm256_broadcastd_epi32(_mm_srli_si128(kernels, 12)); + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; + + // Load 5 rows of 8 x 16-bit values, and pack into 4 registers + // holding rows {0, 1}, {1, 2}, {2, 3}, {3, 4} + __m128i row0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); + __m128i row1 = _mm_loadu_si128((__m128i *)tmp_row); + __m128i row2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); + __m128i row3 = + _mm_loadu_si128((__m128i *)(tmp_row + 2 * DISFLOW_PATCH_SIZE)); + __m128i row4 = + _mm_loadu_si128((__m128i *)(tmp_row + 3 * DISFLOW_PATCH_SIZE)); + + __m256i px0 = _mm256_set_m128i(row1, row0); + __m256i px1 = _mm256_set_m128i(row2, row1); + __m256i px2 = _mm256_set_m128i(row3, row2); + __m256i px3 = _mm256_set_m128i(row4, row3); + + // We want to calculate px0 * v_kernel[0] + px1 * v_kernel[1] + ... , + // but each multiply expands its output to 32 bits. So we need to be + // a little clever about how we do this + __m256i sum0 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpacklo_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpacklo_epi16(px2, px3), v_kernel_23)); + __m256i sum1 = _mm256_add_epi32( + _mm256_madd_epi16(_mm256_unpackhi_epi16(px0, px1), v_kernel_01), + _mm256_madd_epi16(_mm256_unpackhi_epi16(px2, px3), v_kernel_23)); + + __m256i sum0_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum0, round_const_v), round_bits); + __m256i sum1_rounded = + _mm256_srai_epi32(_mm256_add_epi32(sum1, round_const_v), round_bits); + + __m256i warped = _mm256_packs_epi32(sum0_rounded, sum1_rounded); + __m128i src_pixels_u8 = xx_loadu_2x64(&src[(y + i + 1) * stride + x], + &src[(y + i) * stride + x]); + __m256i src_pixels = + _mm256_slli_epi16(_mm256_cvtepu8_epi16(src_pixels_u8), 3); + + // Calculate delta from the target patch + __m256i dt = _mm256_sub_epi16(warped, src_pixels); + + // Load 2x8 elements each of dx and dt, to pair with the 2x8 elements of dt + // that we have just computed. Then compute 2x8 partial sums of dx * dt + // and dy * dt, implicitly sum to give 2x4 partial sums of each, and + // accumulate. + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * DISFLOW_PATCH_SIZE]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * DISFLOW_PATCH_SIZE]); + b0_acc = _mm256_add_epi32(b0_acc, _mm256_madd_epi16(dx_row, dt)); + b1_acc = _mm256_add_epi32(b1_acc, _mm256_madd_epi16(dy_row, dt)); + } + + // Flatten the two sets of partial sums to find the final value of b + // We need to set b[0] = sum(b0_acc), b[1] = sum(b1_acc). + // We need to do 14 additions in total; a `hadd` instruction can take care + // of eight of them, then a vertical sum can do four more, leaving two + // scalar additions. 
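In scalar terms, the reduction described above produces the two entries of the right-hand-side vector b used by the flow refinement: the correlations of the patch gradients with the warp error. A minimal scalar equivalent is sketched below; the function name is hypothetical, and dt is shown as an array purely for clarity (the SIMD code never materializes it).

// Scalar sketch of the value the SIMD reduction computes:
//   b[0] = sum over the 8x8 patch of dx * dt
//   b[1] = sum over the 8x8 patch of dy * dt
static void compute_flow_vector_scalar(const int16_t *dx, const int16_t *dy,
                                       const int16_t *dt, int *b) {
  b[0] = 0;
  b[1] = 0;
  for (int k = 0; k < DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE; k++) {
    b[0] += dx[k] * dt[k];
    b[1] += dy[k] * dt[k];
  }
}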
+ __m256i partial_sum_256 = _mm256_hadd_epi32(b0_acc, b1_acc); + __m128i partial_sum = + _mm_add_epi32(_mm256_extracti128_si256(partial_sum_256, 0), + _mm256_extracti128_si256(partial_sum_256, 1)); + b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); + b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); +} + +// Compute the x and y gradients of the source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + const __m256i zero = _mm256_setzero_si256(); + + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m256i row_m1_0 = + yy_loadu2_128((__m128i *)(src - 1), (__m128i *)(src - src_stride - 1)); + __m256i row_m1_0_a = _mm256_unpacklo_epi8(row_m1_0, zero); + __m256i row_m1_0_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 1), zero); + __m256i row_m1_0_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_m1_0, 2), zero); + + __m256i row_m1_0_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_a, row_m1_0_c), + _mm256_slli_epi16(row_m1_0_b, 1)); + __m256i row_m1_0_hdiff = _mm256_sub_epi16(row_m1_0_a, row_m1_0_c); + + // Main loop: For each pair of output rows (i, i+1): + // * Load rows (i+1, i+2) and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + // Load rows (i+1, i+2) and apply both horizontal filters + const __m256i row_p1_p2 = + yy_loadu2_128((__m128i *)(src + (i + 2) * src_stride - 1), + (__m128i *)(src + (i + 1) * src_stride - 1)); + const __m256i row_p1_p2_a = _mm256_unpacklo_epi8(row_p1_p2, zero); + const __m256i row_p1_p2_b = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 1), zero); + const __m256i row_p1_p2_c = + _mm256_unpacklo_epi8(_mm256_srli_si256(row_p1_p2, 2), zero); + + const __m256i row_p1_p2_hsmooth = + _mm256_add_epi16(_mm256_add_epi16(row_p1_p2_a, row_p1_p2_c), + _mm256_slli_epi16(row_p1_p2_b, 1)); + const __m256i row_p1_p2_hdiff = _mm256_sub_epi16(row_p1_p2_a, row_p1_p2_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m256i row_0_p1_hdiff = + _mm256_permute2x128_si256(row_m1_0_hdiff, row_p1_p2_hdiff, 0x21); + const __m256i dx_row = + _mm256_add_epi16(_mm256_add_epi16(row_m1_0_hdiff, row_p1_p2_hdiff), + _mm256_slli_epi16(row_0_p1_hdiff, 1)); + const __m256i dy_row = + _mm256_sub_epi16(row_m1_0_hsmooth, row_p1_p2_hsmooth); + + _mm256_storeu_si256((__m256i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm256_storeu_si256((__m256i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_0_hsmooth = row_p1_p2_hsmooth; + row_m1_0_hdiff = row_p1_p2_hdiff; + } +} + +static INLINE void compute_flow_matrix(const int16_t *dx, int dx_stride, + const int16_t *dy, int dy_stride, + double *M) { + __m256i acc[4] = { 0 }; + + for (int i = 0; i < DISFLOW_PATCH_SIZE; i += 2) { + __m256i dx_row = _mm256_loadu_si256((__m256i *)&dx[i * dx_stride]); + __m256i dy_row = _mm256_loadu_si256((__m256i *)&dy[i * dy_stride]); + + acc[0] = _mm256_add_epi32(acc[0], _mm256_madd_epi16(dx_row, dx_row)); + acc[1] = _mm256_add_epi32(acc[1], _mm256_madd_epi16(dx_row, dy_row)); + 
// Don't compute acc[2], as it should be equal to acc[1] + acc[3] = _mm256_add_epi32(acc[3], _mm256_madd_epi16(dy_row, dy_row)); + } + + // Condense sums + __m256i partial_sum_0 = _mm256_hadd_epi32(acc[0], acc[1]); + __m256i partial_sum_1 = _mm256_hadd_epi32(acc[1], acc[3]); + __m256i result_256 = _mm256_hadd_epi32(partial_sum_0, partial_sum_1); + __m128i result = _mm_add_epi32(_mm256_extracti128_si256(result_256, 0), + _mm256_extracti128_si256(result_256, 1)); + + // Apply regularization + // We follow the standard regularization method of adding `k * I` before + // inverting. This ensures that the matrix will be invertible. + // + // Setting the regularization strength k to 1 seems to work well here, as + // typical values coming from the other equations are very large (1e5 to + // 1e6, with an upper limit of around 6e7, at the time of writing). + // It also preserves the property that all matrix values are whole numbers, + // which is convenient for integerized SIMD implementation. + result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); + + // Convert results to doubles and store + _mm256_storeu_pd(M, _mm256_cvtepi32_pd(result)); +} + +// Try to invert the matrix M +// Note: Due to the nature of how a least-squares matrix is constructed, all of +// the eigenvalues will be >= 0, and therefore det M >= 0 as well. +// The regularization term `+ k * I` further ensures that det M >= k^2. +// As mentioned in compute_flow_matrix(), here we use k = 1, so det M >= 1. +// So we don't have to worry about non-invertible matrices here. +static INLINE void invert_2x2(const double *M, double *M_inv) { + double det = (M[0] * M[3]) - (M[1] * M[2]); + assert(det >= 1); + const double det_inv = 1 / det; + + M_inv[0] = M[3] * det_inv; + M_inv[1] = -M[1] * det_inv; + M_inv[2] = -M[2] * det_inv; + M_inv[3] = M[0] * det_inv; +} + +void aom_compute_flow_at_point_avx2(const uint8_t *src, const uint8_t *ref, + int x, int y, int width, int height, + int stride, double *u, double *v) { + DECLARE_ALIGNED(32, double, M[4]); + DECLARE_ALIGNED(32, double, M_inv[4]); + DECLARE_ALIGNED(32, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(32, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + int b[2]; + + // Compute gradients within this patch + const uint8_t *src_patch = &src[y * stride + x]; + sobel_filter(src_patch, stride, dx, dy); + + compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); + invert_2x2(M, M_inv); + + for (int itr = 0; itr < DISFLOW_MAX_ITR; itr++) { + compute_flow_vector(src, ref, width, height, stride, x, y, *u, *v, dx, dy, + b); + + // Solve flow equations to find a better estimate for the flow vector + // at this point + const double step_u = M_inv[0] * b[0] + M_inv[1] * b[1]; + const double step_v = M_inv[2] * b[0] + M_inv[3] * b[1]; + *u += fclamp(step_u * DISFLOW_STEP_SIZE, -2, 2); + *v += fclamp(step_v * DISFLOW_STEP_SIZE, -2, 2); + + if (fabs(step_u) + fabs(step_v) < DISFLOW_STEP_SIZE_THRESOLD) { + // Stop iteration when we're close to convergence + break; + } + } +} diff -Nru aom-3.8.2/aom_dsp/flow_estimation/x86/disflow_sse4.c aom-3.9.0/aom_dsp/flow_estimation/x86/disflow_sse4.c --- aom-3.8.2/aom_dsp/flow_estimation/x86/disflow_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/flow_estimation/x86/disflow_sse4.c 2024-05-07 19:57:02.514000000 +0000 @@ -1,13 +1,12 @@ /* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved + * Copyright (c) 2024, Alliance for Open Media. 
All rights reserved * - * This source code is subject to the terms of the BSD 3-Clause Clear License - * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear - * License was not distributed with this source code in the LICENSE file, you - * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the - * Alliance for Open Media Patent License 1.0 was not distributed with this - * source code in the PATENTS file, you can obtain it at - * aomedia.org/license/patent-license/. + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ #include @@ -20,46 +19,59 @@ #include "config/aom_dsp_rtcd.h" -// Internal cross-check against C code -// If you set this to 1 and compile in debug mode, then the outputs of the two -// convolution stages will be checked against the plain C version of the code, -// and an assertion will be fired if the results differ. -#define CHECK_RESULTS 0 - -// Note: Max sum(+ve coefficients) = 1.125 * scale -static INLINE void get_cubic_kernel_dbl(double x, double *kernel) { - // Check that the fractional position is in range. - // - // Note: x is calculated from (eg.) `u_frac = u - floor(u)`. - // Mathematically, this implies that 0 <= x < 1. However, in practice it is - // possible to have x == 1 due to floating point rounding. This is fine, - // and we still interpolate correctly if we allow x = 1. - assert(0 <= x && x <= 1); - - double x2 = x * x; - double x3 = x2 * x; - kernel[0] = -0.5 * x + x2 - 0.5 * x3; - kernel[1] = 1.0 - 2.5 * x2 + 1.5 * x3; - kernel[2] = 0.5 * x + 2.0 * x2 - 1.5 * x3; - kernel[3] = -0.5 * x2 + 0.5 * x3; -} - -static INLINE void get_cubic_kernel_int(double x, int16_t *kernel) { - double kernel_dbl[4]; - get_cubic_kernel_dbl(x, kernel_dbl); - - kernel[0] = (int16_t)rint(kernel_dbl[0] * (1 << DISFLOW_INTERP_BITS)); - kernel[1] = (int16_t)rint(kernel_dbl[1] * (1 << DISFLOW_INTERP_BITS)); - kernel[2] = (int16_t)rint(kernel_dbl[2] * (1 << DISFLOW_INTERP_BITS)); - kernel[3] = (int16_t)rint(kernel_dbl[3] * (1 << DISFLOW_INTERP_BITS)); -} - -#if CHECK_RESULTS -static INLINE int get_cubic_value_int(const int *p, const int16_t *kernel) { - return kernel[0] * p[0] + kernel[1] * p[1] + kernel[2] * p[2] + - kernel[3] * p[3]; +#if DISFLOW_PATCH_SIZE != 8 +#error "Need to change disflow_sse4.c if DISFLOW_PATCH_SIZE != 8" +#endif + +// Compute horizontal and vertical kernels and return them packed into a +// register. 
The coefficient ordering is: +// h0, h1, v0, v1, h2, h3, v2, v3 +// This is chosen because it takes less work than fully separating the kernels, +// but it is separated enough that we can pick out each coefficient pair in the +// main compute_flow_at_point function +static INLINE __m128i compute_cubic_kernels(double u, double v) { + const __m128d x = _mm_set_pd(v, u); + + const __m128d x2 = _mm_mul_pd(x, x); + const __m128d x3 = _mm_mul_pd(x2, x); + + // Macro to multiply a value v by a constant coefficient c +#define MULC(c, v) _mm_mul_pd(_mm_set1_pd(c), v) + + // Compute floating-point kernel + // Note: To ensure results are bit-identical to the C code, we need to perform + // exactly the same sequence of operations here as in the C code. + __m128d k0 = _mm_sub_pd(_mm_add_pd(MULC(-0.5, x), x2), MULC(0.5, x3)); + __m128d k1 = + _mm_add_pd(_mm_sub_pd(_mm_set1_pd(1.0), MULC(2.5, x2)), MULC(1.5, x3)); + __m128d k2 = + _mm_sub_pd(_mm_add_pd(MULC(0.5, x), MULC(2.0, x2)), MULC(1.5, x3)); + __m128d k3 = _mm_add_pd(MULC(-0.5, x2), MULC(0.5, x3)); +#undef MULC + + // Integerize + __m128d prec = _mm_set1_pd((double)(1 << DISFLOW_INTERP_BITS)); + + k0 = _mm_round_pd(_mm_mul_pd(k0, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k1 = _mm_round_pd(_mm_mul_pd(k1, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k2 = _mm_round_pd(_mm_mul_pd(k2, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + k3 = _mm_round_pd(_mm_mul_pd(k3, prec), + _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC); + + const __m128i c0 = _mm_cvtpd_epi32(k0); + const __m128i c1 = _mm_cvtpd_epi32(k1); + const __m128i c2 = _mm_cvtpd_epi32(k2); + const __m128i c3 = _mm_cvtpd_epi32(k3); + + // Rearrange results and convert down to 16 bits, giving the target output + // ordering + const __m128i c01 = _mm_unpacklo_epi32(c0, c1); + const __m128i c23 = _mm_unpacklo_epi32(c2, c3); + return _mm_packs_epi32(c01, c23); } -#endif // CHECK_RESULTS // Compare two regions of width x height pixels, one rooted at position // (x, y) in src and the other at (x + u, y + v) in ref. @@ -80,10 +92,6 @@ // These will be flattened at the end. 
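// Editor's aside (illustrative, not from the upstream patch): with the packed
// order h0, h1, v0, v1, h2, h3, v2, v3, each 32-bit element of `kernels` is a
// ready-made coefficient pair for _mm_madd_epi16, so the hunks below can
// broadcast a pair in a single step, for example:
//   __m128i h_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0));  // = (h0, h1)
//   __m128i v_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1));  // = (v0, v1)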
__m128i b0_acc = _mm_setzero_si128(); __m128i b1_acc = _mm_setzero_si128(); -#if CHECK_RESULTS - // Also keep a running sum using the C algorithm, for cross-checking - int c_result[2] = { 0 }; -#endif // CHECK_RESULTS // Split offset into integer and fractional parts, and compute cubic // interpolation kernels @@ -92,13 +100,11 @@ const double u_frac = u - floor(u); const double v_frac = v - floor(v); - int16_t h_kernel[4]; - int16_t v_kernel[4]; - get_cubic_kernel_int(u_frac, h_kernel); - get_cubic_kernel_int(v_frac, v_kernel); + const __m128i kernels = compute_cubic_kernels(u_frac, v_frac); // Storage for intermediate values between the two convolution directions - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]; + DECLARE_ALIGNED(16, int16_t, + tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 3)]); int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; // Offset by one row // Clamp coordinates so that all pixels we fetch will remain within the @@ -121,8 +127,8 @@ // We split the kernel into two vectors with kernel indices: // 0, 1, 0, 1, 0, 1, 0, 1, and // 2, 3, 2, 3, 2, 3, 2, 3 - __m128i h_kernel_01 = xx_set2_epi16(h_kernel[0], h_kernel[1]); - __m128i h_kernel_23 = xx_set2_epi16(h_kernel[2], h_kernel[3]); + __m128i h_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 0)); + __m128i h_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 2)); __m128i round_const_h = _mm_set1_epi32(1 << (DISFLOW_INTERP_BITS - 6 - 1)); @@ -141,10 +147,6 @@ __m128i px_0to7_i16 = _mm_cvtepu8_epi16(row); __m128i px_4to10_i16 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 4)); - // Relevant multiply instruction - // This multiplies pointwise, then sums in pairs. - //_mm_madd_epi16(); - // Compute first four outputs // input pixels 0, 1, 1, 2, 2, 3, 3, 4 // * kernel 0, 1, 0, 1, 0, 1, 0, 1 @@ -180,43 +182,14 @@ DISFLOW_INTERP_BITS - 6); _mm_storeu_si128((__m128i *)tmp_row, _mm_packs_epi32(out0, out1)); - -#if CHECK_RESULTS && !defined(NDEBUG) - // Cross-check - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - const int x_w = x0 + j; - int arr[4]; - - arr[0] = (int)ref[y_w * stride + (x_w - 1)]; - arr[1] = (int)ref[y_w * stride + (x_w + 0)]; - arr[2] = (int)ref[y_w * stride + (x_w + 1)]; - arr[3] = (int)ref[y_w * stride + (x_w + 2)]; - - // Apply kernel and round, keeping 6 extra bits of precision. - // - // 6 is the maximum allowable number of extra bits which will avoid - // the intermediate values overflowing an int16_t. The most extreme - // intermediate value occurs when: - // * The input pixels are [0, 255, 255, 0] - // * u_frac = 0.5 - // In this case, the un-scaled output is 255 * 1.125 = 286.875. - // As an integer with 6 fractional bits, that is 18360, which fits - // in an int16_t. But with 7 fractional bits it would be 36720, - // which is too large. 
- const int c_value = ROUND_POWER_OF_TWO(get_cubic_value_int(arr, h_kernel), - DISFLOW_INTERP_BITS - 6); - (void)c_value; // Suppress warnings - assert(tmp_row[j] == c_value); - } -#endif // CHECK_RESULTS } // Vertical convolution const int round_bits = DISFLOW_INTERP_BITS + 6 - DISFLOW_DERIV_SCALE_LOG2; __m128i round_const_v = _mm_set1_epi32(1 << (round_bits - 1)); - __m128i v_kernel_01 = xx_set2_epi16(v_kernel[0], v_kernel[1]); - __m128i v_kernel_23 = xx_set2_epi16(v_kernel[2], v_kernel[3]); + __m128i v_kernel_01 = _mm_set1_epi32(_mm_extract_epi32(kernels, 1)); + __m128i v_kernel_23 = _mm_set1_epi32(_mm_extract_epi32(kernels, 3)); for (int i = 0; i < DISFLOW_PATCH_SIZE; ++i) { int16_t *tmp_row = &tmp[i * DISFLOW_PATCH_SIZE]; @@ -259,30 +232,6 @@ __m128i dy_row = _mm_loadu_si128((__m128i *)&dy[i * DISFLOW_PATCH_SIZE]); b0_acc = _mm_add_epi32(b0_acc, _mm_madd_epi16(dx_row, dt)); b1_acc = _mm_add_epi32(b1_acc, _mm_madd_epi16(dy_row, dt)); - -#if CHECK_RESULTS - int16_t dt_arr[8]; - memcpy(dt_arr, &dt, 8 * sizeof(*dt_arr)); - for (int j = 0; j < DISFLOW_PATCH_SIZE; ++j) { - int16_t *p = &tmp[i * DISFLOW_PATCH_SIZE + j]; - int arr[4] = { p[-DISFLOW_PATCH_SIZE], p[0], p[DISFLOW_PATCH_SIZE], - p[2 * DISFLOW_PATCH_SIZE] }; - const int result = get_cubic_value_int(arr, v_kernel); - - // Apply kernel and round. - // This time, we have to round off the 6 extra bits which were kept - // earlier, but we also want to keep DISFLOW_DERIV_SCALE_LOG2 extra bits - // of precision to match the scale of the dx and dy arrays. - const int c_warped = ROUND_POWER_OF_TWO(result, round_bits); - const int c_src_px = src[(x + j) + (y + i) * stride] << 3; - const int c_dt = c_warped - c_src_px; - - assert(dt_arr[j] == c_dt); - - c_result[0] += dx[i * DISFLOW_PATCH_SIZE + j] * c_dt; - c_result[1] += dy[i * DISFLOW_PATCH_SIZE + j] * c_dt; - } -#endif // CHECK_RESULTS } // Flatten the two sets of partial sums to find the final value of b @@ -292,156 +241,66 @@ __m128i partial_sum = _mm_hadd_epi32(b0_acc, b1_acc); b[0] = _mm_extract_epi32(partial_sum, 0) + _mm_extract_epi32(partial_sum, 1); b[1] = _mm_extract_epi32(partial_sum, 2) + _mm_extract_epi32(partial_sum, 3); - -#if CHECK_RESULTS - assert(b[0] == c_result[0]); - assert(b[1] == c_result[1]); -#endif // CHECK_RESULTS -} - -static INLINE void sobel_filter_x(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = _mm_sub_epi16(px0, px2); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical 
filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px1 = _mm_loadu_si128((__m128i *)tmp_row); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS - } } -static INLINE void sobel_filter_y(const uint8_t *src, int src_stride, - int16_t *dst, int dst_stride) { - int16_t tmp_[DISFLOW_PATCH_SIZE * (DISFLOW_PATCH_SIZE + 2)]; - int16_t *tmp = tmp_ + DISFLOW_PATCH_SIZE; -#if CHECK_RESULTS - const int taps = 3; -#endif // CHECK_RESULTS - - // Horizontal filter - // Here the kernel is {1, 2, 1}, which can be implemented - // with simple sums rather than multiplies and adds. - // In order to minimize dependency chains, we evaluate in the order - // (image[y - 1] + image[y + 1]) + (image[y] << 1) - // This way, the first addition and the shift can happen in parallel - for (int y = -1; y < DISFLOW_PATCH_SIZE + 1; ++y) { - const uint8_t *src_row = src + y * src_stride; - int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - - // Load pixels and expand to 16 bits - __m128i row = _mm_loadu_si128((__m128i *)(src_row - 1)); - __m128i px0 = _mm_cvtepu8_epi16(row); - __m128i px1 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); - __m128i px2 = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); - - __m128i out = - _mm_add_epi16(_mm_add_epi16(px0, px2), _mm_slli_epi16(px1, 1)); - - // Store to intermediate array - _mm_storeu_si128((__m128i *)tmp_row, out); - -#if CHECK_RESULTS - // Cross-check - static const int16_t h_kernel[3] = { 1, 2, 1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += h_kernel[k] * src_row[x + k - 1]; - } - (void)sum; - assert(tmp_row[x] == sum); - } -#endif // CHECK_RESULTS - } - - // Vertical filter - // As the kernel is simply {1, 0, -1}, we implement this as simply - // out[x] = image[x-1] - image[x+1] - // rather than doing a "proper" convolution operation - for (int y = 0; y < DISFLOW_PATCH_SIZE; ++y) { - const int16_t *tmp_row = tmp + y * DISFLOW_PATCH_SIZE; - int16_t *dst_row = dst + y * dst_stride; - - __m128i px0 = _mm_loadu_si128((__m128i *)(tmp_row - DISFLOW_PATCH_SIZE)); - __m128i px2 = _mm_loadu_si128((__m128i *)(tmp_row + DISFLOW_PATCH_SIZE)); - - __m128i out = _mm_sub_epi16(px0, px2); - - _mm_storeu_si128((__m128i *)dst_row, out); - -#if CHECK_RESULTS - static const int16_t v_kernel[3] = { 1, 0, -1 }; - for (int x = 0; x < DISFLOW_PATCH_SIZE; ++x) { - int sum = 0; - for (int k = 0; k < taps; ++k) { - sum += v_kernel[k] * tmp[(y + k - 1) * DISFLOW_PATCH_SIZE + x]; - } - (void)sum; - assert(dst_row[x] == sum); - } -#endif // CHECK_RESULTS +// Compute the x and y gradients of the 
source patch in a single pass, +// and store into dx and dy respectively. +static INLINE void sobel_filter(const uint8_t *src, int src_stride, int16_t *dx, + int16_t *dy) { + // Loop setup: Load the first two rows (of 10 input rows) and apply + // the horizontal parts of the two filters + __m128i row_m1 = _mm_loadu_si128((__m128i *)(src - src_stride - 1)); + __m128i row_m1_a = _mm_cvtepu8_epi16(row_m1); + __m128i row_m1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 1)); + __m128i row_m1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_m1, 2)); + + __m128i row_m1_hsmooth = _mm_add_epi16(_mm_add_epi16(row_m1_a, row_m1_c), + _mm_slli_epi16(row_m1_b, 1)); + __m128i row_m1_hdiff = _mm_sub_epi16(row_m1_a, row_m1_c); + + __m128i row = _mm_loadu_si128((__m128i *)(src - 1)); + __m128i row_a = _mm_cvtepu8_epi16(row); + __m128i row_b = _mm_cvtepu8_epi16(_mm_srli_si128(row, 1)); + __m128i row_c = _mm_cvtepu8_epi16(_mm_srli_si128(row, 2)); + + __m128i row_hsmooth = + _mm_add_epi16(_mm_add_epi16(row_a, row_c), _mm_slli_epi16(row_b, 1)); + __m128i row_hdiff = _mm_sub_epi16(row_a, row_c); + + // Main loop: For each of the 8 output rows: + // * Load row i+1 and apply both horizontal filters + // * Apply vertical filters and store results + // * Shift rows for next iteration + for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { + // Load row i+1 and apply both horizontal filters + const __m128i row_p1 = + _mm_loadu_si128((__m128i *)(src + (i + 1) * src_stride - 1)); + const __m128i row_p1_a = _mm_cvtepu8_epi16(row_p1); + const __m128i row_p1_b = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 1)); + const __m128i row_p1_c = _mm_cvtepu8_epi16(_mm_srli_si128(row_p1, 2)); + + const __m128i row_p1_hsmooth = _mm_add_epi16( + _mm_add_epi16(row_p1_a, row_p1_c), _mm_slli_epi16(row_p1_b, 1)); + const __m128i row_p1_hdiff = _mm_sub_epi16(row_p1_a, row_p1_c); + + // Apply vertical filters and store results + // dx = vertical smooth(horizontal diff(input)) + // dy = vertical diff(horizontal smooth(input)) + const __m128i dx_row = + _mm_add_epi16(_mm_add_epi16(row_m1_hdiff, row_p1_hdiff), + _mm_slli_epi16(row_hdiff, 1)); + const __m128i dy_row = _mm_sub_epi16(row_m1_hsmooth, row_p1_hsmooth); + + _mm_storeu_si128((__m128i *)(dx + i * DISFLOW_PATCH_SIZE), dx_row); + _mm_storeu_si128((__m128i *)(dy + i * DISFLOW_PATCH_SIZE), dy_row); + + // Shift rows for next iteration + // This allows a lot of work to be reused, reducing the number of + // horizontal filtering operations from 2*3*8 = 48 to 2*10 = 20 + row_m1_hsmooth = row_hsmooth; + row_m1_hdiff = row_hdiff; + row_hsmooth = row_p1_hsmooth; + row_hdiff = row_p1_hdiff; } } @@ -476,30 +335,6 @@ // which is convenient for integerized SIMD implementation. 
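// Editor's aside (illustrative, not from the upstream patch): at this point
// the lanes of `result` hold the 2x2 least-squares matrix in row-major order,
// { sum(dx*dx), sum(dx*dy), sum(dx*dy), sum(dy*dy) }. Since _mm_set_epi32
// takes its arguments high-to-low, _mm_set_epi32(1, 0, 0, 1) places 1 in
// lanes 0 and 3, so the addition below is exactly M += 1 * I, i.e. k = 1.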
result = _mm_add_epi32(result, _mm_set_epi32(1, 0, 0, 1)); -#if CHECK_RESULTS - int tmp[4] = { 0 }; - - for (int i = 0; i < DISFLOW_PATCH_SIZE; i++) { - for (int j = 0; j < DISFLOW_PATCH_SIZE; j++) { - tmp[0] += dx[i * dx_stride + j] * dx[i * dx_stride + j]; - tmp[1] += dx[i * dx_stride + j] * dy[i * dy_stride + j]; - // Don't compute tmp[2], as it should be equal to tmp[1] - tmp[3] += dy[i * dy_stride + j] * dy[i * dy_stride + j]; - } - } - - // Apply regularization - tmp[0] += 1; - tmp[3] += 1; - - tmp[2] = tmp[1]; - - assert(tmp[0] == _mm_extract_epi32(result, 0)); - assert(tmp[1] == _mm_extract_epi32(result, 1)); - assert(tmp[2] == _mm_extract_epi32(result, 2)); - assert(tmp[3] == _mm_extract_epi32(result, 3)); -#endif // CHECK_RESULTS - // Convert results to doubles and store _mm_storeu_pd(M, _mm_cvtepi32_pd(result)); _mm_storeu_pd(M + 2, _mm_cvtepi32_pd(_mm_srli_si128(result, 8))); @@ -525,16 +360,15 @@ void aom_compute_flow_at_point_sse4_1(const uint8_t *src, const uint8_t *ref, int x, int y, int width, int height, int stride, double *u, double *v) { - double M[4]; - double M_inv[4]; + DECLARE_ALIGNED(16, double, M[4]); + DECLARE_ALIGNED(16, double, M_inv[4]); + DECLARE_ALIGNED(16, int16_t, dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); + DECLARE_ALIGNED(16, int16_t, dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]); int b[2]; - int16_t dx[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; - int16_t dy[DISFLOW_PATCH_SIZE * DISFLOW_PATCH_SIZE]; // Compute gradients within this patch const uint8_t *src_patch = &src[y * stride + x]; - sobel_filter_x(src_patch, stride, dx, DISFLOW_PATCH_SIZE); - sobel_filter_y(src_patch, stride, dy, DISFLOW_PATCH_SIZE); + sobel_filter(src_patch, stride, dx, dy); compute_flow_matrix(dx, DISFLOW_PATCH_SIZE, dy, DISFLOW_PATCH_SIZE, M); invert_2x2(M, M_inv); diff -Nru aom-3.8.2/aom_dsp/mathutils.h aom-3.9.0/aom_dsp/mathutils.h --- aom-3.8.2/aom_dsp/mathutils.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/mathutils.h 2024-05-07 19:57:02.519000000 +0000 @@ -17,7 +17,6 @@ #include #include "aom_dsp/aom_dsp_common.h" -#include "aom_mem/aom_mem.h" static const double TINY_NEAR_ZERO = 1.0E-16; diff -Nru aom-3.8.2/aom_dsp/noise_model.c aom-3.9.0/aom_dsp/noise_model.c --- aom-3.8.2/aom_dsp/noise_model.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/noise_model.c 2024-05-07 19:57:02.520000000 +0000 @@ -19,6 +19,8 @@ #include "aom_dsp/noise_model.h" #include "aom_dsp/noise_util.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_scale/yv12config.h" #define kLowPolyNumParams 3 @@ -1555,7 +1557,7 @@ } static int denoise_and_model_realloc_if_necessary( - struct aom_denoise_and_model_t *ctx, YV12_BUFFER_CONFIG *sd) { + struct aom_denoise_and_model_t *ctx, const YV12_BUFFER_CONFIG *sd) { if (ctx->width == sd->y_width && ctx->height == sd->y_height && ctx->y_stride == sd->y_stride && ctx->uv_stride == sd->uv_stride) return 1; @@ -1624,7 +1626,7 @@ // TODO(aomedia:3151): Handle a monochrome image (sd->u_buffer and sd->v_buffer // are null pointers) correctly. 
int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *sd, + const YV12_BUFFER_CONFIG *sd, aom_film_grain_t *film_grain, int apply_denoise) { const int block_size = ctx->block_size; const int use_highbd = (sd->flags & YV12_FLAG_HIGHBITDEPTH) != 0; diff -Nru aom-3.8.2/aom_dsp/noise_model.h aom-3.9.0/aom_dsp/noise_model.h --- aom-3.8.2/aom_dsp/noise_model.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/noise_model.h 2024-05-07 19:57:02.523000000 +0000 @@ -297,14 +297,14 @@ * aom_denoise_and_model_alloc that holds some * buffers for denoising and the current noise * estimate. - * \param[in,out] buf The raw input buffer to be denoised. + * \param[in,out] sd The raw input buffer to be denoised. * \param[out] grain Output film grain parameters * \param[in] apply_denoise Whether or not to apply the denoising to the * frame that will be encoded */ int aom_denoise_and_model_run(struct aom_denoise_and_model_t *ctx, - YV12_BUFFER_CONFIG *buf, aom_film_grain_t *grain, - int apply_denoise); + const YV12_BUFFER_CONFIG *sd, + aom_film_grain_t *grain, int apply_denoise); /*!\brief Allocates a context that can be used for denoising and noise modeling. * diff -Nru aom-3.8.2/aom_dsp/odintrin.h aom-3.9.0/aom_dsp/odintrin.h --- aom-3.8.2/aom_dsp/odintrin.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/odintrin.h 2024-05-07 19:57:02.526000000 +0000 @@ -70,20 +70,6 @@ #define OD_ARG_NONNULL(x) #endif -/** Copy n elements of memory from src to dst. The 0* term provides - compile-time type checking */ -#if !defined(OVERRIDE_OD_COPY) -#define OD_COPY(dst, src, n) \ - (memcpy((dst), (src), sizeof(*(dst)) * (n) + 0 * ((dst) - (src)))) -#endif - -/** Copy n elements of memory from src to dst, allowing overlapping regions. - The 0* term provides compile-time type checking */ -#if !defined(OVERRIDE_OD_MOVE) -# define OD_MOVE(dst, src, n) \ - (memmove((dst), (src), sizeof(*(dst))*(n) + 0*((dst) - (src)) )) -#endif - /*All of these macros should expect floats as arguments.*/ # define OD_SIGNMASK(a) (-((a) < 0)) # define OD_FLIPSIGNI(a, b) (((a) + OD_SIGNMASK(b)) ^ OD_SIGNMASK(b)) diff -Nru aom-3.8.2/aom_dsp/psnr.c aom-3.9.0/aom_dsp/psnr.c --- aom-3.8.2/aom_dsp/psnr.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/psnr.c 2024-05-07 19:57:02.526000000 +0000 @@ -349,7 +349,11 @@ int i; uint64_t total_sse = 0; uint32_t total_samples = 0; +#if CONFIG_LIBVMAF_PSNR_PEAK + double peak = (double)(255 << (in_bit_depth - 8)); +#else double peak = (double)((1 << in_bit_depth) - 1); +#endif // CONFIG_LIBVMAF_PSNR_PEAK const unsigned int input_shift = bit_depth - in_bit_depth; for (i = 0; i < 3; ++i) { @@ -384,7 +388,11 @@ // Compute PSNR based on stream bit depth if ((a->flags & YV12_FLAG_HIGHBITDEPTH) && (in_bit_depth < bit_depth)) { +#if CONFIG_LIBVMAF_PSNR_PEAK + peak = (double)(255 << (bit_depth - 8)); +#else peak = (double)((1 << bit_depth) - 1); +#endif // CONFIG_LIBVMAF_PSNR_PEAK total_sse = 0; total_samples = 0; for (i = 0; i < 3; ++i) { diff -Nru aom-3.8.2/aom_dsp/psnr.h aom-3.9.0/aom_dsp/psnr.h --- aom-3.8.2/aom_dsp/psnr.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/psnr.h 2024-05-07 19:57:02.527000000 +0000 @@ -31,7 +31,7 @@ /*!\brief Converts SSE to PSNR * - * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR). + * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PSNR). 
* * \param[in] samples Number of samples * \param[in] peak Max sample value diff -Nru aom-3.8.2/aom_dsp/pyramid.c aom-3.9.0/aom_dsp/pyramid.c --- aom-3.8.2/aom_dsp/pyramid.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/pyramid.c 2024-05-07 19:57:02.528000000 +0000 @@ -12,7 +12,7 @@ #include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/bitops.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" // TODO(rachelbarker): Move needed code from av1/ to aom_dsp/ #include "av1/common/resize.h" @@ -26,18 +26,16 @@ // levels. This is counted in the size checked against the max allocation // limit // * Then calls aom_alloc_pyramid() to actually create the pyramid -// * Pyramid is initially marked as invalid (no data) -// * Whenever pyramid is needed, we check the valid flag. If set, use existing -// data. If not set, compute full pyramid -// * Whenever frame buffer is reused, clear the valid flag +// * Pyramid is initially marked as containing no valid data +// * Each pyramid layer is computed on-demand, the first time it is requested +// * Whenever frame buffer is reused, reset the counter of filled levels. +// This invalidates all of the existing pyramid levels. // * Whenever frame buffer is resized, reallocate pyramid -size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, - bool image_is_16bit) { - // Limit number of levels on small frames +size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit) { + // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); - const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - n_levels = AOMMIN(n_levels, max_levels); + const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); size_t alloc_size = 0; alloc_size += sizeof(ImagePyramid); @@ -100,12 +98,10 @@ return alloc_size; } -ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, - bool image_is_16bit) { - // Limit number of levels on small frames +ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit) { + // Allocate the maximum possible number of layers for this width and height const int msb = get_msb(AOMMIN(width, height)); - const int max_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); - n_levels = AOMMIN(n_levels, max_levels); + const int n_levels = AOMMAX(msb - MIN_PYRAMID_SIZE_LOG2, 1); ImagePyramid *pyr = aom_calloc(1, sizeof(*pyr)); if (!pyr) { @@ -118,8 +114,8 @@ return NULL; } - pyr->valid = false; - pyr->n_levels = n_levels; + pyr->max_levels = n_levels; + pyr->filled_levels = 0; // Compute sizes and offsets for each pyramid level // These are gathered up first, so that we can allocate all pyramid levels @@ -248,46 +244,67 @@ } } -// Compute coarse to fine pyramids for a frame +// Compute downsampling pyramid for a frame +// +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. +// +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. 
+// // This must only be called while holding frame_pyr->mutex -static INLINE bool fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *frame_pyr) { - int n_levels = frame_pyr->n_levels; +static INLINE int fill_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *frame_pyr) { + int already_filled_levels = frame_pyr->filled_levels; + + // This condition should already be enforced by aom_compute_pyramid + assert(n_levels <= frame_pyr->max_levels); + + if (already_filled_levels >= n_levels) { + return n_levels; + } + const int frame_width = frame->y_crop_width; const int frame_height = frame->y_crop_height; const int frame_stride = frame->y_stride; assert((frame_width >> n_levels) >= 0); assert((frame_height >> n_levels) >= 0); - PyramidLayer *first_layer = &frame_pyr->layers[0]; - if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { - // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits - assert(first_layer->width == frame_width); - assert(first_layer->height == frame_height); - - uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); - uint8_t *pyr_buffer = first_layer->buffer; - int pyr_stride = first_layer->stride; - for (int y = 0; y < frame_height; y++) { - uint16_t *frame_row = frame_buffer + y * frame_stride; - uint8_t *pyr_row = pyr_buffer + y * pyr_stride; - for (int x = 0; x < frame_width; x++) { - pyr_row[x] = frame_row[x] >> (bit_depth - 8); + if (already_filled_levels == 0) { + // Fill in largest level from the original image + PyramidLayer *first_layer = &frame_pyr->layers[0]; + if (frame->flags & YV12_FLAG_HIGHBITDEPTH) { + // For frames stored in a 16-bit buffer, we need to downconvert to 8 bits + assert(first_layer->width == frame_width); + assert(first_layer->height == frame_height); + + uint16_t *frame_buffer = CONVERT_TO_SHORTPTR(frame->y_buffer); + uint8_t *pyr_buffer = first_layer->buffer; + int pyr_stride = first_layer->stride; + for (int y = 0; y < frame_height; y++) { + uint16_t *frame_row = frame_buffer + y * frame_stride; + uint8_t *pyr_row = pyr_buffer + y * pyr_stride; + for (int x = 0; x < frame_width; x++) { + pyr_row[x] = frame_row[x] >> (bit_depth - 8); + } } + + fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); + } else { + // For frames stored in an 8-bit buffer, we don't need to copy anything - + // we can just reference the original image buffer + first_layer->buffer = frame->y_buffer; + first_layer->width = frame_width; + first_layer->height = frame_height; + first_layer->stride = frame_stride; } - fill_border(pyr_buffer, frame_width, frame_height, pyr_stride); - } else { - // For frames stored in an 8-bit buffer, we need to configure the first - // pyramid layer to point at the original image buffer - first_layer->buffer = frame->y_buffer; - first_layer->width = frame_width; - first_layer->height = frame_height; - first_layer->stride = frame_stride; + already_filled_levels = 1; } // Fill in the remaining levels through progressive downsampling - for (int level = 1; level < n_levels; ++level) { + for (int level = already_filled_levels; level < n_levels; ++level) { PyramidLayer *prev_layer = &frame_pyr->layers[level - 1]; uint8_t *prev_buffer = prev_layer->buffer; int prev_stride = prev_layer->stride; @@ -314,11 +331,16 @@ // TODO(rachelbarker): Use optimized downsample-by-2 function if (!av1_resize_plane(prev_buffer, this_height << 1, this_width << 1, prev_stride, this_buffer, this_height, this_width, - this_stride)) - return false; + this_stride)) { + // If we 
can't allocate memory, we'll have to terminate early + frame_pyr->filled_levels = n_levels; + return -1; + } fill_border(this_buffer, this_width, this_height, this_stride); } - return true; + + frame_pyr->filled_levels = n_levels; + return n_levels; } // Fill out a downsampling pyramid for a given frame. @@ -327,63 +349,72 @@ // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // -// For small input frames, the number of levels actually constructed -// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE -// pixels along each side. +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. +// No matter how small the frame is, at least one level is guaranteed +// to be filled. // -// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, -// we will still construct the top level. -bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *pyr) { +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. +int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *pyr) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex - // before reading or writing the "valid" flag, and hold it while computing - // the pyramid, to ensure proper behaviour if multiple threads call this - // function simultaneously + // before reading or writing the filled_levels field, and hold it while + // computing any additional pyramid levels, to ensure proper behaviour + // when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - if (!pyr->valid) { - pyr->valid = fill_pyramid(frame, bit_depth, pyr); + n_levels = AOMMIN(n_levels, pyr->max_levels); + int result = n_levels; + if (pyr->filled_levels < n_levels) { + // Compute any missing levels that we need + result = fill_pyramid(frame, bit_depth, n_levels, pyr); } - bool valid = pyr->valid; - - // At this point, the pyramid is guaranteed to be valid, and can be safely - // read from without holding the mutex any more + // At this point, as long as result >= 0, the requested number of pyramid + // levels are guaranteed to be valid, and can be safely read from without + // holding the mutex any further + assert(IMPLIES(result >= 0, pyr->filled_levels >= n_levels)); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - return valid; + return result; } #ifndef NDEBUG -// Check if a pyramid has already been computed. +// Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex -// while reading the valid flag, we cannot just write: -// assert(pyr->valid); +// while reading the number of already-computed levels, we cannot just write: +// assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: -// assert(aom_is_pyramid_valid(pyr)); -bool aom_is_pyramid_valid(ImagePyramid *pyr) { +// assert(aom_is_pyramid_valid(pyr, n_levels)); +// +// Note: This deliberately does not restrict n_levels based on the maximum +// number of permitted levels for the frame size. 
This allows the check to +// catch cases where the caller forgets to handle the case where +// max_levels is less than the requested number of levels +bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels) { assert(pyr); // Per the comments in the ImagePyramid struct, we must take this mutex - // before reading or writing the "valid" flag, and hold it while computing - // the pyramid, to ensure proper behaviour if multiple threads call this - // function simultaneously + // before reading or writing the filled_levels field, to ensure proper + // behaviour when multithreading is used #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - bool valid = pyr->valid; + bool result = (pyr->filled_levels >= n_levels); #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - return valid; + return result; } #endif @@ -394,7 +425,7 @@ #if CONFIG_MULTITHREAD pthread_mutex_lock(&pyr->mutex); #endif // CONFIG_MULTITHREAD - pyr->valid = false; + pyr->filled_levels = 0; #if CONFIG_MULTITHREAD pthread_mutex_unlock(&pyr->mutex); #endif // CONFIG_MULTITHREAD diff -Nru aom-3.8.2/aom_dsp/pyramid.h aom-3.9.0/aom_dsp/pyramid.h --- aom-3.8.2/aom_dsp/pyramid.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/pyramid.h 2024-05-07 19:57:02.531000000 +0000 @@ -19,7 +19,7 @@ #include "config/aom_config.h" #include "aom_scale/yv12config.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #ifdef __cplusplus extern "C" { @@ -57,23 +57,31 @@ // same time // // Semantics: - // * This mutex must be held whenever reading or writing the `valid` flag + // * This mutex must be held whenever reading or writing the + // `filled_levels` field // // * This mutex must also be held while computing the image pyramid, // to ensure that only one thread may do so at a time. // - // * However, once you have read the valid flag and seen a true value, - // it is safe to drop the mutex and read from the remaining fields. - // This is because, once the image pyramid is computed, its contents + // * However, once you have read the filled_levels field and observed + // a value N, it is safe to drop the mutex and read from the remaining + // fields, including the first N pyramid levels (but no higher). + // Note that filled_levels must be read once and cached in a local variable + // in order for this to be safe - it cannot be re-read without retaking + // the mutex. + // + // This works because, once the image pyramid is computed, its contents // will not be changed until the parent frame buffer is recycled, // which will not happen until there are no more outstanding references // to the frame buffer. pthread_mutex_t mutex; #endif - // Flag indicating whether the pyramid contains valid data - bool valid; - // Number of allocated/filled levels in this pyramid - int n_levels; + // Maximum number of levels for the given frame size + // We always allocate enough memory for this many levels, as the memory + // cost of higher levels of the pyramid is minimal. 
+ int max_levels; + // Number of levels which currently hold valid data + int filled_levels; // Pointer to allocated buffer uint8_t *buffer_alloc; // Data for each level @@ -82,11 +90,9 @@ PyramidLayer *layers; } ImagePyramid; -size_t aom_get_pyramid_alloc_size(int width, int height, int n_levels, - bool image_is_16bit); +size_t aom_get_pyramid_alloc_size(int width, int height, bool image_is_16bit); -ImagePyramid *aom_alloc_pyramid(int width, int height, int n_levels, - bool image_is_16bit); +ImagePyramid *aom_alloc_pyramid(int width, int height, bool image_is_16bit); // Fill out a downsampling pyramid for a given frame. // @@ -94,23 +100,28 @@ // regardless of the input bit depth. Additional levels are then downscaled // by powers of 2. // -// For small input frames, the number of levels actually constructed -// will be limited so that the smallest image is at least MIN_PYRAMID_SIZE -// pixels along each side. +// This function will ensure that the first `n_levels` levels of the pyramid +// are filled, unless the frame is too small to have this many levels. +// In that case, we will fill all available levels and then stop. // -// However, if the input frame has a side of length < MIN_PYRAMID_SIZE, -// we will still construct the top level. -bool aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, - ImagePyramid *pyr); +// Returns the actual number of levels filled, capped at n_levels, +// or -1 on error. +int aom_compute_pyramid(const YV12_BUFFER_CONFIG *frame, int bit_depth, + int n_levels, ImagePyramid *pyr); #ifndef NDEBUG -// Check if a pyramid has already been computed. +// Check if a pyramid has already been computed to at least n levels // This is mostly a debug helper - as it is necessary to hold pyr->mutex -// while reading the valid flag, we cannot just write: -// assert(pyr->valid); +// while reading the number of already-computed levels, we cannot just write: +// assert(pyr->filled_levels >= n_levels); // This function allows the check to be correctly written as: -// assert(aom_is_pyramid_valid(pyr)); -bool aom_is_pyramid_valid(ImagePyramid *pyr); +// assert(aom_is_pyramid_valid(pyr, n_levels)); +// +// Note: This deliberately does not restrict n_levels based on the maximum +// number of permitted levels for the frame size. This allows the check to +// catch cases where the caller forgets to handle the case where +// max_levels is less than the requested number of levels +bool aom_is_pyramid_valid(ImagePyramid *pyr, int n_levels); #endif // Mark a pyramid as no longer containing valid data. diff -Nru aom-3.8.2/aom_dsp/rect.h aom-3.9.0/aom_dsp/rect.h --- aom-3.8.2/aom_dsp/rect.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/rect.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,35 +0,0 @@ -/* - * Copyright (c) 2022, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_RECT_H_ -#define AOM_AOM_DSP_RECT_H_ - -#include "config/aom_config.h" - -#include - -// Struct representing a rectangle of pixels. -// The axes are inclusive-exclusive, ie. 
the point (top, left) is included -// in the rectangle but (bottom, right) is not. -typedef struct { - int left, right, top, bottom; -} PixelRect; - -static INLINE int rect_width(const PixelRect *r) { return r->right - r->left; } - -static INLINE int rect_height(const PixelRect *r) { return r->bottom - r->top; } - -static INLINE bool is_inside_rect(const int x, const int y, - const PixelRect *r) { - return (r->left <= x && x < r->right) && (r->top <= y && y < r->bottom); -} - -#endif // AOM_AOM_DSP_RECT_H_ diff -Nru aom-3.8.2/aom_dsp/simd/v128_intrinsics_arm.h aom-3.9.0/aom_dsp/simd/v128_intrinsics_arm.h --- aom-3.8.2/aom_dsp/simd/v128_intrinsics_arm.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/simd/v128_intrinsics_arm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,977 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ - -#include - -#include "config/aom_config.h" - -#include "aom_dsp/simd/v64_intrinsics_arm.h" - -typedef int64x2_t v128; - -SIMD_INLINE uint32_t v128_low_u32(v128 a) { - return v64_low_u32(vget_low_s64(a)); -} - -SIMD_INLINE v64 v128_low_v64(v128 a) { return vget_low_s64(a); } - -SIMD_INLINE v64 v128_high_v64(v128 a) { return vget_high_s64(a); } - -SIMD_INLINE v128 v128_from_v64(v64 a, v64 b) { return vcombine_s64(b, a); } - -SIMD_INLINE v128 v128_from_64(uint64_t a, uint64_t b) { - return vcombine_s64(vcreate_s64(b), vcreate_s64(a)); -} - -SIMD_INLINE v128 v128_from_32(uint32_t a, uint32_t b, uint32_t c, uint32_t d) { - return vcombine_s64(v64_from_32(c, d), v64_from_32(a, b)); -} - -SIMD_INLINE v128 v128_load_aligned(const void *p) { - return vreinterpretq_s64_u8(vld1q_u8((const uint8_t *)p)); -} - -SIMD_INLINE v128 v128_load_unaligned(const void *p) { - return v128_load_aligned(p); -} - -SIMD_INLINE void v128_store_aligned(void *p, v128 r) { - vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); -} - -SIMD_INLINE void v128_store_unaligned(void *p, v128 r) { - vst1q_u8((uint8_t *)p, vreinterpretq_u8_s64(r)); -} - -SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) { -// The following functions require an immediate. -// Some compilers will check this during optimisation, others wont. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - return c ? vreinterpretq_s64_s8( - vextq_s8(vreinterpretq_s8_s64(b), vreinterpretq_s8_s64(a), c)) - : b; -#else - return c < 8 ? 
v128_from_v64(v64_align(v128_low_v64(a), v128_high_v64(b), c), - v64_align(v128_high_v64(b), v128_low_v64(b), c)) - : v128_from_v64( - v64_align(v128_high_v64(a), v128_low_v64(a), c - 8), - v64_align(v128_low_v64(a), v128_high_v64(b), c - 8)); -#endif -} - -SIMD_INLINE v128 v128_zero(void) { return vreinterpretq_s64_u8(vdupq_n_u8(0)); } - -SIMD_INLINE v128 v128_ones(void) { - return vreinterpretq_s64_u8(vdupq_n_u8(-1)); -} - -SIMD_INLINE v128 v128_dup_8(uint8_t x) { - return vreinterpretq_s64_u8(vdupq_n_u8(x)); -} - -SIMD_INLINE v128 v128_dup_16(uint16_t x) { - return vreinterpretq_s64_u16(vdupq_n_u16(x)); -} - -SIMD_INLINE v128 v128_dup_32(uint32_t x) { - return vreinterpretq_s64_u32(vdupq_n_u32(x)); -} - -SIMD_INLINE v128 v128_dup_64(uint64_t x) { - return vreinterpretq_s64_u64(vdupq_n_u64(x)); -} - -SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) { - int16x8_t t1 = vmulq_s16( - vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b))))); - int16x8_t t2 = vmulq_s16( - vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b))))); -#if AOM_ARCH_AARCH64 - return vaddlvq_s16(t1) + vaddlvq_s16(t2); -#else - int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2))); - return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0); -#endif -} - -SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) { - return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) + - v64_dotp_s16(vget_low_s64(a), vget_low_s64(b)); -} - -SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) { - int64x2_t t = vpaddlq_s32( - vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); - return vget_lane_s64(vadd_s64(vget_high_s64(t), vget_low_s64(t)), 0); -} - -SIMD_INLINE uint64_t v128_hadd_u8(v128 x) { -#if AOM_ARCH_AARCH64 - return vaddlvq_u8(vreinterpretq_u8_s64(x)); -#else - uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x)))); - return vget_lane_s32( - vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); -#endif -} - -SIMD_INLINE v128 v128_padd_s16(v128 a) { - return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a))); -} - -SIMD_INLINE v128 v128_padd_u8(v128 a) { - return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a))); -} - -typedef struct { - sad64_internal hi, lo; -} sad128_internal; - -SIMD_INLINE sad128_internal v128_sad_u8_init(void) { - sad128_internal s; - s.hi = s.lo = vdupq_n_u16(0); - return s; -} - -/* Implementation dependent return value. Result must be finalised with - v128_sad_u8_sum(). - The result for more than 32 v128_sad_u8() calls is undefined. */ -SIMD_INLINE sad128_internal v128_sad_u8(sad128_internal s, v128 a, v128 b) { - sad128_internal r; - r.hi = v64_sad_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); - r.lo = v64_sad_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); - return r; -} - -SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) { -#if AOM_ARCH_AARCH64 - return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo); -#else - uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo))); - return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)), - 0); -#endif -} - -typedef struct { - ssd64_internal hi, lo; -} ssd128_internal; - -SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { - ssd128_internal s; - s.hi = s.lo = v64_ssd_u8_init(); - return s; -} - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_u8_sum(). 
*/ -SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) { - ssd128_internal r; - r.hi = v64_ssd_u8(s.hi, vget_high_s64(a), vget_high_s64(b)); - r.lo = v64_ssd_u8(s.lo, vget_low_s64(a), vget_low_s64(b)); - return r; -} - -SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) { - return (uint32_t)(v64_ssd_u8_sum(s.hi) + v64_ssd_u8_sum(s.lo)); -} - -SIMD_INLINE v128 v128_or(v128 x, v128 y) { return vorrq_s64(x, y); } - -SIMD_INLINE v128 v128_xor(v128 x, v128 y) { return veorq_s64(x, y); } - -SIMD_INLINE v128 v128_and(v128 x, v128 y) { return vandq_s64(x, y); } - -SIMD_INLINE v128 v128_andn(v128 x, v128 y) { return vbicq_s64(x, y); } - -SIMD_INLINE v128 v128_add_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_add_16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_sadd_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vqaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_add_32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y))); -} - -SIMD_INLINE v128 v128_add_64(v128 x, v128 y) { - return vreinterpretq_s64_u64( - vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y))); -} - -SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_sub_16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vqsubq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vqsubq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vqsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_ssub_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vqsubq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_sub_32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); } - -SIMD_INLINE v128 v128_abs_s16(v128 x) { - return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x))); -} - -SIMD_INLINE v128 v128_abs_s8(v128 x) { - return vreinterpretq_s64_s8(vabsq_s8(vreinterpretq_s8_s64(x))); -} - -SIMD_INLINE v128 v128_mul_s16(v64 a, v64 b) { - return vreinterpretq_s64_s32( - vmull_s16(vreinterpret_s16_s64(a), vreinterpret_s16_s64(b))); -} - -SIMD_INLINE v128 v128_mullo_s16(v128 a, v128 b) { - return vreinterpretq_s64_s16( - vmulq_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))); -} - -SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_s16(vuzp2q_s16( - 
vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), - vreinterpret_s16_s64(vget_low_s64(b)))), - vreinterpretq_s16_s32( - vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b))))); -#else - return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)), - v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) { - return vreinterpretq_s64_s32( - vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b))); -} - -SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { -#if AOM_ARCH_AARCH64 - int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)), - vreinterpret_s16_s64(vget_low_s64(b))); - int32x4_t t2 = - vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)); - return vreinterpretq_s64_s32(vpaddq_s32(t1, t2)); -#else - return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)), - v64_madd_s16(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { -#if AOM_ARCH_AARCH64 - int16x8_t t1 = vmulq_s16( - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))), - vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b)))); - int16x8_t t2 = vmulq_s16( - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))), - vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b)))); - return vreinterpretq_s64_s16( - vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2))); -#else - return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)), - v64_madd_us8(vget_low_s64(a), vget_low_s64(b))); -#endif -} - -SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vrhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_rdavg_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y))); -} - -SIMD_INLINE v128 v128_min_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vminq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_max_u8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vmaxq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_min_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE uint32_t v128_movemask_8(v128 a) { - a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0))); -#if AOM_ARCH_AARCH64 - uint8x16_t m = - vandq_u8(vreinterpretq_u8_s64(a), - vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); - return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8); -#else - uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8( - vandq_u8(vreinterpretq_u8_s64(a), - vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)))))); - int64x2_t s = vreinterpretq_s64_u64(m); - return v64_low_u32(v64_ziplo_8(vget_high_s64(s), vget_low_s64(s))); -#endif -} - -SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) { - c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0))); - return v128_or(v128_and(b, c), v128_andn(a, c)); -} - -SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) { - return vreinterpretq_s64_s8( - 
vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_min_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vminq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_max_s16(v128 x, v128 y) { - return vreinterpretq_s64_s16( - vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) { - return vreinterpretq_s64_s32( - vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) { - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpretq_s64_u8(vcombine_u8(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); - return vreinterpretq_s64_s16(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x)); - return vreinterpretq_s64_s16(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) { - uint16x4x2_t r = vzip_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpretq_s64_u16(vcombine_u16(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); - return vreinterpretq_s64_s32(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x)); - return vreinterpretq_s64_s32(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) { - uint32x2x2_t r = vzip_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)); - return vreinterpretq_s64_u32(vcombine_u32(r.val[0], r.val[1])); -} - -SIMD_INLINE v128 v128_ziplo_64(v128 a, v128 b) { - return v128_from_v64(vget_low_s64(a), vget_low_s64(b)); -} - -SIMD_INLINE v128 v128_ziphi_64(v128 a, v128 b) { - return v128_from_v64(vget_high_s64(a), vget_high_s64(b)); -} - -SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = 
vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x))); -#else - uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)); - return vreinterpretq_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); - return vreinterpretq_s64_u16(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u16( - vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x))); -#else - uint16x8x2_t r = - vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)); - return vreinterpretq_s64_u16(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - uint32x4x2_t r = - vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); - return vreinterpretq_s64_u32(r.val[0]); -#endif -} - -SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u32( - vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x))); -#else - uint32x4x2_t r = - vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)); - return vreinterpretq_s64_u32(r.val[1]); -#endif -} - -SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_u8_s16(v128 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_u8_s16(v128 a) { - return vreinterpretq_s64_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_unpack_s8_s16(v64 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(a))); -} - -SIMD_INLINE v128 v128_unpacklo_s8_s16(v128 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_s8_s16(v128 a) { - return vreinterpretq_s64_s16(vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(a))), - vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))), - vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))), - vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(b)))); -} - -SIMD_INLINE v128 v128_pack_s16_s8(v128 a, v128 b) { - return v128_from_v64( - vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(a))), - vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s64(b)))); -} - -SIMD_INLINE v128 v128_unpack_u16_s32(v64 a) { - return vreinterpretq_s64_u32(vmovl_u16(vreinterpret_u16_s64(a))); -} - -SIMD_INLINE v128 v128_unpack_s16_s32(v64 a) { - return vreinterpretq_s64_s32(vmovl_s16(vreinterpret_s16_s64(a))); -} - -SIMD_INLINE 
v128 v128_unpacklo_u16_s32(v128 a) { - return vreinterpretq_s64_u32( - vmovl_u16(vreinterpret_u16_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpacklo_s16_s32(v128 a) { - return vreinterpretq_s64_s32( - vmovl_s16(vreinterpret_s16_s64(vget_low_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_u16_s32(v128 a) { - return vreinterpretq_s64_u32( - vmovl_u16(vreinterpret_u16_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_unpackhi_s16_s32(v128 a) { - return vreinterpretq_s64_s32( - vmovl_s16(vreinterpret_s16_s64(vget_high_s64(a)))); -} - -SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) { -#if AOM_ARCH_AARCH64 - return vreinterpretq_s64_u8( - vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern))); -#else - uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)), - vget_high_u8(vreinterpretq_u8_s64(x)) } }; - uint8x8_t shuffle_hi = - vtbl2_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern))); - uint8x8_t shuffle_lo = - vtbl2_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern))); - return v128_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle_lo), 0)); -#endif -} - -SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vcgtq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vcltq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_8(v128 x, v128 y) { - return vreinterpretq_s64_u8( - vceqq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y))); -} - -SIMD_INLINE v128 v128_cmpgt_s16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vcgtq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vcltq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_16(v128 x, v128 y) { - return vreinterpretq_s64_u16( - vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y))); -} - -SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) { - return vreinterpretq_s64_u32( - vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y))); -} - -SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() - : vreinterpretq_s64_u8(vshlq_u8(vreinterpretq_u8_s64(a), - vdupq_n_s8((int8_t)c))); -} - -SIMD_INLINE v128 v128_shr_u8(v128 a, unsigned int c) { - return (c > 7) ? v128_zero() - : vreinterpretq_s64_u8(vshlq_u8(vreinterpretq_u8_s64(a), - vdupq_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v128 v128_shr_s8(v128 a, unsigned int c) { - return (c > 7) ? v128_ones() - : vreinterpretq_s64_s8(vshlq_s8(vreinterpretq_s8_s64(a), - vdupq_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v128 v128_shl_16(v128 a, unsigned int c) { - return (c > 15) ? v128_zero() - : vreinterpretq_s64_u16(vshlq_u16(vreinterpretq_u16_s64(a), - vdupq_n_s16((int16_t)c))); -} - -SIMD_INLINE v128 v128_shr_u16(v128 a, unsigned int c) { - return (c > 15) ? v128_zero() - : vreinterpretq_s64_u16(vshlq_u16(vreinterpretq_u16_s64(a), - vdupq_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v128 v128_shr_s16(v128 a, unsigned int c) { - return (c > 15) ? 
v128_ones() - : vreinterpretq_s64_s16(vshlq_s16(vreinterpretq_s16_s64(a), - vdupq_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v128 v128_shl_32(v128 a, unsigned int c) { - return (c > 31) ? v128_zero() - : vreinterpretq_s64_u32(vshlq_u32(vreinterpretq_u32_s64(a), - vdupq_n_s32((int32_t)c))); -} - -SIMD_INLINE v128 v128_shr_u32(v128 a, unsigned int c) { - return (c > 31) ? v128_zero() - : vreinterpretq_s64_u32(vshlq_u32(vreinterpretq_u32_s64(a), - vdupq_n_s32(-(int32_t)c))); -} - -SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) { - return (c > 31) ? v128_ones() - : vreinterpretq_s64_s32(vshlq_s32(vreinterpretq_s32_s64(a), - vdupq_n_s32(-(int32_t)c))); -} - -SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) { - return (c > 63) ? v128_zero() - : vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a), - vdupq_n_s64((int64_t)c))); -} - -SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) { - return (c > 63) ? v128_zero() - : vreinterpretq_s64_u64(vshlq_u64(vreinterpretq_u64_s64(a), - vdupq_n_s64(-(int64_t)c))); -} - -SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) { - return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-(int64_t)c)); -} - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - return n < 8 - ? v128_from_64( - (uint64_t)vorr_u64( - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - n * 8), - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - (8 - n) * 8)), - (uint64_t)vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - n * 8)) - : (n == 8 ? v128_from_64( - (uint64_t)vreinterpret_u64_s64(vget_low_s64(a)), 0) - : v128_from_64((uint64_t)vshl_n_u64( - vreinterpret_u64_s64(vget_low_s64(a)), - (n - 8) * 8), - 0)); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - return n == 0 - ? a - : (n < 8 - ? v128_from_64( - (uint64_t)vshr_n_u64( - vreinterpret_u64_s64(vget_high_s64(a)), n * 8), - (uint64_t)vorr_u64( - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a)), - n * 8), - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a)), - (8 - n) * 8))) - : (n == 8 ? v128_from_64(0, (uint64_t)vreinterpret_u64_s64( - vget_high_s64(a))) - : v128_from_64(0, (uint64_t)vshr_n_u64( - vreinterpret_u64_s64( - vget_high_s64(a)), - (n - 8) * 8)))); -} - -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u8(vshlq_n_u8(vreinterpretq_u8_s64(a), c)) : a; -} - -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u8(vshrq_n_u8(vreinterpretq_u8_s64(a), c)) : a; -} - -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_s8(vshrq_n_s8(vreinterpretq_s8_s64(a), c)) : a; -} - -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u16(vshlq_n_u16(vreinterpretq_u16_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u16(vshrq_n_u16(vreinterpretq_u16_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_s16(vshrq_n_s16(vreinterpretq_s16_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u32(vshlq_n_u32(vreinterpretq_u32_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u32(vshrq_n_u32(vreinterpretq_u32_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { - return c ? 
vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { - return c ? vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c)) - : a; -} - -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { - return c ? vshrq_n_s64(a, c) : a; -} - -#else - -SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) { - if (n < 8) - return v128_from_v64(v64_or(v64_shl_n_byte(v128_high_v64(a), n), - v64_shr_n_byte(v128_low_v64(a), 8 - n)), - v64_shl_n_byte(v128_low_v64(a), n)); - else - return v128_from_v64(v64_shl_n_byte(v128_low_v64(a), n - 8), v64_zero()); -} - -SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) { - if (n < 8) - return v128_from_v64(v64_shr_n_byte(v128_high_v64(a), n), - v64_or(v64_shr_n_byte(v128_low_v64(a), n), - v64_shl_n_byte(v128_high_v64(a), 8 - n))); - else - return v128_from_v64(v64_zero(), v64_shr_n_byte(v128_high_v64(a), n - 8)); -} - -SIMD_INLINE v128 v128_shl_n_8(v128 a, unsigned int c) { - return v128_shl_8(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int c) { - return v128_shr_u8(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int c) { - return v128_shr_s8(a, c); -} - -SIMD_INLINE v128 v128_shl_n_16(v128 a, unsigned int c) { - return v128_shl_16(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u16(v128 a, unsigned int c) { - return v128_shr_u16(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s16(v128 a, unsigned int c) { - return v128_shr_s16(a, c); -} - -SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int c) { - return v128_shl_32(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int c) { - return v128_shr_u32(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int c) { - return v128_shr_s32(a, c); -} - -SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) { - return v128_shl_64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) { - return v128_shr_u64(a, c); -} - -SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) { - return v128_shr_s64(a, c); -} - -#endif - -typedef uint32x4_t sad128_internal_u16; - -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { - return vdupq_n_u32(0); -} - -/* Implementation dependent return value. Result must be finalised with - * v128_sad_u16_sum(). */ -SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a, - v128 b) { - return vaddq_u32( - s, vpaddlq_u16(vsubq_u16( - vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)), - vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b))))); -} - -SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) { - uint64x2_t t = vpaddlq_u32(s); - return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t)), - 0); -} - -typedef v128 ssd128_internal_s16; -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } - -/* Implementation dependent return value. Result must be finalised with - * v128_ssd_s16_sum(). 
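The v128_shl_n_*/v128_shr_n_* helpers above exist in two flavours because the Neon immediate-shift intrinsics (vshlq_n_*, vshrq_n_*) require the shift count to be a compile-time constant; the fallback branch simply re-routes the _n_ names to the variable-shift forms. A small sketch of that distinction in plain Neon (arm_neon.h), with illustrative function names:

    #include <arm_neon.h>

    /* Immediate form: the count must be a constant expression (here, 2). */
    static uint16x8_t shr2_u16_imm(uint16x8_t v) { return vshrq_n_u16(v, 2); }

    /* Variable form: a run-time count is negated and passed to vshlq_u16(),
     * since a left shift by a negative amount is a right shift. This is the
     * construction used by v128_shr_u16() above. */
    static uint16x8_t shr_u16_var(uint16x8_t v, int c) {
      return vshlq_u16(v, vdupq_n_s16((int16_t)-c));
    }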
*/ -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a, - v128 b) { - v128 d = v128_sub_16(a, b); - d = v128_madd_s16(d, d); - return v128_add_64( - s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d)))); -} - -SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) { - return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s)); -} - -#endif // AOM_AOM_DSP_SIMD_V128_INTRINSICS_ARM_H_ diff -Nru aom-3.8.2/aom_dsp/simd/v128_intrinsics_x86.h aom-3.9.0/aom_dsp/simd/v128_intrinsics_x86.h --- aom-3.8.2/aom_dsp/simd/v128_intrinsics_x86.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/simd/v128_intrinsics_x86.h 2024-05-07 19:57:02.535000000 +0000 @@ -79,7 +79,7 @@ #endif #endif -SIMD_INLINE v128 v128_zero() { return _mm_setzero_si128(); } +SIMD_INLINE v128 v128_zero(void) { return _mm_setzero_si128(); } SIMD_INLINE v128 v128_dup_8(uint8_t x) { return _mm_set1_epi8((char)x); } @@ -345,7 +345,9 @@ typedef v128 sad128_internal; -SIMD_INLINE sad128_internal v128_sad_u8_init() { return _mm_setzero_si128(); } +SIMD_INLINE sad128_internal v128_sad_u8_init(void) { + return _mm_setzero_si128(); +} /* Implementation dependent return value. Result must be finalised with v128_sad_sum(). @@ -360,7 +362,7 @@ typedef int32_t ssd128_internal; -SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; } +SIMD_INLINE ssd128_internal v128_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v128_ssd_sum(). */ @@ -612,7 +614,7 @@ typedef v128 sad128_internal_u16; -SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); } +SIMD_INLINE sad128_internal_u16 v128_sad_u16_init(void) { return v128_zero(); } /* Implementation dependent return value. Result must be finalised with * v128_sad_u16_sum(). */ @@ -638,7 +640,7 @@ typedef v128 ssd128_internal_s16; -SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); } +SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init(void) { return v128_zero(); } /* Implementation dependent return value. Result must be finalised with * v128_ssd_s16_sum(). */ diff -Nru aom-3.8.2/aom_dsp/simd/v256_intrinsics_arm.h aom-3.9.0/aom_dsp/simd/v256_intrinsics_arm.h --- aom-3.8.2/aom_dsp/simd/v256_intrinsics_arm.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/simd/v256_intrinsics_arm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
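The x86 and generic C header hunks above change empty parameter lists such as v128_zero() to v128_zero(void). In C17 and earlier an empty list declares a function taking an unspecified number of arguments, so only the (void) form lets the compiler diagnose stray arguments (C23 finally makes the two spellings equivalent). A minimal illustration, not taken from the library:

    /* C, not C++ */
    int f();     /* unspecified parameters: f(1, 2) is accepted silently */
    int g(void); /* no parameters: g(1) is a constraint violation        */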
- */ - -#ifndef AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ - -#include "aom_dsp/simd/v256_intrinsics_v128.h" - -#endif // AOM_AOM_DSP_SIMD_V256_INTRINSICS_ARM_H_ diff -Nru aom-3.8.2/aom_dsp/simd/v256_intrinsics_c.h aom-3.9.0/aom_dsp/simd/v256_intrinsics_c.h --- aom-3.8.2/aom_dsp/simd/v256_intrinsics_c.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/simd/v256_intrinsics_c.h 2024-05-07 19:57:02.537000000 +0000 @@ -95,7 +95,7 @@ c_v256_store_unaligned(p, a); } -SIMD_INLINE c_v256 c_v256_zero() { +SIMD_INLINE c_v256 c_v256_zero(void) { c_v256 t; t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = 0; return t; @@ -176,7 +176,7 @@ typedef uint32_t c_ssd256_internal; -SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init() { return 0; } +SIMD_INLINE c_ssd256_internal c_v256_ssd_u8_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_u8_sum(). */ @@ -929,7 +929,7 @@ typedef uint32_t c_sad256_internal_u16; -SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; } +SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with v256_sad_u16_sum(). */ @@ -945,7 +945,7 @@ typedef uint64_t c_ssd256_internal_s16; -SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; } +SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init(void) { return 0; } /* Implementation dependent return value. Result must be finalised with * v256_ssd_s16_sum(). */ diff -Nru aom-3.8.2/aom_dsp/simd/v256_intrinsics_v128.h aom-3.9.0/aom_dsp/simd/v256_intrinsics_v128.h --- aom-3.8.2/aom_dsp/simd/v256_intrinsics_v128.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/simd/v256_intrinsics_v128.h 2024-05-07 19:57:02.538000000 +0000 @@ -15,20 +15,18 @@ #include "config/aom_config.h" #if HAVE_NEON -#include "aom_dsp/simd/v128_intrinsics_arm.h" -#elif HAVE_SSE2 +#error "Do not use this file for Neon" +#endif + +#if HAVE_SSE2 #include "aom_dsp/simd/v128_intrinsics_x86.h" #else #include "aom_dsp/simd/v128_intrinsics.h" #endif -#if HAVE_NEON -typedef int64x2x2_t v256; -#else typedef struct { v128 val[2]; } v256; -#endif SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); } @@ -615,33 +613,6 @@ } SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) { -#if HAVE_NEON -#if AOM_ARCH_AARCH64 - uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]), - vreinterpretq_u8_s64(x.val[1]) } }; - return v256_from_v128( - vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), - vreinterpretq_s64_u8( - vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); -#else - uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), - vget_high_u8(vreinterpretq_u8_s64(x.val[0])), - vget_low_u8(vreinterpretq_u8_s64(x.val[1])), - vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; - uint8x8_t shuffle1_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1]))); - uint8x8_t shuffle1_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1]))); - uint8x8_t shuffle0_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0]))); - uint8x8_t shuffle0_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))); - return v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0)); -#endif -#else v128 c16 = 
v128_dup_8(16); v128 maskhi = v128_cmplt_s8(pattern.val[1], c16); v128 masklo = v128_cmplt_s8(pattern.val[0], c16); @@ -650,56 +621,9 @@ v128_shuffle_8(x.val[0], pattern.val[1]), maskhi), v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)), v128_shuffle_8(x.val[0], pattern.val[0]), masklo)); -#endif } SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) { -#if HAVE_NEON -#if AOM_ARCH_AARCH64 - uint8x16x4_t p = { { - vreinterpretq_u8_s64(y.val[0]), - vreinterpretq_u8_s64(y.val[1]), - vreinterpretq_u8_s64(x.val[0]), - vreinterpretq_u8_s64(x.val[1]), - } }; - return v256_from_v128( - vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))), - vreinterpretq_s64_u8( - vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0])))); -#else - v256 c32 = v256_dup_8(32); - v256 p32 = v256_sub_8(pattern, c32); - uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])), - vget_high_u8(vreinterpretq_u8_s64(x.val[0])), - vget_low_u8(vreinterpretq_u8_s64(x.val[1])), - vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } }; - uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])), - vget_high_u8(vreinterpretq_u8_s64(y.val[0])), - vget_low_u8(vreinterpretq_u8_s64(y.val[1])), - vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } }; - uint8x8_t shuffle1_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[1]))); - uint8x8_t shuffle1_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[1]))); - uint8x8_t shuffle0_hi = - vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(p32.val[0]))); - uint8x8_t shuffle0_lo = - vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(p32.val[0]))); - v256 r1 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0)); - shuffle1_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1]))); - shuffle1_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1]))); - shuffle0_hi = vtbl4_u8(q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0]))); - shuffle0_lo = vtbl4_u8(q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0]))); - v256 r2 = v256_from_64(vget_lane_u64(vreinterpret_u64_u8(shuffle1_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle1_lo), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_hi), 0), - vget_lane_u64(vreinterpret_u64_u8(shuffle0_lo), 0)); - return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32)); -#endif -#else v128 c16 = v128_dup_8(16); v128 c32 = v128_dup_8(32); v128 c48 = v128_dup_8(48); @@ -720,7 +644,6 @@ v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)), v128_shuffle_8(y.val[0], pattern.val[0]), masklo16)); return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern)); -#endif } SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) { diff -Nru aom-3.8.2/aom_dsp/simd/v64_intrinsics_arm.h aom-3.9.0/aom_dsp/simd/v64_intrinsics_arm.h --- aom-3.8.2/aom_dsp/simd/v64_intrinsics_arm.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/simd/v64_intrinsics_arm.h 1970-01-01 00:00:00.000000000 +0000 @@ -1,679 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. 
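With the Neon-specific paths deleted above, v256_shuffle_8() is always built from two 16-byte v128 shuffles blended on (pattern < 16), and v256_wideshuffle_8() extends the same idea to a 64-byte source. As a scalar reference for what the byte shuffle computes (indices are assumed to lie in 0..31; names are illustrative):

    #include <stdint.h>

    /* Each output byte selects one input byte by index. */
    static void shuffle_bytes_32(uint8_t dst[32], const uint8_t src[32],
                                 const uint8_t pattern[32]) {
      for (int i = 0; i < 32; ++i) dst[i] = src[pattern[i]];
    }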
If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#ifndef AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ -#define AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ - -#include -#include - -#include "config/aom_config.h" - -#include "aom_dsp/simd/v64_intrinsics_arm.h" -#include "aom_ports/arm.h" - -#ifdef AOM_INCOMPATIBLE_GCC -#error Incompatible gcc -#endif - -typedef int64x1_t v64; - -SIMD_INLINE uint32_t v64_low_u32(v64 a) { - return vget_lane_u32(vreinterpret_u32_s64(a), 0); -} - -SIMD_INLINE uint32_t v64_high_u32(v64 a) { - return vget_lane_u32(vreinterpret_u32_s64(a), 1); -} - -SIMD_INLINE int32_t v64_low_s32(v64 a) { - return vget_lane_s32(vreinterpret_s32_s64(a), 0); -} - -SIMD_INLINE int32_t v64_high_s32(v64 a) { - return vget_lane_s32(vreinterpret_s32_s64(a), 1); -} - -SIMD_INLINE v64 v64_from_16(uint16_t a, uint16_t b, uint16_t c, uint16_t d) { - return vcreate_s64((uint64_t)a << 48 | (uint64_t)b << 32 | (uint64_t)c << 16 | - d); -} - -SIMD_INLINE v64 v64_from_32(uint32_t x, uint32_t y) { - return vcreate_s64((uint64_t)x << 32 | y); -} - -SIMD_INLINE v64 v64_from_64(uint64_t x) { return vcreate_s64(x); } - -SIMD_INLINE uint64_t v64_u64(v64 x) { return (uint64_t)vget_lane_s64(x, 0); } - -SIMD_INLINE uint32_t u32_load_aligned(const void *p) { - return *((uint32_t *)p); -} - -SIMD_INLINE uint32_t u32_load_unaligned(const void *p) { - return vget_lane_u32(vreinterpret_u32_u8(vld1_u8((const uint8_t *)p)), 0); -} - -SIMD_INLINE void u32_store_aligned(void *p, uint32_t a) { - *((uint32_t *)p) = a; -} - -SIMD_INLINE void u32_store_unaligned(void *p, uint32_t a) { memcpy(p, &a, 4); } - -SIMD_INLINE v64 v64_load_aligned(const void *p) { - return vreinterpret_s64_u8(vld1_u8((const uint8_t *)p)); -} - -SIMD_INLINE v64 v64_load_unaligned(const void *p) { - return v64_load_aligned(p); -} - -SIMD_INLINE void v64_store_aligned(void *p, v64 r) { - vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); -} - -SIMD_INLINE void v64_store_unaligned(void *p, v64 r) { - vst1_u8((uint8_t *)p, vreinterpret_u8_s64(r)); -} - -// The following function requires an immediate. -// Some compilers will check this if it's optimising, others wont. -SIMD_INLINE v64 v64_align(v64 a, v64 b, unsigned int c) { -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - return c ? vreinterpret_s64_s8( - vext_s8(vreinterpret_s8_s64(b), vreinterpret_s8_s64(a), c)) - : b; -#else - return c ? 
v64_from_64(((uint64_t)vget_lane_s64(b, 0) >> c * 8) | - ((uint64_t)vget_lane_s64(a, 0) << (8 - c) * 8)) - : b; -#endif -} - -SIMD_INLINE v64 v64_zero(void) { return vreinterpret_s64_u8(vdup_n_u8(0)); } - -SIMD_INLINE v64 v64_dup_8(uint8_t x) { - return vreinterpret_s64_u8(vdup_n_u8(x)); -} - -SIMD_INLINE v64 v64_dup_16(uint16_t x) { - return vreinterpret_s64_u16(vdup_n_u16(x)); -} - -SIMD_INLINE v64 v64_dup_32(uint32_t x) { - return vreinterpret_s64_u32(vdup_n_u32(x)); -} - -SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) { - int16x8_t t = - vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)), - vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y)))); -#if AOM_ARCH_AARCH64 - return vaddlvq_s16(t); -#else - int64x2_t r = vpaddlq_s32(vpaddlq_s16(t)); - return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0); -#endif -} - -SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vaddlvq_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -#else - int64x2_t r = - vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); - return vget_lane_s64(vadd_s64(vget_high_s64(r), vget_low_s64(r)), 0); -#endif -} - -SIMD_INLINE uint64_t v64_hadd_u8(v64 x) { -#if AOM_ARCH_AARCH64 - return vaddlv_u8(vreinterpret_u8_s64(x)); -#else - return vget_lane_u64( - vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x)))), 0); -#endif -} - -SIMD_INLINE int64_t v64_hadd_s16(v64 a) { - return vget_lane_s64(vpaddl_s32(vpaddl_s16(vreinterpret_s16_s64(a))), 0); -} - -typedef uint16x8_t sad64_internal; - -SIMD_INLINE sad64_internal v64_sad_u8_init(void) { return vdupq_n_u16(0); } - -// Implementation dependent return value. Result must be finalised with -// v64_sad_u8_sum(). -SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) { - return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); -} - -SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) { -#if AOM_ARCH_AARCH64 - return vaddlvq_u16(s); -#else - uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s)); - return (uint32_t)vget_lane_u64(vadd_u64(vget_high_u64(r), vget_low_u64(r)), - 0); -#endif -} - -typedef uint32x4_t ssd64_internal; - -SIMD_INLINE ssd64_internal v64_ssd_u8_init(void) { return vdupq_n_u32(0); } - -// Implementation dependent return value. Result must be finalised with -// v64_ssd_u8_sum(). 
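The v64 SAD/SSD accumulators above keep the same opaque init/accumulate/finalise contract as their v128 counterparts. As a scalar reference, one v64_ssd_u8() step followed by v64_ssd_u8_sum() reduces to the sum of squared byte differences over eight lanes; a sketch with an illustrative name:

    #include <stdint.h>

    /* Sum of squared differences over one 8-byte vector pair. */
    static uint32_t ssd_u8_8(const uint8_t *a, const uint8_t *b) {
      uint32_t ssd = 0;
      for (int i = 0; i < 8; ++i) {
        const int d = a[i] - b[i];
        ssd += (uint32_t)(d * d);
      }
      return ssd;
    }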
-SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) { - uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b)); - return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t))); -} - -SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) { -#if AOM_ARCH_AARCH64 - return vaddvq_u32(s); -#else - uint64x2_t t = vpaddlq_u32(s); - return vget_lane_u32( - vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0); -#endif -} - -SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); } - -SIMD_INLINE v64 v64_xor(v64 x, v64 y) { return veor_s64(x, y); } - -SIMD_INLINE v64 v64_and(v64 x, v64 y) { return vand_s64(x, y); } - -SIMD_INLINE v64 v64_andn(v64 x, v64 y) { return vbic_s64(x, y); } - -SIMD_INLINE v64 v64_add_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_add_16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_sadd_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vqadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_add_32(v64 x, v64 y) { - return vreinterpret_s64_u32( - vadd_u32(vreinterpret_u32_s64(x), vreinterpret_u32_s64(y))); -} - -SIMD_INLINE v64 v64_sub_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_sub_16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vqsub_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vqsub_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vqsub_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_ssub_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vqsub_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_sub_32(v64 x, v64 y) { - return vreinterpret_s64_s32( - vsub_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); -} - -SIMD_INLINE v64 v64_abs_s16(v64 x) { - return vreinterpret_s64_s16(vabs_s16(vreinterpret_s16_s64(x))); -} - -SIMD_INLINE v64 v64_abs_s8(v64 x) { - return vreinterpret_s64_s8(vabs_s8(vreinterpret_s8_s64(x))); -} - -SIMD_INLINE v64 v64_mullo_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmul_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - int16x8_t t = vreinterpretq_s16_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); - return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t))); -#else - return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32( - vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16))); -#endif -} - -SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) { - return vreinterpret_s64_s32( - vmul_s32(vreinterpret_s32_s64(x), vreinterpret_s32_s64(y))); -} - -SIMD_INLINE v64 v64_madd_s16(v64 x, v64 y) { - int32x4_t t = 
vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)); - return vreinterpret_s64_s32( - vpadd_s32(vreinterpret_s32_s64(vget_low_s64(vreinterpretq_s64_s32(t))), - vreinterpret_s32_s64(vget_high_s64(vreinterpretq_s64_s32(t))))); -} - -SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) { - int16x8_t t = - vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))), - vmovl_s8(vreinterpret_s8_s64(y))); - return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t))); -} - -SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vrhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_rdavg_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y))); -} - -SIMD_INLINE v64 v64_max_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vmax_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_min_u8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vmin_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_max_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vmax_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_min_s8(v64 x, v64 y) { - return vreinterpret_s64_s8( - vmin_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_max_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmax_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_min_s16(v64 x, v64 y) { - return vreinterpret_s64_s16( - vmin_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); - return vreinterpret_s64_s16(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x)); - return vreinterpret_s64_s16(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u32( - vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); -#else - int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); - return vreinterpret_s64_s32(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u32( - vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x))); -#else - 
int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)); - return vreinterpret_s64_s32(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) { - return vreinterpret_s64_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpackhi_u8_s16(v64 a) { - return vreinterpret_s64_u16(vget_high_u16(vmovl_u8(vreinterpret_u8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpacklo_s8_s16(v64 a) { - return vreinterpret_s64_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_s64(a)))); -} - -SIMD_INLINE v64 v64_unpackhi_s8_s16(v64 a) { - return vreinterpret_s64_s16(vget_high_s16(vmovl_s8(vreinterpret_s8_s64(a)))); -} - -SIMD_INLINE v64 v64_pack_s32_s16(v64 x, v64 y) { - return vreinterpret_s64_s16(vqmovn_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); -} - -SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) { - return vreinterpret_s64_u16(vqmovun_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))); -} - -SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) { - return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); -} - -SIMD_INLINE v64 v64_pack_s16_s8(v64 x, v64 y) { - return vreinterpret_s64_s8(vqmovn_s16(vreinterpretq_s16_s32( - vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))))); -} - -SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u8( - vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x))); -#else - uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)); - return vreinterpret_s64_u8(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpret_s64_u16(r.val[0]); -#endif -} - -SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) { -#if AOM_ARCH_AARCH64 - return vreinterpret_s64_u16( - vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x))); -#else - uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)); - return vreinterpret_s64_u16(r.val[1]); -#endif -} - -SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) { - return vreinterpret_s64_s32(vget_low_s32(vmovl_s16(vreinterpret_s16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpacklo_u16_s32(v64 x) { - return vreinterpret_s64_u32(vget_low_u32(vmovl_u16(vreinterpret_u16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpackhi_s16_s32(v64 x) { - return vreinterpret_s64_s32( - vget_high_s32(vmovl_s16(vreinterpret_s16_s64(x)))); -} - -SIMD_INLINE v64 v64_unpackhi_u16_s32(v64 x) { - return vreinterpret_s64_u32( - vget_high_u32(vmovl_u16(vreinterpret_u16_s64(x)))); -} - -SIMD_INLINE v64 v64_shuffle_8(v64 x, v64 pattern) { - return vreinterpret_s64_u8( - vtbl1_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(pattern))); -} - -SIMD_INLINE v64 v64_cmpgt_s8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vcgt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 v64_cmplt_s8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vclt_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y))); -} - -SIMD_INLINE v64 
v64_cmpeq_8(v64 x, v64 y) { - return vreinterpret_s64_u8( - vceq_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y))); -} - -SIMD_INLINE v64 v64_cmpgt_s16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vcgt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_cmplt_s16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vclt_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_cmpeq_16(v64 x, v64 y) { - return vreinterpret_s64_u16( - vceq_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y))); -} - -SIMD_INLINE v64 v64_shl_8(v64 a, unsigned int c) { - return vreinterpret_s64_u8( - vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8((int8_t)c))); -} - -SIMD_INLINE v64 v64_shr_u8(v64 a, unsigned int c) { - return vreinterpret_s64_u8( - vshl_u8(vreinterpret_u8_s64(a), vdup_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v64 v64_shr_s8(v64 a, unsigned int c) { - return vreinterpret_s64_s8( - vshl_s8(vreinterpret_s8_s64(a), vdup_n_s8(-(int8_t)c))); -} - -SIMD_INLINE v64 v64_shl_16(v64 a, unsigned int c) { - return vreinterpret_s64_u16( - vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16((int16_t)c))); -} - -SIMD_INLINE v64 v64_shr_u16(v64 a, unsigned int c) { - return vreinterpret_s64_u16( - vshl_u16(vreinterpret_u16_s64(a), vdup_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v64 v64_shr_s16(v64 a, unsigned int c) { - return vreinterpret_s64_s16( - vshl_s16(vreinterpret_s16_s64(a), vdup_n_s16(-(int16_t)c))); -} - -SIMD_INLINE v64 v64_shl_32(v64 a, unsigned int c) { - return vreinterpret_s64_u32( - vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32((int32_t)c))); -} - -SIMD_INLINE v64 v64_shr_u32(v64 a, unsigned int c) { - return vreinterpret_s64_u32( - vshl_u32(vreinterpret_u32_s64(a), vdup_n_s32(-(int32_t)c))); -} - -SIMD_INLINE v64 v64_shr_s32(v64 a, unsigned int c) { - return vreinterpret_s64_s32( - vshl_s32(vreinterpret_s32_s64(a), vdup_n_s32(-(int32_t)c))); -} - -// The following functions require an immediate. -// Some compilers will check this during optimisation, others wont. -#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__) - -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { - return vshl_n_s64(a, c * 8); -} - -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { - return c ? (v64)vshr_n_u64(vreinterpret_u64_s64(a), c * 8) : a; -} - -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u8(vshl_n_u8(vreinterpret_u8_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u8(vshr_n_u8(vreinterpret_u8_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { - return c ? vreinterpret_s64_s8(vshr_n_s8(vreinterpret_s8_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u16(vshl_n_u16(vreinterpret_u16_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u16(vshr_n_u16(vreinterpret_u16_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return c ? vreinterpret_s64_s16(vshr_n_s16(vreinterpret_s16_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u32(vshl_n_u32(vreinterpret_u32_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return c ? vreinterpret_s64_u32(vshr_n_u32(vreinterpret_u32_s64(a), c)) : a; -} - -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return c ? 
vreinterpret_s64_s32(vshr_n_s32(vreinterpret_s32_s64(a), c)) : a; -} - -#else - -SIMD_INLINE v64 v64_shl_n_byte(v64 a, unsigned int c) { - return v64_from_64(v64_u64(a) << c * 8); -} - -SIMD_INLINE v64 v64_shr_n_byte(v64 a, unsigned int c) { - return v64_from_64(v64_u64(a) >> c * 8); -} - -SIMD_INLINE v64 v64_shl_n_8(v64 a, unsigned int c) { return v64_shl_8(a, c); } - -SIMD_INLINE v64 v64_shr_n_u8(v64 a, unsigned int c) { return v64_shr_u8(a, c); } - -SIMD_INLINE v64 v64_shr_n_s8(v64 a, unsigned int c) { return v64_shr_s8(a, c); } - -SIMD_INLINE v64 v64_shl_n_16(v64 a, unsigned int c) { return v64_shl_16(a, c); } - -SIMD_INLINE v64 v64_shr_n_u16(v64 a, unsigned int c) { - return v64_shr_u16(a, c); -} - -SIMD_INLINE v64 v64_shr_n_s16(v64 a, unsigned int c) { - return v64_shr_s16(a, c); -} - -SIMD_INLINE v64 v64_shl_n_32(v64 a, unsigned int c) { return v64_shl_32(a, c); } - -SIMD_INLINE v64 v64_shr_n_u32(v64 a, unsigned int c) { - return v64_shr_u32(a, c); -} - -SIMD_INLINE v64 v64_shr_n_s32(v64 a, unsigned int c) { - return v64_shr_s32(a, c); -} - -#endif - -#endif // AOM_AOM_DSP_SIMD_V64_INTRINSICS_ARM_H_ diff -Nru aom-3.8.2/aom_dsp/variance.c aom-3.9.0/aom_dsp/variance.c --- aom-3.8.2/aom_dsp/variance.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/variance.c 2024-05-07 19:57:02.545000000 +0000 @@ -10,7 +10,6 @@ */ #include #include -#include #include "config/aom_config.h" #include "config/aom_dsp_rtcd.h" @@ -70,12 +69,10 @@ // taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is // applied horizontally (pixel_step = 1) or vertically (pixel_step = stride). // It defines the offset required to move from one input to the next. -void aom_var_filter_block2d_bil_first_pass_c(const uint8_t *a, uint16_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_first_pass_c( + const uint8_t *a, uint16_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -100,12 +97,10 @@ // filter is applied horizontally (pixel_step = 1) or vertically // (pixel_step = stride). It defines the offset required to move from one input // to the next. Output is 8-bit. 
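The two passes renamed above implement the 2-tap bilinear filter the comments describe: each output is a weighted average of two inputs pixel_step apart, with the taps summing to FILTER_WEIGHT and the product rounded back down by FILTER_BITS (7 in libaom, so the taps sum to 128). A scalar sketch of one first-pass output under that assumption; the helper name is illustrative:

    #include <stdint.h>

    #define BIL_FILTER_BITS 7 /* assumed: taps sum to 1 << 7 == 128 */

    /* a points at the current input sample; pixel_step is 1 for horizontal
     * filtering or the source stride for vertical filtering. */
    static uint16_t bil_first_pass_one(const uint8_t *a, int pixel_step,
                                       const uint8_t *filter) {
      const int sum = (int)a[0] * filter[0] + (int)a[pixel_step] * filter[1];
      return (uint16_t)((sum + (1 << (BIL_FILTER_BITS - 1))) >> BIL_FILTER_BITS);
    }

The second pass applies the same arithmetic to the 16-bit intermediate rows and narrows the result back to 8 bits, as noted in the comment above.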
-void aom_var_filter_block2d_bil_second_pass_c(const uint16_t *a, uint8_t *b, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - const uint8_t *filter) { +static void var_filter_block2d_bil_second_pass_c( + const uint16_t *a, uint8_t *b, unsigned int src_pixels_per_line, + unsigned int pixel_step, unsigned int output_height, + unsigned int output_width, const uint8_t *filter) { unsigned int i, j; for (i = 0; i < output_height; ++i) { @@ -129,19 +124,19 @@ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \ } -#define SUBPIX_VAR(W, H) \ - uint32_t aom_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *a, int a_stride, int xoffset, int yoffset, \ - const uint8_t *b, int b_stride, uint32_t *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ +#define SUBPIX_VAR(W, H) \ + uint32_t aom_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *a, int a_stride, int xoffset, int yoffset, \ + const uint8_t *b, int b_stride, uint32_t *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \ } #define SUBPIX_AVG_VAR(W, H) \ @@ -153,10 +148,10 @@ uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ \ aom_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \ \ @@ -170,10 +165,10 @@ uint8_t temp2[H * W]; \ DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ \ - aom_var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ - bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ + var_filter_block2d_bil_first_pass_c(a, fdata3, a_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ \ aom_dist_wtd_comp_avg_pred(temp3, second_pred, W, H, temp2, W, jcp_param); \ \ @@ -730,24 +725,24 @@ } } -#define MASK_SUBPIX_VAR(W, H) \ - unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *src, int src_stride, int xoffset, int yoffset, \ - const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ - const uint8_t *msk, int msk_stride, int invert_mask, \ - unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ - \ - aom_var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - 
bilinear_filters_2t[yoffset]); \ - \ - aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ - invert_mask); \ - return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ +#define MASK_SUBPIX_VAR(W, H) \ + unsigned int aom_masked_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *src, int src_stride, int xoffset, int yoffset, \ + const uint8_t *ref, int ref_stride, const uint8_t *second_pred, \ + const uint8_t *msk, int msk_stride, int invert_mask, \ + unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \ + \ + var_filter_block2d_bil_first_pass_c(src, fdata3, src_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + aom_comp_mask_pred_c(temp3, second_pred, W, H, temp2, W, msk, msk_stride, \ + invert_mask); \ + return aom_variance##W##x##H##_c(temp3, W, ref, ref_stride, sse); \ } MASK_SUBPIX_VAR(4, 4) @@ -924,19 +919,19 @@ return *sse - (unsigned int)(((int64_t)sum * sum) / (W * H)); \ } -#define OBMC_SUBPIX_VAR(W, H) \ - unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ - const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ - const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ - uint16_t fdata3[(H + 1) * W]; \ - uint8_t temp2[H * W]; \ - \ - aom_var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, \ - W, bilinear_filters_2t[xoffset]); \ - aom_var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ - bilinear_filters_2t[yoffset]); \ - \ - return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ +#define OBMC_SUBPIX_VAR(W, H) \ + unsigned int aom_obmc_sub_pixel_variance##W##x##H##_c( \ + const uint8_t *pre, int pre_stride, int xoffset, int yoffset, \ + const int32_t *wsrc, const int32_t *mask, unsigned int *sse) { \ + uint16_t fdata3[(H + 1) * W]; \ + uint8_t temp2[H * W]; \ + \ + var_filter_block2d_bil_first_pass_c(pre, fdata3, pre_stride, 1, H + 1, W, \ + bilinear_filters_2t[xoffset]); \ + var_filter_block2d_bil_second_pass_c(fdata3, temp2, W, W, H, W, \ + bilinear_filters_2t[yoffset]); \ + \ + return aom_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse); \ } OBMC_VAR(4, 4) diff -Nru aom-3.8.2/aom_dsp/x86/aom_asm_stubs.c aom-3.9.0/aom_dsp/x86/aom_asm_stubs.c --- aom-3.8.2/aom_dsp/x86/aom_asm_stubs.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/aom_asm_stubs.c 2024-05-07 19:57:02.548000000 +0000 @@ -15,40 +15,6 @@ #include "aom_dsp/x86/convolve.h" #if HAVE_SSE2 -filter8_1dfunction aom_filter_block1d16_v8_sse2; -filter8_1dfunction aom_filter_block1d16_h8_sse2; -filter8_1dfunction aom_filter_block1d8_v8_sse2; -filter8_1dfunction aom_filter_block1d8_h8_sse2; -filter8_1dfunction aom_filter_block1d4_v8_sse2; -filter8_1dfunction aom_filter_block1d4_h8_sse2; -filter8_1dfunction aom_filter_block1d16_v4_sse2; -filter8_1dfunction aom_filter_block1d16_h4_sse2; - -filter8_1dfunction aom_filter_block1d8_h4_sse2; -filter8_1dfunction aom_filter_block1d8_v4_sse2; -filter8_1dfunction aom_filter_block1d4_h4_sse2; -filter8_1dfunction aom_filter_block1d4_v4_sse2; - -filter8_1dfunction aom_filter_block1d16_v2_sse2; -filter8_1dfunction aom_filter_block1d16_h2_sse2; -filter8_1dfunction aom_filter_block1d8_v2_sse2; -filter8_1dfunction aom_filter_block1d8_h2_sse2; -filter8_1dfunction aom_filter_block1d4_v2_sse2; -filter8_1dfunction aom_filter_block1d4_h2_sse2; - -// void aom_convolve8_horiz_sse2(const 
uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -// void aom_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, -// uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2) -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2) - #if CONFIG_AV1_HIGHBITDEPTH highbd_filter8_1dfunction aom_highbd_filter_block1d16_v8_sse2; highbd_filter8_1dfunction aom_highbd_filter_block1d16_h8_sse2; diff -Nru aom-3.8.2/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c aom-3.9.0/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c --- aom-3.8.2/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/aom_subpixel_8t_intrin_sse2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,569 +0,0 @@ -/* - * Copyright (c) 2018, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include // SSE2 - -#include "config/aom_dsp_rtcd.h" -#include "aom_dsp/x86/convolve.h" -#include "aom_ports/mem.h" - -void aom_filter_block1d16_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2, srcRegFilt32b2_1, - srcRegFilt32b2_2; - __m128i srcReg32b1, srcReg32b2; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - __m128i ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_1 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 5); - __m128i ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - __m128i ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = 
_mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // reading stride of the next 16 bytes - // (part of it was being read by earlier read) - srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); - - ss_2 = _mm_srli_si128(srcReg32b2, 2); - ss_4 = _mm_srli_si128(srcReg32b2, 4); - ss_1_1 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_2_1 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_1, secondFilters); - d2 = _mm_madd_epi16(ss_2_1, thirdFilters); - srcRegFilt32b2_1 = _mm_add_epi32(d1, d2); - - ss_1 = _mm_srli_si128(srcReg32b2, 3); - ss_3 = _mm_srli_si128(srcReg32b2, 5); - ss_1_2 = _mm_unpacklo_epi8(ss_1, _mm_setzero_si128()); - ss_2_2 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_1_2, secondFilters); - d2 = _mm_madd_epi16(ss_2_2, thirdFilters); - srcRegFilt32b2_2 = _mm_add_epi32(d1, d2); - - res_lo = _mm_unpacklo_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - res_hi = _mm_unpackhi_epi32(srcRegFilt32b2_1, srcRegFilt32b2_2); - srcRegFilt32b2_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); - - src_ptr += src_pixels_per_line; - - _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; - __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; - __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - 
__m128i resReg23_hi_1 = _mm_unpacklo_epi8(srcReg23_hi, _mm_setzero_si128()); - __m128i resReg23_hi_2 = _mm_unpackhi_epi8(srcReg23_hi, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_hi_1 = _mm_unpacklo_epi8(srcReg34_hi, _mm_setzero_si128()); - __m128i resReg34_hi_2 = _mm_unpackhi_epi8(srcReg34_hi, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_hi_2, secondFilters); - resReg23_hi = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_hi_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_hi_2, secondFilters); - resReg34_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_hi_1 = _mm_unpacklo_epi8(srcReg45_hi, _mm_setzero_si128()); - __m128i resReg45_hi_2 = _mm_unpackhi_epi8(srcReg45_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_hi_2, thirdFilters); - resReg45_hi = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_hi_1 = _mm_unpacklo_epi8(srcReg56_hi, _mm_setzero_si128()); - __m128i resReg56_hi_2 = _mm_unpackhi_epi8(srcReg56_hi, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_hi_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_hi_2, thirdFilters); - resReg56_hi = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); - resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); - 
resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); - resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); - resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); - - src_ptr += src_stride; - - _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); - _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg23_hi_1 = resReg45_hi_1; - resReg23_hi_2 = resReg45_hi_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - resReg34_hi_1 = resReg56_hi_1; - resReg34_hi_2 = resReg56_hi_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d8_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1, srcRegFilt32b1_2; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - __m128i d1 = _mm_madd_epi16(ss_2, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_4, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - d1 = _mm_madd_epi16(ss_3, secondFilters); - d2 = _mm_madd_epi16(ss_5, thirdFilters); - srcRegFilt32b1_2 = _mm_add_epi32(d1, d2); - - __m128i res_lo = _mm_unpacklo_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - __m128i res_hi = _mm_unpackhi_epi32(srcRegFilt32b1_1, srcRegFilt32b1_2); - srcRegFilt32b1_1 = _mm_packs_epi32(res_lo, res_hi); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - 
_mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23_lo, srcReg34_lo; - __m128i srcReg45_lo, srcReg56_lo; - __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; - __m128i resReg23_45_lo, resReg34_56_lo; - __m128i resReg23_45, resReg34_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - __m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23_lo_1 = _mm_unpacklo_epi8(srcReg23_lo, _mm_setzero_si128()); - __m128i resReg23_lo_2 = _mm_unpackhi_epi8(srcReg23_lo, _mm_setzero_si128()); - - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34_lo_1 = _mm_unpacklo_epi8(srcReg34_lo, _mm_setzero_si128()); - __m128i resReg34_lo_2 = _mm_unpackhi_epi8(srcReg34_lo, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); - - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - - tmp_0 = _mm_madd_epi16(resReg23_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg23_lo_2, secondFilters); - resReg23_lo = _mm_packs_epi32(tmp_0, tmp_1); - - tmp_0 = _mm_madd_epi16(resReg34_lo_1, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34_lo_2, secondFilters); - resReg34_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45_lo_1 = _mm_unpacklo_epi8(srcReg45_lo, _mm_setzero_si128()); - __m128i resReg45_lo_2 = _mm_unpackhi_epi8(srcReg45_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg45_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg45_lo_2, thirdFilters); - resReg45_lo = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg56_lo_1 = _mm_unpacklo_epi8(srcReg56_lo, _mm_setzero_si128()); - __m128i resReg56_lo_2 = _mm_unpackhi_epi8(srcReg56_lo, _mm_setzero_si128()); - tmp_0 = _mm_madd_epi16(resReg56_lo_1, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56_lo_2, thirdFilters); - resReg56_lo = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); - resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); - - // shift by 6 bit each 16 bit - resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, 
addFilterReg32); - resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); - resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6); - resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_45 = _mm_packus_epi16(resReg23_45_lo, _mm_setzero_si128()); - resReg34_56 = _mm_packus_epi16(resReg34_56_lo, _mm_setzero_si128()); - - src_ptr += src_stride; - - _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45)); - _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23_lo_1 = resReg45_lo_1; - resReg23_lo_2 = resReg45_lo_2; - resReg34_lo_1 = resReg56_lo_1; - resReg34_lo_2 = resReg56_lo_2; - srcReg4 = srcReg6; - } -} - -void aom_filter_block1d4_h4_sse2(const uint8_t *src_ptr, - ptrdiff_t src_pixels_per_line, - uint8_t *output_ptr, ptrdiff_t output_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i addFilterReg32; - __m128i secondFilters, thirdFilters; - __m128i srcRegFilt32b1_1; - __m128i srcReg32b1; - unsigned int i; - src_ptr -= 3; - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 4 5 4 5 4 5 4 5 - - for (i = output_height; i > 0; i -= 1) { - srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); - - __m128i ss_2 = _mm_srli_si128(srcReg32b1, 2); - __m128i ss_3 = _mm_srli_si128(srcReg32b1, 3); - __m128i ss_4 = _mm_srli_si128(srcReg32b1, 4); - __m128i ss_5 = _mm_srli_si128(srcReg32b1, 5); - - ss_2 = _mm_unpacklo_epi8(ss_2, _mm_setzero_si128()); - ss_3 = _mm_unpacklo_epi8(ss_3, _mm_setzero_si128()); - ss_4 = _mm_unpacklo_epi8(ss_4, _mm_setzero_si128()); - ss_5 = _mm_unpacklo_epi8(ss_5, _mm_setzero_si128()); - - __m128i ss_1_1 = _mm_unpacklo_epi32(ss_2, ss_3); - __m128i ss_1_2 = _mm_unpacklo_epi32(ss_4, ss_5); - - __m128i d1 = _mm_madd_epi16(ss_1_1, secondFilters); - __m128i d2 = _mm_madd_epi16(ss_1_2, thirdFilters); - srcRegFilt32b1_1 = _mm_add_epi32(d1, d2); - - srcRegFilt32b1_1 = _mm_packs_epi32(srcRegFilt32b1_1, _mm_setzero_si128()); - - // shift by 6 bit each 16 bit - srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); - srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve result - srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); - - src_ptr += src_pixels_per_line; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); - - output_ptr += output_pitch; - } -} - -void aom_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_pitch, - uint8_t *output_ptr, ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i filtersReg; - __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; - __m128i srcReg23, srcReg34, srcReg45, srcReg56; - __m128i resReg23_34, resReg45_56; - __m128i resReg23_34_45_56; - __m128i addFilterReg32, secondFilters, thirdFilters; - 
__m128i tmp_0, tmp_1; - unsigned int i; - ptrdiff_t src_stride, dst_stride; - - addFilterReg32 = _mm_set1_epi16(32); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - filtersReg = _mm_srai_epi16(filtersReg, 1); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp0 = _mm_unpacklo_epi32(filtersReg, filtersReg); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp1 = _mm_unpackhi_epi32(filtersReg, filtersReg); - - secondFilters = _mm_unpackhi_epi64(tmp0, tmp0); // coeffs 2 3 2 3 2 3 2 3 - thirdFilters = _mm_unpacklo_epi64(tmp1, tmp1); // coeffs 4 5 4 5 4 5 4 5 - - // multiply the size of the source and destination stride by two - src_stride = src_pitch << 1; - dst_stride = out_pitch << 1; - - srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3); - __m128i resReg23 = _mm_unpacklo_epi8(srcReg23, _mm_setzero_si128()); - - srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4); - __m128i resReg34 = _mm_unpacklo_epi8(srcReg34, _mm_setzero_si128()); - - for (i = output_height; i > 1; i -= 2) { - srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5); - srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); - srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - tmp_0 = _mm_madd_epi16(resReg23, secondFilters); - tmp_1 = _mm_madd_epi16(resReg34, secondFilters); - resReg23_34 = _mm_packs_epi32(tmp_0, tmp_1); - - __m128i resReg45 = _mm_unpacklo_epi8(srcReg45, _mm_setzero_si128()); - __m128i resReg56 = _mm_unpacklo_epi8(srcReg56, _mm_setzero_si128()); - - tmp_0 = _mm_madd_epi16(resReg45, thirdFilters); - tmp_1 = _mm_madd_epi16(resReg56, thirdFilters); - resReg45_56 = _mm_packs_epi32(tmp_0, tmp_1); - - // add and saturate the results together - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34, resReg45_56); - - // shift by 6 bit each 16 bit - resReg23_34_45_56 = _mm_adds_epi16(resReg23_34_45_56, addFilterReg32); - resReg23_34_45_56 = _mm_srai_epi16(resReg23_34_45_56, 6); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - resReg23_34_45_56 = - _mm_packus_epi16(resReg23_34_45_56, _mm_setzero_si128()); - - src_ptr += src_stride; - - *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReg23_34_45_56); - *((int *)(output_ptr + out_pitch)) = - _mm_cvtsi128_si32(_mm_srli_si128(resReg23_34_45_56, 4)); - - output_ptr += dst_stride; - - // save part of the registers for next strides - resReg23 = resReg45; - resReg34 = resReg56; - srcReg4 = srcReg6; - } -} diff -Nru aom-3.8.2/aom_dsp/x86/aom_subpixel_8t_sse2.asm aom-3.9.0/aom_dsp/x86/aom_subpixel_8t_sse2.asm --- aom-3.8.2/aom_dsp/x86/aom_subpixel_8t_sse2.asm 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/aom_subpixel_8t_sse2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,615 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. 
If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - - -%include "aom_ports/x86_abi_support.asm" - -;Note: tap3 and tap4 have to be applied and added after other taps to avoid -;overflow. - -%macro GET_FILTERS_4 0 - mov rdx, arg(5) ;filter ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - psrldq xmm7, 8 - pshuflw xmm4, xmm7, 0b ;k4 - pshuflw xmm5, xmm7, 01010101b ;k5 - pshuflw xmm6, xmm7, 10101010b ;k6 - pshuflw xmm7, xmm7, 11111111b ;k7 - - punpcklqdq xmm0, xmm1 - punpcklqdq xmm2, xmm3 - punpcklqdq xmm5, xmm4 - punpcklqdq xmm6, xmm7 - - movdqa k0k1, xmm0 - movdqa k2k3, xmm2 - movdqa k5k4, xmm5 - movdqa k6k7, xmm6 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro APPLY_FILTER_4 1 - punpckldq xmm0, xmm1 ;two row in one register - punpckldq xmm6, xmm7 - punpckldq xmm2, xmm3 - punpckldq xmm5, xmm4 - - punpcklbw xmm0, zero ;unpack to word - punpcklbw xmm6, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - - pmullw xmm0, k0k1 ;multiply the filter factors - pmullw xmm6, k6k7 - pmullw xmm2, k2k3 - pmullw xmm5, k5k4 - - paddsw xmm0, xmm6 ;sum - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - paddsw xmm0, xmm2 - psrldq xmm2, 8 - paddsw xmm0, xmm5 - psrldq xmm5, 8 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movd [rdi], xmm0 -%endm - -%macro GET_FILTERS 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - pshuflw xmm0, xmm7, 0b ;k0 - pshuflw xmm1, xmm7, 01010101b ;k1 - pshuflw xmm2, xmm7, 10101010b ;k2 - pshuflw xmm3, xmm7, 11111111b ;k3 - pshufhw xmm4, xmm7, 0b ;k4 - pshufhw xmm5, xmm7, 01010101b ;k5 - pshufhw xmm6, xmm7, 10101010b ;k6 - pshufhw xmm7, xmm7, 11111111b ;k7 - - punpcklwd xmm0, xmm0 - punpcklwd xmm1, xmm1 - punpcklwd xmm2, xmm2 - punpcklwd xmm3, xmm3 - punpckhwd xmm4, xmm4 - punpckhwd xmm5, xmm5 - punpckhwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movdqa k0, xmm0 ;store filter factors on stack - movdqa k1, xmm1 - movdqa k2, xmm2 - movdqa k3, xmm3 - movdqa k4, xmm4 - movdqa k5, xmm5 - movdqa k6, xmm6 - movdqa k7, xmm7 - - movq xmm6, rcx - pshufd xmm6, xmm6, 0 - movdqa krd, xmm6 ;rounding - - pxor xmm7, xmm7 - movdqa zero, xmm7 -%endm - -%macro LOAD_VERT_8 1 - movq xmm0, [rsi + %1] ;0 - movq xmm1, [rsi + rax + %1] ;1 - movq xmm6, [rsi + rdx * 2 + %1] ;6 - lea rsi, [rsi + rax] - movq xmm7, [rsi + rdx * 2 + %1] ;7 - movq xmm2, [rsi + rax + %1] ;2 - movq xmm3, [rsi + rax * 2 + %1] ;3 - movq xmm4, [rsi + rdx + %1] ;4 - movq xmm5, [rsi + rax * 4 + %1] ;5 -%endm - -%macro APPLY_FILTER_8 2 - punpcklbw xmm0, zero - punpcklbw xmm1, zero - punpcklbw xmm6, zero - punpcklbw xmm7, zero - punpcklbw xmm2, zero - punpcklbw xmm5, zero - punpcklbw xmm3, zero - punpcklbw xmm4, zero - - pmullw xmm0, k0 - pmullw xmm1, k1 - pmullw xmm6, k6 - pmullw xmm7, k7 - pmullw xmm2, k2 - pmullw xmm5, k5 - pmullw xmm3, k3 - pmullw xmm4, k4 - - paddsw xmm0, xmm1 - paddsw xmm0, xmm6 - paddsw xmm0, xmm7 - paddsw xmm0, xmm2 - paddsw xmm0, xmm5 - paddsw xmm0, xmm3 - paddsw xmm0, xmm4 - - paddsw xmm0, krd ;rounding - psraw xmm0, 7 ;shift - 
packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi + %2] - pavgb xmm0, xmm1 -%endif - movq [rdi + %2], xmm0 -%endm - -SECTION .text - -;void aom_filter_block1d4_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_v8_sse2) -sym(aom_filter_block1d4_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movd xmm0, [rsi] ;load src: row 0 - movd xmm1, [rsi + rax] ;1 - movd xmm6, [rsi + rdx * 2] ;6 - lea rsi, [rsi + rax] - movd xmm7, [rsi + rdx * 2] ;7 - movd xmm2, [rsi + rax] ;2 - movd xmm3, [rsi + rax * 2] ;3 - movd xmm4, [rsi + rdx] ;4 - movd xmm5, [rsi + rax * 4] ;5 - - APPLY_FILTER_4 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_v8_sse2) -sym(aom_filter_block1d8_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - - lea rdi, [rdi + rbx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_v8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_v8_sse2) -sym(aom_filter_block1d16_v8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rbx, DWORD PTR arg(3) ;out_pitch - lea rdx, [rax + rax * 2] - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - LOAD_VERT_8 0 - APPLY_FILTER_8 0, 0 - sub rsi, rax - 
- LOAD_VERT_8 8 - APPLY_FILTER_8 0, 8 - add rdi, rbx - - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d4_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d4_h8_sse2) -sym(aom_filter_block1d4_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 6 - %define k0k1 [rsp + 16 * 0] - %define k2k3 [rsp + 16 * 1] - %define k5k4 [rsp + 16 * 2] - %define k6k7 [rsp + 16 * 3] - %define krd [rsp + 16 * 4] - %define zero [rsp + 16 * 5] - - GET_FILTERS_4 - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm3, xmm0 - movdqa xmm5, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm3, 3 - psrldq xmm5, 5 - psrldq xmm4, 4 - - APPLY_FILTER_4 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 6 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d8_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d8_h8_sse2) -sym(aom_filter_block1d8_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 [rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void aom_filter_block1d16_h8_sse2 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -globalsym(aom_filter_block1d16_h8_sse2) -sym(aom_filter_block1d16_h8_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16 * 10 - %define k0 [rsp + 16 * 0] - %define k1 [rsp + 16 * 1] - %define k2 [rsp + 16 * 2] - %define k3 [rsp + 16 * 3] - %define k4 [rsp + 16 * 4] - %define k5 [rsp + 16 * 5] - %define k6 
[rsp + 16 * 6] - %define k7 [rsp + 16 * 7] - %define krd [rsp + 16 * 8] - %define zero [rsp + 16 * 9] - - GET_FILTERS - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height - -.loop: - movdqu xmm0, [rsi - 3] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 0 - - movdqu xmm0, [rsi + 5] ;load src - - movdqa xmm1, xmm0 - movdqa xmm6, xmm0 - movdqa xmm7, xmm0 - movdqa xmm2, xmm0 - movdqa xmm5, xmm0 - movdqa xmm3, xmm0 - movdqa xmm4, xmm0 - - psrldq xmm1, 1 - psrldq xmm6, 6 - psrldq xmm7, 7 - psrldq xmm2, 2 - psrldq xmm5, 5 - psrldq xmm3, 3 - psrldq xmm4, 4 - - APPLY_FILTER_8 0, 8 - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx - jnz .loop - - add rsp, 16 * 10 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff -Nru aom-3.8.2/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm aom-3.9.0/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm --- aom-3.8.2/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/aom_subpixel_bilinear_sse2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,295 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
-; - -; - -%include "aom_ports/x86_abi_support.asm" - -%macro GET_PARAM_4 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm3, [rdx] ;load filters - pshuflw xmm4, xmm3, 11111111b ;k3 - psrldq xmm3, 8 - pshuflw xmm3, xmm3, 0b ;k4 - punpcklqdq xmm4, xmm3 ;k3k4 - - movq xmm3, rcx ;rounding - pshufd xmm3, xmm3, 0 - - pxor xmm2, xmm2 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_4 1 - - punpckldq xmm0, xmm1 ;two row in one register - punpcklbw xmm0, xmm2 ;unpack to word - pmullw xmm0, xmm4 ;multiply the filter factors - - movdqa xmm1, xmm0 - psrldq xmm1, 8 - paddsw xmm0, xmm1 - - paddsw xmm0, xmm3 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack to byte - -%if %1 - movd xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - - movd [rdi], xmm0 - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro GET_PARAM 0 - mov rdx, arg(5) ;filter ptr - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;output_ptr - mov rcx, 0x0400040 - - movdqa xmm7, [rdx] ;load filters - - pshuflw xmm6, xmm7, 11111111b ;k3 - pshufhw xmm7, xmm7, 0b ;k4 - punpcklwd xmm6, xmm6 - punpckhwd xmm7, xmm7 - - movq xmm4, rcx ;rounding - pshufd xmm4, xmm4, 0 - - pxor xmm5, xmm5 - - movsxd rax, DWORD PTR arg(1) ;pixels_per_line - movsxd rdx, DWORD PTR arg(3) ;out_pitch - movsxd rcx, DWORD PTR arg(4) ;output_height -%endm - -%macro APPLY_FILTER_8 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - paddsw xmm0, xmm1 - paddsw xmm0, xmm4 ;rounding - psraw xmm0, 7 ;shift - packuswb xmm0, xmm0 ;pack back to byte -%if %1 - movq xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movq [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -%macro APPLY_FILTER_16 1 - punpcklbw xmm0, xmm5 - punpcklbw xmm1, xmm5 - punpckhbw xmm2, xmm5 - punpckhbw xmm3, xmm5 - - pmullw xmm0, xmm6 - pmullw xmm1, xmm7 - pmullw xmm2, xmm6 - pmullw xmm3, xmm7 - - paddsw xmm0, xmm1 - paddsw xmm2, xmm3 - - paddsw xmm0, xmm4 ;rounding - paddsw xmm2, xmm4 - psraw xmm0, 7 ;shift - psraw xmm2, 7 - packuswb xmm0, xmm2 ;pack back to byte -%if %1 - movdqu xmm1, [rdi] - pavgb xmm0, xmm1 -%endif - movdqu [rdi], xmm0 ;store the result - - lea rsi, [rsi + rax] - lea rdi, [rdi + rdx] - dec rcx -%endm - -SECTION .text - -globalsym(aom_filter_block1d4_v2_sse2) -sym(aom_filter_block1d4_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movd xmm0, [rsi] ;load src - movd xmm1, [rsi + rax] - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_v2_sse2) -sym(aom_filter_block1d8_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movq xmm0, [rsi] ;0 - movq xmm1, [rsi + rax] ;1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_v2_sse2) -sym(aom_filter_block1d16_v2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;0 - movdqu xmm1, [rsi + rax] ;1 - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS 
- pop rbp - ret - -globalsym(aom_filter_block1d4_h2_sse2) -sym(aom_filter_block1d4_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - push rsi - push rdi - ; end prolog - - GET_PARAM_4 -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_4 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d8_h2_sse2) -sym(aom_filter_block1d8_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqa xmm1, xmm0 - psrldq xmm1, 1 - - APPLY_FILTER_8 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -globalsym(aom_filter_block1d16_h2_sse2) -sym(aom_filter_block1d16_h2_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - ; end prolog - - GET_PARAM -.loop: - movdqu xmm0, [rsi] ;load src - movdqu xmm1, [rsi + 1] - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - - APPLY_FILTER_16 0 - jnz .loop - - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret diff -Nru aom-3.8.2/aom_dsp/x86/convolve.h aom-3.9.0/aom_dsp/x86/convolve.h --- aom-3.8.2/aom_dsp/x86/convolve.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/convolve.h 2024-05-07 19:57:02.567000000 +0000 @@ -14,6 +14,7 @@ #include #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom/aom_integer.h" #include "aom_ports/mem.h" diff -Nru aom-3.8.2/aom_dsp/x86/highbd_convolve_avx2.c aom-3.9.0/aom_dsp/x86/highbd_convolve_avx2.c --- aom-3.8.2/aom_dsp/x86/highbd_convolve_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/highbd_convolve_avx2.c 2024-05-07 19:57:02.575000000 +0000 @@ -11,7 +11,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve.h" #include "aom_dsp/x86/convolve_avx2.h" diff -Nru aom-3.8.2/aom_dsp/x86/highbd_convolve_ssse3.c aom-3.9.0/aom_dsp/x86/highbd_convolve_ssse3.c --- aom-3.8.2/aom_dsp/x86/highbd_convolve_ssse3.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/highbd_convolve_ssse3.c 2024-05-07 19:57:02.577000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_common_intrin.h" diff -Nru aom-3.8.2/aom_dsp/x86/highbd_quantize_intrin_sse2.c aom-3.9.0/aom_dsp/x86/highbd_quantize_intrin_sse2.c --- aom-3.8.2/aom_dsp/x86/highbd_quantize_intrin_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/highbd_quantize_intrin_sse2.c 2024-05-07 19:57:02.584000000 +0000 @@ -14,6 +14,7 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" +#include "config/aom_dsp_rtcd.h" void aom_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *zbin_ptr, diff -Nru aom-3.8.2/aom_dsp/x86/highbd_variance_avx2.c aom-3.9.0/aom_dsp/x86/highbd_variance_avx2.c --- aom-3.8.2/aom_dsp/x86/highbd_variance_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/highbd_variance_avx2.c 2024-05-07 19:57:02.587000000 +0000 @@ -618,9 +618,9 @@ return (var > 0) ? 
var : 0; } -void aom_highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc8x8var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); for (int i = 0; i < 8; i += 2) { @@ -653,9 +653,9 @@ *sse = _mm_extract_epi32(v_d, 1); } -void aom_highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, - const uint16_t *ref, int ref_stride, - uint32_t *sse, int *sum) { +static void highbd_calc16x16var_avx2(const uint16_t *src, int src_stride, + const uint16_t *ref, int ref_stride, + uint32_t *sse, int *sum) { __m256i v_sum_d = _mm256_setzero_si256(); __m256i v_sse_d = _mm256_setzero_si256(); const __m256i one = _mm256_set1_epi16(1); @@ -703,19 +703,19 @@ *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4); } -#define VAR_FN(w, h, block_size, shift) \ - uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ - const uint8_t *src8, int src_stride, const uint8_t *ref8, \ - int ref_stride, uint32_t *sse) { \ - int sum; \ - int64_t var; \ - uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ - highbd_10_variance_avx2( \ - src, src_stride, ref, ref_stride, w, h, sse, &sum, \ - aom_highbd_calc##block_size##x##block_size##var_avx2, block_size); \ - var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ - return (var >= 0) ? (uint32_t)var : 0; \ +#define VAR_FN(w, h, block_size, shift) \ + uint32_t aom_highbd_10_variance##w##x##h##_avx2( \ + const uint8_t *src8, int src_stride, const uint8_t *ref8, \ + int ref_stride, uint32_t *sse) { \ + int sum; \ + int64_t var; \ + uint16_t *src = CONVERT_TO_SHORTPTR(src8); \ + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \ + highbd_10_variance_avx2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \ + highbd_calc##block_size##x##block_size##var_avx2, \ + block_size); \ + var = (int64_t)(*sse) - (((int64_t)sum * sum) >> shift); \ + return (var >= 0) ? 
(uint32_t)var : 0; \ } VAR_FN(128, 128, 16, 14) @@ -782,8 +782,8 @@ #undef HIGHBD_SUBPIX_VAR -uint64_t aom_mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16, reg2_4x16, reg3_4x16; __m256i src0_8x16, src1_8x16, src_16x16; @@ -840,8 +840,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_avx2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m256i src0_8x16, src1_8x16, src_16x16; __m256i dst0_8x16, dst1_8x16, dst_16x16; @@ -897,8 +897,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff -Nru aom-3.8.2/aom_dsp/x86/highbd_variance_sse2.c aom-3.9.0/aom_dsp/x86/highbd_variance_sse2.c --- aom-3.8.2/aom_dsp/x86/highbd_variance_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/highbd_variance_sse2.c 2024-05-07 19:57:02.589000000 +0000 @@ -637,8 +637,8 @@ } } -uint64_t aom_mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_4xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i reg0_4x16, reg1_4x16; __m128i src_8x16; @@ -682,8 +682,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, - uint16_t *src, int sstride, int h) { +static uint64_t mse_8xh_16bit_highbd_sse2(uint16_t *dst, int dstride, + uint16_t *src, int sstride, int h) { uint64_t sum = 0; __m128i src_8x16; __m128i dst_8x16; @@ -728,8 +728,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_highbd_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff -Nru aom-3.8.2/aom_dsp/x86/intrapred_avx2.c aom-3.9.0/aom_dsp/x86/intrapred_avx2.c --- aom-3.8.2/aom_dsp/x86/intrapred_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/intrapred_avx2.c 2024-05-07 19:57:02.591000000 +0000 @@ -11,7 +11,7 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/intrapred_x86.h" #include "aom_dsp/x86/intrapred_utils.h" #include "aom_dsp/x86/lpf_common_sse2.h" diff -Nru aom-3.8.2/aom_dsp/x86/intrapred_sse4.c aom-3.9.0/aom_dsp/x86/intrapred_sse4.c --- aom-3.8.2/aom_dsp/x86/intrapred_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/intrapred_sse4.c 2024-05-07 19:57:02.597000000 +0000 @@ -12,7 +12,7 @@ #include // SSE2 #include /* SSE4.1 */ -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/intrapred_x86.h" #include "aom_dsp/x86/intrapred_utils.h" #include 
"aom_dsp/x86/lpf_common_sse2.h" diff -Nru aom-3.8.2/aom_dsp/x86/obmc_sad_avx2.c aom-3.9.0/aom_dsp/x86/obmc_sad_avx2.c --- aom-3.8.2/aom_dsp/x86/obmc_sad_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/obmc_sad_avx2.c 2024-05-07 19:57:02.613000000 +0000 @@ -13,6 +13,7 @@ #include #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" diff -Nru aom-3.8.2/aom_dsp/x86/obmc_sad_sse4.c aom-3.9.0/aom_dsp/x86/obmc_sad_sse4.c --- aom-3.8.2/aom_dsp/x86/obmc_sad_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/obmc_sad_sse4.c 2024-05-07 19:57:02.614000000 +0000 @@ -13,6 +13,7 @@ #include #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" diff -Nru aom-3.8.2/aom_dsp/x86/obmc_variance_avx2.c aom-3.9.0/aom_dsp/x86/obmc_variance_avx2.c --- aom-3.8.2/aom_dsp/x86/obmc_variance_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/obmc_variance_avx2.c 2024-05-07 19:57:02.614000000 +0000 @@ -13,6 +13,7 @@ #include #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" diff -Nru aom-3.8.2/aom_dsp/x86/obmc_variance_sse4.c aom-3.9.0/aom_dsp/x86/obmc_variance_sse4.c --- aom-3.8.2/aom_dsp/x86/obmc_variance_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/obmc_variance_sse4.c 2024-05-07 19:57:02.614000000 +0000 @@ -13,6 +13,7 @@ #include #include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" #include "aom_ports/mem.h" #include "aom/aom_integer.h" diff -Nru aom-3.8.2/aom_dsp/x86/subpel_variance_sse2.asm aom-3.9.0/aom_dsp/x86/subpel_variance_sse2.asm --- aom-3.8.2/aom_dsp/x86/subpel_variance_sse2.asm 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/subpel_variance_sse2.asm 1970-01-01 00:00:00.000000000 +0000 @@ -1,1470 +0,0 @@ -; -; Copyright (c) 2016, Alliance for Open Media. All rights reserved -; -; This source code is subject to the terms of the BSD 2 Clause License and -; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License -; was not distributed with this source code in the LICENSE file, you can -; obtain it at www.aomedia.org/license/software. If the Alliance for Open -; Media Patent License 1.0 was not distributed with this source code in the -; PATENTS file, you can obtain it at www.aomedia.org/license/patent. -; - -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_8: times 8 dw 8 -bilin_filter_m_sse2: times 8 dw 16 - times 8 dw 0 - times 8 dw 14 - times 8 dw 2 - times 8 dw 12 - times 8 dw 4 - times 8 dw 10 - times 8 dw 6 - times 16 dw 8 - times 8 dw 6 - times 8 dw 10 - times 8 dw 4 - times 8 dw 12 - times 8 dw 2 - times 8 dw 14 - -bilin_filter_m_ssse3: times 8 db 16, 0 - times 8 db 14, 2 - times 8 db 12, 4 - times 8 db 10, 6 - times 16 db 8 - times 8 db 6, 10 - times 8 db 4, 12 - times 8 db 2, 14 - -SECTION .text - -; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, -; int x_offset, int y_offset, -; const uint8_t *dst, ptrdiff_t dst_stride, -; int height, unsigned int *sse); -; -; This function returns the SE and stores SSE in the given pointer. 
- -%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse - psubw %3, %4 - psubw %1, %2 - paddw %5, %3 - pmaddwd %3, %3 - paddw %5, %1 - pmaddwd %1, %1 - paddd %6, %3 - paddd %6, %1 -%endmacro - -%macro STORE_AND_RET 1 -%if %1 > 4 - ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit - ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. - ; We have to sign-extend it before adding the words within the register - ; and outputing to a dword. - pcmpgtw m5, m6 ; mask for 0 > x - movhlps m3, m7 - punpcklwd m4, m6, m5 - punpckhwd m6, m5 ; sign-extend m6 word->dword - paddd m7, m3 - paddd m6, m4 - pshufd m3, m7, 0x1 - movhlps m4, m6 - paddd m7, m3 - paddd m6, m4 - mov r1, ssem ; r1 = unsigned int *sse - pshufd m4, m6, 0x1 - movd [r1], m7 ; store sse - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%else ; 4xh - pshuflw m4, m6, 0xe - pshuflw m3, m7, 0xe - paddw m6, m4 - paddd m7, m3 - pcmpgtw m5, m6 ; mask for 0 > x - mov r1, ssem ; r1 = unsigned int *sse - punpcklwd m6, m5 ; sign-extend m6 word->dword - movd [r1], m7 ; store sse - pshuflw m4, m6, 0xe - paddd m6, m4 - movd raxd, m6 ; store sum as return value -%endif - RET -%endmacro - -%macro INC_SRC_BY_SRC_STRIDE 0 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 - add srcq, src_stridemp -%else - add srcq, src_strideq -%endif -%endmacro - -%macro SUBPEL_VARIANCE 1-2 0 ; W -%if cpuflag(ssse3) -%define bilin_filter_m bilin_filter_m_ssse3 -%define filter_idx_shift 4 -%else -%define bilin_filter_m bilin_filter_m_sse2 -%define filter_idx_shift 5 -%endif -; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses -; 11, not 13, if the registers are ordered correctly. May make a minor speed -; difference on Win64 - -%if AOM_ARCH_X86_64 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define sec_str sec_strideq - %else - cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %endif - %define block_height heightd - %define bilin_filter sseq -%else - %if CONFIG_PIC=1 - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - sec, sec_stride, height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %define block_height heightd - %endif - - ; reuse argument stack space - %define g_bilin_filterm x_offsetm - %define g_pw_8m y_offsetm - - ;Store bilin_filter and pw_8 location in stack - %if GET_GOT_DEFINED == 1 - GET_GOT eax - add esp, 4 ; restore esp - %endif - - lea ecx, [GLOBAL(bilin_filter_m)] - mov g_bilin_filterm, ecx - - lea ecx, [GLOBAL(pw_8)] - mov g_pw_8m, ecx - - LOAD_IF_USED 0, 1 ; load eax, ecx back - %else - %if %2 == 1 ; avg - cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, \ - dst, dst_stride, sec, sec_stride, \ - height, sse - %define block_height dword heightm - %define sec_str sec_stridemp - %else - cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ - x_offset, y_offset, dst, dst_stride, \ - height, sse - %define block_height heightd - %endif - %define bilin_filter bilin_filter_m - %endif -%endif - -%if %1 == 4 - %define movx movd -%else - %define movx movh -%endif - - ASSERT %1 <= 16 ; m6 overflows if w > 16 - pxor m6, m6 ; sum - pxor m7, m7 ; sse 
- ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we - ; could perhaps use it for something more productive then - pxor m5, m5 ; dedicated zero register -%if %1 < 16 - sar block_height, 1 -%if %2 == 1 ; avg - shl sec_str, 1 -%endif -%endif - - ; FIXME(rbultje) replace by jumptable? - test x_offsetd, x_offsetd - jnz .x_nonzero - ; x_offset == 0 - test y_offsetd, y_offsetd - jnz .x_zero_y_nonzero - - ; x_offset == 0 && y_offset == 0 -.x_zero_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - mova m1, [dstq] -%if %2 == 1 ; avg - pavgb m0, [secq] - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - -%if %2 == 0 ; !avg - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] -%endif - - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - -%if %2 == 1 ; avg -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_zero_loop - STORE_AND_RET %1 - -.x_zero_y_nonzero: - cmp y_offsetd, 4 - jne .x_zero_y_nonhalf - - ; x_offset == 0 && y_offset == 0.5 -.x_zero_y_half_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq*2] -%else ; 4xh - movx m1, [srcq+src_strideq*2] - punpckldq m2, m1 -%endif - movx m1, [dstq] -%if %1 > 4 - movlhps m0, m2 -%else ; 4xh - punpckldq m0, m2 -%endif - movx m3, [dstq+dst_strideq] - pavgb m0, m2 - punpcklbw m1, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m3, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m4, [secq] - pavgb m0, m4 - punpcklbw m3, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq*2] - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_half_loop - STORE_AND_RET %1 - -.x_zero_y_nonhalf: - ; x_offset == 0 && y_offset == bilin interpolation -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ; x86-32 or 
mmx -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0, reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_zero_y_other_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+src_strideq] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can - ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of - ; instructions is the same (5), but it is 1 mul instead of 2, so might be - ; slightly faster because of pmullw latency. It would also cut our rodata - ; tables in half for this function, and save 1-2 registers on x86-64. - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m4, filter_y_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq*2] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m4, filter_y_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_zero_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonzero: - cmp x_offsetd, 4 - jne .x_nonhalf - ; x_offset == 0.5 - test y_offsetd, y_offsetd - jnz .x_half_y_nonzero - - ; x_offset == 0.5 && y_offset == 0 -.x_half_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] - pavgb m0, m4 - punpckhbw m3, m1, m5 -%if %2 == 1 ; avg - pavgb m0, [secq] -%endif - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m4, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m0, [srcq+src_strideq] - movhps m4, [srcq+src_strideq+1] -%else ; 4xh - movx m1, [srcq+src_strideq] - punpckldq m0, m1 - movx m2, [srcq+src_strideq+1] - punpckldq m4, 
m2 -%endif - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - pavgb m0, m4 - punpcklbw m3, m5 -%if %1 > 4 - pavgb m0, [secq] - punpcklbw m1, m5 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else ; 4xh - movh m2, [secq] - pavgb m0, m2 - punpcklbw m1, m5 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m2, [srcq+src_strideq] - movx m1, [dstq] - pavgb m0, m4 - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] - pavgb m2, m4 - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_zero_loop - STORE_AND_RET %1 - -.x_half_y_nonzero: - cmp y_offsetd, 4 - jne .x_half_y_nonhalf - - ; x_offset == 0.5 && y_offset == 0.5 -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - pavgb m4, m3 - punpckhbw m3, m1, m5 - pavgb m0, m4 -%if %2 == 1 ; avg - punpcklbw m1, m5 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_half_loop: - movx m2, [srcq] - movx m3, [srcq+1] -%if %2 == 1 ; avg -%if %1 > 4 - movhps m2, [srcq+src_strideq] - movhps m3, [srcq+src_strideq+1] -%else - movx m1, [srcq+src_strideq] - punpckldq m2, m1 - movx m1, [srcq+src_strideq+1] - punpckldq m3, m1 -%endif - pavgb m2, m3 -%if %1 > 4 - movlhps m0, m2 - movhlps m4, m2 -%else ; 4xh - punpckldq m0, m2 - pshuflw m4, m2, 0xe -%endif - movx m1, [dstq] - pavgb m0, m2 - movx m3, [dstq+dst_strideq] -%if %1 > 4 - pavgb m0, [secq] -%else - movh m2, [secq] - pavgb m0, m2 -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 -%if %1 > 4 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%else ; !avg - movx m4, [srcq+src_strideq] - movx m1, [srcq+src_strideq+1] - pavgb m2, m3 - pavgb m4, m1 - pavgb m0, m2 - pavgb m2, m4 - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - punpcklbw m0, m5 - punpcklbw m2, m5 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_half_loop - STORE_AND_RET %1 - -.x_half_y_nonhalf: - ; x_offset == 0.5 && y_offset == bilin interpolation -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl y_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+y_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_y_a m8 -%define filter_y_b m9 -%define filter_rnd m10 -%else ;x86_32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; x_offset == 0.5. 
We can reuse x_offset reg -%define tempq x_offsetq - add y_offsetq, g_bilin_filterm -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add y_offsetq, bilin_filter -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -.x_half_y_other_loop: - movu m4, [srcq] - movu m2, [srcq+1] - mova m1, [dstq] - pavgb m4, m2 -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - pmullw m2, filter_y_a - pmullw m3, filter_y_b - paddw m2, filter_rnd - punpcklbw m0, m5 - paddw m2, m3 - punpcklbw m3, m4, m5 - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 -%endif - punpckhbw m3, m1, m5 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m3, [srcq+1] - add srcq, src_strideq - pavgb m0, m3 -%if notcpuflag(ssse3) - punpcklbw m0, m5 -%endif -.x_half_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] - pavgb m2, m1 - pavgb m4, m3 - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - movx m1, [dstq] - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_y_a - pmullw m1, m2, filter_y_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_y_a - paddw m0, m1 - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m2, m1 - movx m1, [dstq] -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_half_y_other_loop -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf: - test y_offsetd, y_offsetd - jnz .x_nonhalf_y_nonzero - - ; x_offset == bilin interpolation && y_offset == 0 -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -;y_offset == 0. We can reuse y_offset reg. 
-%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -.x_other_y_zero_loop: -%if %1 == 16 - movu m0, [srcq] - movu m4, [srcq+1] - mova m1, [dstq] -%if cpuflag(ssse3) - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m4, m5 - punpcklbw m0, m5 - punpcklbw m4, m5 - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - pmullw m0, filter_x_a - pmullw m4, filter_x_b - paddw m0, filter_rnd - paddw m2, m3 - paddw m0, m4 -%endif - psraw m2, 4 - psraw m0, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - punpckhbw m3, m1, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] - movx m2, [srcq+src_strideq] - movx m4, [srcq+src_strideq+1] - movx m3, [dstq+dst_strideq] -%if cpuflag(ssse3) - punpcklbw m0, m1 - movx m1, [dstq] - punpcklbw m2, m4 - pmaddubsw m0, filter_x_a - pmaddubsw m2, filter_x_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - punpcklbw m2, m5 - punpcklbw m4, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - punpcklbw m3, m5 - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m4, filter_x_b - paddw m0, m1 - paddw m2, filter_rnd - movx m1, [dstq] - paddw m2, m4 -%endif - psraw m0, 4 - psraw m2, 4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_zero_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonzero: - cmp y_offsetd, 4 - jne .x_nonhalf_y_nonhalf - - ; x_offset == bilin interpolation && y_offset == 0.5 -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_rnd m10 -%else ; x86-32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; y_offset == 0.5. We can reuse y_offset reg. 
-%define tempq y_offsetq - add x_offsetq, g_bilin_filterm -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - add srcq, src_strideq - packuswb m0, m2 -.x_other_y_half_loop: - movu m4, [srcq] - movu m3, [srcq+1] -%if cpuflag(ssse3) - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - pavgb m0, m4 - punpckhbw m3, m1, m5 - punpcklbw m1, m5 -%else - punpckhbw m2, m4, m5 - punpckhbw m1, m3, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - paddw m4, m3 - paddw m2, m1 - mova m1, [dstq] - psraw m4, 4 - psraw m2, 4 - punpckhbw m3, m1, m5 - ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we - ; have a 1-register shortage to be able to store the backup of the bilin - ; filtered second line as words as cache for the next line. Packing into - ; a byte costs 1 pack and 2 unpacks, but saves a register. 
- packuswb m4, m2 - punpcklbw m1, m5 - pavgb m0, m4 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - pavgb m0, [secq] -%endif - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - add srcq, src_strideq - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - add srcq, src_strideq - psraw m0, 4 -.x_other_y_half_loop: - movx m2, [srcq] - movx m1, [srcq+1] - movx m4, [srcq+src_strideq] - movx m3, [srcq+src_strideq+1] -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m1, [dstq] - movx m3, [dstq+dst_strideq] - paddw m2, filter_rnd - paddw m4, filter_rnd -%else - punpcklbw m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - movx m1, [dstq] - paddw m4, m3 - movx m3, [dstq+dst_strideq] -%endif - psraw m2, 4 - psraw m4, 4 - pavgw m0, m2 - pavgw m2, m4 -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - also consider going to bytes here -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - punpcklbw m3, m5 - punpcklbw m1, m5 - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - lea srcq, [srcq+src_strideq*2] - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_half_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_rnd - STORE_AND_RET %1 - -.x_nonhalf_y_nonhalf: -%if AOM_ARCH_X86_64 - lea bilin_filter, [GLOBAL(bilin_filter_m)] -%endif - shl x_offsetd, filter_idx_shift - shl y_offsetd, filter_idx_shift -%if AOM_ARCH_X86_64 && %1 > 4 - mova m8, [bilin_filter+x_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m9, [bilin_filter+x_offsetq+16] -%endif - mova m10, [bilin_filter+y_offsetq] -%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 - mova m11, [bilin_filter+y_offsetq+16] -%endif - mova m12, [GLOBAL(pw_8)] -%define filter_x_a m8 -%define filter_x_b m9 -%define filter_y_a m10 -%define filter_y_b m11 -%define filter_rnd m12 -%else ; x86-32 -%if AOM_ARCH_X86=1 && CONFIG_PIC=1 -; In this case, there is NO unused register. Used src_stride register. Later, -; src_stride has to be loaded from stack when it is needed. 
-%define tempq src_strideq - mov tempq, g_bilin_filterm - add x_offsetq, tempq - add y_offsetq, tempq -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] - - mov tempq, g_pw_8m -%define filter_rnd [tempq] -%else - add x_offsetq, bilin_filter - add y_offsetq, bilin_filter -%define filter_x_a [x_offsetq] -%define filter_x_b [x_offsetq+16] -%define filter_y_a [y_offsetq] -%define filter_y_b [y_offsetq+16] -%define filter_rnd [GLOBAL(pw_8)] -%endif -%endif - - ; x_offset == bilin interpolation && y_offset == bilin interpolation -%if %1 == 16 - movu m0, [srcq] - movu m1, [srcq+1] -%if cpuflag(ssse3) - punpckhbw m2, m0, m1 - punpcklbw m0, m1 - pmaddubsw m2, filter_x_a - pmaddubsw m0, filter_x_a - paddw m2, filter_rnd - paddw m0, filter_rnd -%else - punpckhbw m2, m0, m5 - punpckhbw m3, m1, m5 - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - pmullw m2, filter_x_a - pmullw m3, filter_x_b - paddw m2, filter_rnd - paddw m0, m1 - paddw m2, m3 -%endif - psraw m0, 4 - psraw m2, 4 - - INC_SRC_BY_SRC_STRIDE - - packuswb m0, m2 -.x_other_y_other_loop: -%if cpuflag(ssse3) - movu m4, [srcq] - movu m3, [srcq+1] - mova m1, [dstq] - punpckhbw m2, m4, m3 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - punpckhbw m3, m1, m5 - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m4, m2 - punpckhbw m2, m0, m4 - punpcklbw m0, m4 - pmaddubsw m2, filter_y_a - pmaddubsw m0, filter_y_a - punpcklbw m1, m5 - paddw m2, filter_rnd - paddw m0, filter_rnd - psraw m2, 4 - psraw m0, 4 -%else - movu m3, [srcq] - movu m4, [srcq+1] - punpckhbw m1, m3, m5 - punpckhbw m2, m4, m5 - punpcklbw m3, m5 - punpcklbw m4, m5 - pmullw m3, filter_x_a - pmullw m4, filter_x_b - paddw m3, filter_rnd - pmullw m1, filter_x_a - pmullw m2, filter_x_b - paddw m1, filter_rnd - paddw m3, m4 - paddw m1, m2 - psraw m3, 4 - psraw m1, 4 - packuswb m4, m3, m1 - punpckhbw m2, m0, m5 - punpcklbw m0, m5 - pmullw m2, filter_y_a - pmullw m1, filter_y_b - paddw m2, filter_rnd - pmullw m0, filter_y_a - pmullw m3, filter_y_b - paddw m2, m1 - mova m1, [dstq] - paddw m0, filter_rnd - psraw m2, 4 - paddw m0, m3 - punpckhbw m3, m1, m5 - psraw m0, 4 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline - packuswb m0, m2 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - add dstq, dst_strideq -%else ; %1 < 16 - movx m0, [srcq] - movx m1, [srcq+1] -%if cpuflag(ssse3) - punpcklbw m0, m1 - pmaddubsw m0, filter_x_a - paddw m0, filter_rnd -%else - punpcklbw m0, m5 - punpcklbw m1, m5 - pmullw m0, filter_x_a - pmullw m1, filter_x_b - paddw m0, filter_rnd - paddw m0, m1 -%endif - psraw m0, 4 -%if cpuflag(ssse3) - packuswb m0, m0 -%endif - - INC_SRC_BY_SRC_STRIDE - -.x_other_y_other_loop: - movx m2, [srcq] - movx m1, [srcq+1] - - INC_SRC_BY_SRC_STRIDE - movx m4, [srcq] - movx m3, [srcq+1] - -%if cpuflag(ssse3) - punpcklbw m2, m1 - punpcklbw m4, m3 - pmaddubsw m2, filter_x_a - pmaddubsw m4, filter_x_a - movx m3, [dstq+dst_strideq] - movx m1, [dstq] - paddw m2, filter_rnd - paddw m4, filter_rnd - psraw m2, 4 - psraw m4, 4 - packuswb m2, m2 - packuswb m4, m4 - punpcklbw m0, m2 - punpcklbw m2, m4 - pmaddubsw m0, filter_y_a - pmaddubsw m2, filter_y_a - punpcklbw m3, m5 - paddw m0, filter_rnd - paddw m2, filter_rnd - psraw m0, 4 - psraw m2, 4 - punpcklbw m1, m5 -%else - punpcklbw 
m2, m5 - punpcklbw m1, m5 - punpcklbw m4, m5 - punpcklbw m3, m5 - pmullw m2, filter_x_a - pmullw m1, filter_x_b - paddw m2, filter_rnd - pmullw m4, filter_x_a - pmullw m3, filter_x_b - paddw m4, filter_rnd - paddw m2, m1 - paddw m4, m3 - psraw m2, 4 - psraw m4, 4 - pmullw m0, filter_y_a - pmullw m3, m2, filter_y_b - paddw m0, filter_rnd - pmullw m2, filter_y_a - pmullw m1, m4, filter_y_b - paddw m2, filter_rnd - paddw m0, m3 - movx m3, [dstq+dst_strideq] - paddw m2, m1 - movx m1, [dstq] - psraw m0, 4 - psraw m2, 4 - punpcklbw m3, m5 - punpcklbw m1, m5 -%endif -%if %2 == 1 ; avg - ; FIXME(rbultje) pipeline -%if %1 == 4 - movlhps m0, m2 -%endif - packuswb m0, m2 -%if %1 > 4 - pavgb m0, [secq] - punpckhbw m2, m0, m5 - punpcklbw m0, m5 -%else - movh m2, [secq] - pavgb m0, m2 - punpcklbw m0, m5 - movhlps m2, m0 -%endif -%endif - SUM_SSE m0, m1, m2, m3, m6, m7 - mova m0, m4 - - INC_SRC_BY_SRC_STRIDE - lea dstq, [dstq+dst_strideq*2] -%endif -%if %2 == 1 ; avg - add secq, sec_str -%endif - dec block_height - jg .x_other_y_other_loop -%undef filter_x_a -%undef filter_x_b -%undef filter_y_a -%undef filter_y_b -%undef filter_rnd -%undef movx - STORE_AND_RET %1 -%endmacro - -; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical -; between the ssse3 and non-ssse3 version. It may make sense to merge their -; code in the sense that the ssse3 version would jump to the appropriate -; location in the sse/2 version, rather than duplicating that code in the -; binary. - -INIT_XMM sse2 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4 -SUBPEL_VARIANCE 8 -SUBPEL_VARIANCE 16 - -INIT_XMM sse2 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 - -INIT_XMM ssse3 -SUBPEL_VARIANCE 4, 1 -SUBPEL_VARIANCE 8, 1 -SUBPEL_VARIANCE 16, 1 diff -Nru aom-3.8.2/aom_dsp/x86/subpel_variance_ssse3.asm aom-3.9.0/aom_dsp/x86/subpel_variance_ssse3.asm --- aom-3.8.2/aom_dsp/x86/subpel_variance_ssse3.asm 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/subpel_variance_ssse3.asm 2024-05-07 19:57:02.620000000 +0000 @@ -0,0 +1,1442 @@ +; +; Copyright (c) 2016, Alliance for Open Media. All rights reserved +; +; This source code is subject to the terms of the BSD 2 Clause License and +; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License +; was not distributed with this source code in the LICENSE file, you can +; obtain it at www.aomedia.org/license/software. If the Alliance for Open +; Media Patent License 1.0 was not distributed with this source code in the +; PATENTS file, you can obtain it at www.aomedia.org/license/patent. +; + +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 14, 2 + times 8 db 12, 4 + times 8 db 10, 6 + times 16 db 8 + times 8 db 6, 10 + times 8 db 4, 12 + times 8 db 2, 14 + +SECTION .text + +; int aom_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 1 +%if %1 > 4 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. 
it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%else ; 4xh + pshuflw m4, m6, 0xe + pshuflw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshuflw m4, m6, 0xe + paddd m6, m4 + movd raxd, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro INC_SRC_BY_SRC_STRIDE 0 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 + add srcq, src_stridemp +%else + add srcq, src_strideq +%endif +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 + +%if AOM_ARCH_X86_64 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define sec_str sec_strideq + %else + cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %endif + %define block_height heightd + %define bilin_filter sseq +%else + %if CONFIG_PIC=1 + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + sec, sec_stride, height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + + ; reuse argument stack space + %define g_bilin_filterm x_offsetm + %define g_pw_8m y_offsetm + + ;Store bilin_filter and pw_8 location in stack + %if GET_GOT_DEFINED == 1 + GET_GOT eax + add esp, 4 ; restore esp + %endif + + lea ecx, [GLOBAL(bilin_filter_m)] + mov g_bilin_filterm, ecx + + lea ecx, [GLOBAL(pw_8)] + mov g_pw_8m, ecx + + LOAD_IF_USED 0, 1 ; load eax, ecx back + %else + %if %2 == 1 ; avg + cglobal sub_pixel_avg_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, sec, sec_stride, \ + height, sse + %define block_height dword heightm + %define sec_str sec_stridemp + %else + cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, \ + x_offset, y_offset, dst, dst_stride, \ + height, sse + %define block_height heightd + %endif + %define bilin_filter bilin_filter_m + %endif +%endif + +%if %1 == 4 + %define movx movd +%else + %define movx movh +%endif + + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar block_height, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? 
+ test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 +%endif +%else ; !avg + movx m2, [srcq+src_strideq] +%endif + + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + +%if %2 == 1 ; avg +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_zero_loop + STORE_AND_RET %1 + +.x_zero_y_nonzero: + cmp y_offsetd, 4 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq*2] +%else ; 4xh + movx m1, [srcq+src_strideq*2] + punpckldq m2, m1 +%endif + movx m1, [dstq] +%if %1 > 4 + movlhps m0, m2 +%else ; 4xh + punpckldq m0, m2 +%endif + movx m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m4, [secq] + pavgb m0, m4 + punpcklbw m3, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq*2] + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_half_loop + STORE_AND_RET %1 + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0, reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a 
[y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq*2] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonzero: + cmp x_offsetd, 4 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m4, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; 4xh + movx m1, [srcq+src_strideq] + punpckldq m0, m1 + movx m2, [srcq+src_strideq+1] + punpckldq m4, m2 +%endif + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 +%if %1 > 4 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; 4xh + movh m2, [secq] + pavgb m0, m2 + punpcklbw m1, m5 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg 
+ movx m2, [srcq+src_strideq] + movx m1, [dstq] + pavgb m0, m4 + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_zero_loop + STORE_AND_RET %1 + +.x_half_y_nonzero: + cmp y_offsetd, 4 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movx m2, [srcq] + movx m3, [srcq+1] +%if %2 == 1 ; avg +%if %1 > 4 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + movx m1, [srcq+src_strideq] + punpckldq m2, m1 + movx m1, [srcq+src_strideq+1] + punpckldq m3, m1 +%endif + pavgb m2, m3 +%if %1 > 4 + movlhps m0, m2 + movhlps m4, m2 +%else ; 4xh + punpckldq m0, m2 + pshuflw m4, m2, 0xe +%endif + movx m1, [dstq] + pavgb m0, m2 + movx m3, [dstq+dst_strideq] +%if %1 > 4 + pavgb m0, [secq] +%else + movh m2, [secq] + pavgb m0, m2 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 +%if %1 > 4 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%else ; !avg + movx m4, [srcq+src_strideq] + movx m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_half_loop + STORE_AND_RET %1 + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ;x86_32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; x_offset == 0.5. 
We can reuse x_offset reg +%define tempq x_offsetq + add y_offsetq, g_bilin_filterm +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movx m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movx m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +;y_offset == 0. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] + movx m2, [srcq+src_strideq] + movx m4, [srcq+src_strideq+1] + movx m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movx m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movx m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 4 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; y_offset == 0.5. We can reuse y_offset reg. 
+%define tempq y_offsetq + add x_offsetq, g_bilin_filterm +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. 
+ packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movx m2, [srcq] + movx m1, [srcq+1] + movx m4, [srcq+src_strideq] + movx m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m1, [dstq] + movx m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movx m1, [dstq] + paddw m4, m3 + movx m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET %1 + +.x_nonhalf_y_nonhalf: +%if AOM_ARCH_X86_64 + lea bilin_filter, [GLOBAL(bilin_filter_m)] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if AOM_ARCH_X86_64 && %1 > 4 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [GLOBAL(pw_8)] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else ; x86-32 +%if AOM_ARCH_X86=1 && CONFIG_PIC=1 +; In this case, there is NO unused register. Used src_stride register. Later, +; src_stride has to be loaded from stack when it is needed. 
+%define tempq src_strideq + mov tempq, g_bilin_filterm + add x_offsetq, tempq + add y_offsetq, tempq +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] + + mov tempq, g_pw_8m +%define filter_rnd [tempq] +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [GLOBAL(pw_8)] +%endif +%endif + + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + + INC_SRC_BY_SRC_STRIDE + + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + add dstq, dst_strideq +%else ; %1 < 16 + movx m0, [srcq] + movx m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + + INC_SRC_BY_SRC_STRIDE + +.x_other_y_other_loop: + movx m2, [srcq] + movx m1, [srcq+1] + + INC_SRC_BY_SRC_STRIDE + movx m4, [srcq] + movx m3, [srcq+1] + +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movx m3, [dstq+dst_strideq] + movx m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw 
m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movx m3, [dstq+dst_strideq] + paddw m2, m1 + movx m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline +%if %1 == 4 + movlhps m0, m2 +%endif + packuswb m0, m2 +%if %1 > 4 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + movh m2, [secq] + pavgb m0, m2 + punpcklbw m0, m5 + movhlps m2, m0 +%endif +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + INC_SRC_BY_SRC_STRIDE + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec block_height + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd +%undef movx + STORE_AND_RET %1 +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_XMM ssse3 +SUBPEL_VARIANCE 4, 1 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff -Nru aom-3.8.2/aom_dsp/x86/synonyms.h aom-3.9.0/aom_dsp/x86/synonyms.h --- aom-3.8.2/aom_dsp/x86/synonyms.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/synonyms.h 2024-05-07 19:57:02.622000000 +0000 @@ -46,6 +46,13 @@ return _mm_loadu_si128((const __m128i *)a); } +// Load 64 bits from each of hi and low, and pack into an SSE register +// Since directly loading as `int64_t`s and using _mm_set_epi64 may violate +// the strict aliasing rule, this takes a different approach +static INLINE __m128i xx_loadu_2x64(const void *hi, const void *lo) { + return _mm_unpacklo_epi64(_mm_loadu_si64(lo), _mm_loadu_si64(hi)); +} + static INLINE void xx_storel_32(void *const a, const __m128i v) { const int val = _mm_cvtsi128_si32(v); memcpy(a, &val, sizeof(val)); diff -Nru aom-3.8.2/aom_dsp/x86/synonyms_avx2.h aom-3.9.0/aom_dsp/x86/synonyms_avx2.h --- aom-3.8.2/aom_dsp/x86/synonyms_avx2.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/synonyms_avx2.h 2024-05-07 19:57:02.623000000 +0000 @@ -43,6 +43,16 @@ _mm256_storeu_si256((__m256i *)a, v); } +// Fill an AVX register using an interleaved pair of values, ie. set the +// 16 channels to {a, b} repeated 8 times, using the same channel ordering +// as when a register is stored to / loaded from memory. +// +// This is useful for rearranging filter kernels for use with the _mm_madd_epi16 +// instruction +static INLINE __m256i yy_set2_epi16(int16_t a, int16_t b) { + return _mm256_setr_epi16(a, b, a, b, a, b, a, b, a, b, a, b, a, b, a, b); +} + // The _mm256_set1_epi64x() intrinsic is undefined for some Visual Studio // compilers. The following function is equivalent to _mm256_set1_epi64x() // acting on a 32-bit integer. 
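The yy_set2_epi16 helper added to synonyms_avx2.h above is motivated by how the madd instructions consume their operands: each 32-bit result lane is x[2i]*k[2i] + x[2i+1]*k[2i+1], so a kernel register filled with repeated {a, b} pairs evaluates one 2-tap bilinear output per lane. The sketch below is not libaom code; it is a minimal SSE2 illustration of that pairing, written under the assumption that the rounding constant 8 and the shift by 4 follow the pw_8 / "psraw 4" convention of the assembly above, and with an illustrative function name.

/*
 * Sketch only (not part of libaom). Shows why an interleaved {a, b} kernel,
 * the 128-bit analogue of what yy_set2_epi16 builds for AVX2, pairs with
 * _mm_madd_epi16 to compute a 2-tap bilinear filter. The name and the
 * round/shift constants are assumptions made for this example.
 */
#include <emmintrin.h>
#include <stdint.h>

/* out[i] = (a * src[i] + b * src[i + 1] + 8) >> 4 for 8 pixels.
 * src must hold at least 9 int16_t values; out receives 8. */
static void bilin_2tap_row_sse2(const int16_t *src, int16_t *out, int16_t a,
                                int16_t b) {
  const __m128i k = _mm_setr_epi16(a, b, a, b, a, b, a, b); /* {a,b} pairs */
  const __m128i rnd = _mm_set1_epi32(8);
  const __m128i s0 = _mm_loadu_si128((const __m128i *)(src + 0));
  const __m128i s1 = _mm_loadu_si128((const __m128i *)(src + 1));
  /* Interleave so each 32-bit lane holds the pair {src[i], src[i+1]}. */
  const __m128i lo = _mm_unpacklo_epi16(s0, s1);
  const __m128i hi = _mm_unpackhi_epi16(s0, s1);
  /* madd multiplies the 16-bit pairs and sums them: src[i]*a + src[i+1]*b. */
  const __m128i flo =
      _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(lo, k), rnd), 4);
  const __m128i fhi =
      _mm_srai_epi32(_mm_add_epi32(_mm_madd_epi16(hi, k), rnd), 4);
  _mm_storeu_si128((__m128i *)out, _mm_packs_epi32(flo, fhi));
}

This is the same pairing trick the SSSE3 assembly above exploits with pmaddubsw on interleaved bytes and the {16-x, x} entries of bilin_filter_m_ssse3: one multiply-add per output instead of two separate multiplies.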
diff -Nru aom-3.8.2/aom_dsp/x86/variance_avx2.c aom-3.9.0/aom_dsp/x86/variance_avx2.c --- aom-3.8.2/aom_dsp/x86/variance_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/variance_avx2.c 2024-05-07 19:57:02.624000000 +0000 @@ -518,8 +518,8 @@ } } -uint64_t aom_mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_4x8, dst1_4x8, dst2_4x8, dst3_4x8, dst_16x8; __m128i src0_4x16, src1_4x16, src2_4x16, src3_4x16; @@ -575,8 +575,9 @@ // In src buffer, each 4x4 block in a 32x32 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. -uint64_t aom_mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_4xh_quad_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8, dst2_16x8, dst3_16x8; __m256i dst0_16x16, dst1_16x16, dst2_16x16, dst3_16x16; @@ -665,8 +666,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst3_16x8; __m256i src0_8x16, src1_8x16, src_16x16, dst_16x16; @@ -715,8 +716,9 @@ // In src buffer, each 8x8 block in a 64x64 filter block is stored sequentially. // Hence src_blk_stride is same as block width. Whereas dst buffer is a frame // buffer, thus dstride is a frame level stride. -uint64_t aom_mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, uint16_t *src, - int src_blk_stride, int h) { +static uint64_t mse_8xh_dual_16bit_avx2(uint8_t *dst, int dstride, + uint16_t *src, int src_blk_stride, + int h) { uint64_t sum = 0; __m128i dst0_16x8, dst1_16x8; __m256i dst0_16x16, dst1_16x16; @@ -780,8 +782,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_avx2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_avx2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } @@ -795,8 +797,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must be satisfied"); switch (w) { - case 4: return aom_mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); - case 8: return aom_mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); + case 4: return mse_4xh_quad_16bit_avx2(dst, dstride, src, w * h, h); + case 8: return mse_8xh_dual_16bit_avx2(dst, dstride, src, w * h, h); default: assert(0 && "unsupported width"); return -1; } } diff -Nru aom-3.8.2/aom_dsp/x86/variance_impl_avx2.c aom-3.9.0/aom_dsp/x86/variance_impl_avx2.c --- aom-3.8.2/aom_dsp/x86/variance_impl_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/variance_impl_avx2.c 2024-05-07 19:57:02.626000000 +0000 @@ -648,7 +648,7 @@ #endif #define MAKE_SUB_PIXEL_AVG_VAR_32XH(height, log2height) \ - int aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + static int sub_pixel_avg_variance32x##height##_imp_avx2( \ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, 
const uint8_t *sec, int sec_stride, \ unsigned int *sse) { \ @@ -876,7 +876,7 @@ const uint8_t *src, int src_stride, int x_offset, int y_offset, \ const uint8_t *dst, int dst_stride, unsigned int *sse, \ const uint8_t *sec_ptr) { \ - const int sum = aom_sub_pixel_avg_variance32x##height##_imp_avx2( \ + const int sum = sub_pixel_avg_variance32x##height##_imp_avx2( \ src, src_stride, x_offset, y_offset, dst, dst_stride, sec_ptr, 32, \ sse); \ return *sse - (unsigned int)(((int64_t)sum * sum) >> (5 + log2height)); \ @@ -899,7 +899,7 @@ const uint8_t *sec_ptr = sec; \ for (int j = 0; j < (h / hf); ++j) { \ unsigned int sse2; \ - const int se2 = aom_sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ + const int se2 = sub_pixel_avg_variance##wf##x##hf##_imp_avx2( \ src_ptr, src_stride, x_offset, y_offset, dst_ptr, dst_stride, \ sec_ptr, w, &sse2); \ dst_ptr += hf * dst_stride; \ diff -Nru aom-3.8.2/aom_dsp/x86/variance_sse2.c aom-3.9.0/aom_dsp/x86/variance_sse2.c --- aom-3.8.2/aom_dsp/x86/variance_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_dsp/x86/variance_sse2.c 2024-05-07 19:57:02.628000000 +0000 @@ -415,7 +415,6 @@ DECL(8, opt); \ DECL(16, opt) -DECLS(sse2); DECLS(ssse3); #undef DECLS #undef DECL @@ -492,7 +491,6 @@ FN(4, 4, 4, 2, 2, opt, (int32_t), (int32_t)) #endif -FNS(sse2) FNS(ssse3) #undef FNS @@ -510,7 +508,6 @@ DECL(8, opt); \ DECL(16, opt) -DECLS(sse2); DECLS(ssse3); #undef DECL #undef DECLS @@ -591,7 +588,6 @@ FN(4, 4, 4, 2, 2, opt, (uint32_t), (int32_t)) #endif -FNS(sse2) FNS(ssse3) #undef FNS @@ -710,8 +706,8 @@ } } -uint64_t aom_mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_4xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst0_8x8, dst1_8x8, dst_16x8; __m128i src0_16x4, src1_16x4, src_16x8; @@ -744,8 +740,8 @@ return sum; } -uint64_t aom_mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, - int sstride, int h) { +static uint64_t mse_8xh_16bit_sse2(uint8_t *dst, int dstride, uint16_t *src, + int sstride, int h) { uint64_t sum = 0; __m128i dst_8x8, dst_16x8; __m128i src_16x8; @@ -781,8 +777,8 @@ assert((w == 8 || w == 4) && (h == 8 || h == 4) && "w=8/4 and h=8/4 must satisfy"); switch (w) { - case 4: return aom_mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); - case 8: return aom_mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); + case 4: return mse_4xh_16bit_sse2(dst, dstride, src, sstride, h); + case 8: return mse_8xh_16bit_sse2(dst, dstride, src, sstride, h); default: assert(0 && "unsupported width"); return -1; } } diff -Nru aom-3.8.2/aom_ports/aarch64_cpudetect.c aom-3.9.0/aom_ports/aarch64_cpudetect.c --- aom-3.8.2/aom_ports/aarch64_cpudetect.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_ports/aarch64_cpudetect.c 2024-05-07 19:57:02.630000000 +0000 @@ -9,8 +9,12 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
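The hunks above make the width-specific MSE kernels file-local (static) and leave only the dispatching aom_mse_wxh_16bit entry points exported, so the SIMD helpers can no longer be referenced from other translation units. For reference, a scalar sketch of the quantity those kernels compute, the sum of squared differences between an 8-bit destination block and a 16-bit source block; the function name below is illustrative, not a libaom symbol:

#include <stdint.h>

static uint64_t mse_wxh_16bit_ref(const uint8_t *dst, int dstride,
                                  const uint16_t *src, int sstride, int w,
                                  int h) {
  uint64_t sum = 0;
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      // The difference can exceed 16 bits in magnitude, so square in 64 bits.
      const int64_t e = (int64_t)dst[i * dstride + j] - src[i * sstride + j];
      sum += (uint64_t)(e * e);
    }
  }
  return sum;
}

The exported wrappers keep their previous contract: they assert that w and h are 4 or 8 and switch to the matching helper.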
*/ +#include "config/aom_config.h" + #include "arm_cpudetect.h" +#include "aom_ports/arm.h" + #if defined(__APPLE__) #include #endif @@ -104,6 +108,7 @@ #define AOM_AARCH64_HWCAP_CRC32 (1 << 7) #define AOM_AARCH64_HWCAP_ASIMDDP (1 << 20) #define AOM_AARCH64_HWCAP_SVE (1 << 22) +#define AOM_AARCH64_HWCAP2_SVE2 (1 << 1) #define AOM_AARCH64_HWCAP2_I8MM (1 << 13) static int arm_get_cpu_caps(void) { @@ -111,7 +116,7 @@ #if HAVE_ARM_CRC32 || HAVE_NEON_DOTPROD || HAVE_SVE unsigned long hwcap = getauxval(AT_HWCAP); #endif -#if HAVE_NEON_I8MM +#if HAVE_NEON_I8MM || HAVE_SVE2 unsigned long hwcap2 = getauxval(AT_HWCAP2); #endif @@ -130,6 +135,9 @@ #if HAVE_SVE if (hwcap & AOM_AARCH64_HWCAP_SVE) flags |= HAS_SVE; #endif // HAVE_SVE +#if HAVE_SVE2 + if (hwcap2 & AOM_AARCH64_HWCAP2_SVE2) flags |= HAS_SVE2; +#endif // HAVE_SVE2 return flags; } @@ -189,5 +197,8 @@ if (!(flags & HAS_NEON_DOTPROD)) flags &= ~HAS_SVE; if (!(flags & HAS_NEON_I8MM)) flags &= ~HAS_SVE; + // Restrict flags: SVE2 assumes that FEAT_SVE is available. + if (!(flags & HAS_SVE)) flags &= ~HAS_SVE2; + return flags; } diff -Nru aom-3.8.2/aom_ports/arm.h aom-3.9.0/aom_ports/arm.h --- aom-3.8.2/aom_ports/arm.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_ports/arm.h 2024-05-07 19:57:02.631000000 +0000 @@ -29,6 +29,8 @@ #define HAS_NEON_I8MM (1 << 3) // Armv8.2-A optional SVE instructions, mandatory from Armv9.0-A. #define HAS_SVE (1 << 4) +// Armv9.0-A SVE2 instructions. +#define HAS_SVE2 (1 << 5) int aom_arm_cpu_caps(void); diff -Nru aom-3.8.2/aom_ports/bitops.h aom-3.9.0/aom_ports/bitops.h --- aom-3.8.2/aom_ports/bitops.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_ports/bitops.h 2024-05-07 19:57:02.631000000 +0000 @@ -13,12 +13,13 @@ #define AOM_AOM_PORTS_BITOPS_H_ #include +#include #include "aom_ports/msvc.h" #include "config/aom_config.h" #ifdef _MSC_VER -#if defined(_M_X64) || defined(_M_IX86) +#if defined(_M_X64) || defined(_M_IX86) || defined(_M_ARM64) || defined(_M_ARM) #include #define USE_MSC_INTRINSICS #endif @@ -52,7 +53,6 @@ _BitScanReverse(&first_set_bit, n); return first_set_bit; } -#undef USE_MSC_INTRINSICS #else static INLINE int get_msb(unsigned int n) { int log = 0; @@ -71,6 +71,50 @@ } #endif +#if defined(__GNUC__) && \ + ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4) +static INLINE int aom_clzll(uint64_t n) { return __builtin_clzll(n); } +#elif defined(USE_MSC_INTRINSICS) +#if defined(_M_X64) || defined(_M_ARM64) +#pragma intrinsic(_BitScanReverse64) +#endif + +static INLINE int aom_clzll(uint64_t n) { + assert(n != 0); + unsigned long first_set_bit; // NOLINT(runtime/int) +#if defined(_M_X64) || defined(_M_ARM64) + const unsigned char bit_set = + _BitScanReverse64(&first_set_bit, (unsigned __int64)n); +#else // !(defined(_M_X64) || defined(_M_ARM64)) + const unsigned long n_hi = (unsigned long)(n >> 32); // NOLINT(runtime/int) + if (n_hi != 0) { + const unsigned char bit_set = _BitScanReverse(&first_set_bit, n_hi); + assert(bit_set != 0); + (void)bit_set; + return 31 ^ (int)first_set_bit; + } + const unsigned char bit_set = + _BitScanReverse(&first_set_bit, (unsigned long)n); // NOLINT(runtime/int) +#endif + assert(bit_set != 0); + (void)bit_set; + return 63 ^ (int)first_set_bit; +} +#undef USE_MSC_INTRINSICS +#else +static INLINE int aom_clzll(uint64_t n) { + assert(n != 0); + + int res = 0; + uint64_t high_bit = 1ULL << 63; + while (!(n & high_bit)) { + res++; + n <<= 1; + } + return res; +} +#endif + #ifdef __cplusplus } // extern "C" #endif diff -Nru aom-3.8.2/aom_ports/mem.h 
aom-3.9.0/aom_ports/mem.h --- aom-3.8.2/aom_ports/mem.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_ports/mem.h 2024-05-07 19:57:02.631000000 +0000 @@ -24,7 +24,13 @@ #define DECLARE_ALIGNED(n, typ, val) typ val #endif -#if HAVE_NEON && defined(_MSC_VER) +#if defined(__has_builtin) +#define AOM_HAS_BUILTIN(x) __has_builtin(x) +#else +#define AOM_HAS_BUILTIN(x) 0 +#endif + +#if !AOM_HAS_BUILTIN(__builtin_prefetch) && !defined(__GNUC__) #define __builtin_prefetch(x) #endif diff -Nru aom-3.8.2/aom_scale/aom_scale_rtcd.c aom-3.9.0/aom_scale/aom_scale_rtcd.c --- aom-3.8.2/aom_scale/aom_scale_rtcd.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_scale/aom_scale_rtcd.c 2024-05-07 19:57:02.635000000 +0000 @@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void aom_scale_rtcd() { aom_once(setup_rtcd_internal); } +void aom_scale_rtcd(void) { aom_once(setup_rtcd_internal); } diff -Nru aom-3.8.2/aom_scale/aom_scale_rtcd.pl aom-3.9.0/aom_scale/aom_scale_rtcd.pl --- aom-3.8.2/aom_scale/aom_scale_rtcd.pl 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_scale/aom_scale_rtcd.pl 2024-05-07 19:57:02.635000000 +0000 @@ -10,6 +10,8 @@ ## sub aom_scale_forward_decls() { print < + struct yv12_buffer_config; EOF } @@ -26,17 +28,17 @@ add_proto qw/void aom_vertical_band_2_1_scale_i/, "unsigned char *source, int src_pitch, unsigned char *dest, int dest_pitch, unsigned int dest_width"; } -add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, int num_pyramid_levels, int num_planes"; +add_proto qw/int aom_yv12_realloc_with_new_border/, "struct yv12_buffer_config *ybf, int new_border, int byte_alignment, bool alloc_pyramid, int num_planes"; add_proto qw/void aom_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; add_proto qw/void aom_yv12_copy_frame/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, const int num_planes"; -add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; +add_proto qw/void aom_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int use_crop"; -add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc"; +add_proto qw/void aom_yv12_copy_u/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop"; -add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc"; +add_proto qw/void aom_yv12_copy_v/, "const struct yv12_buffer_config *src_bc, struct yv12_buffer_config *dst_bc, int use_crop"; add_proto qw/void aom_yv12_partial_copy_y/, "const struct yv12_buffer_config *src_ybc, int hstart1, int hend1, int vstart1, int vend1, struct yv12_buffer_config *dst_ybc, int hstart2, int vstart2"; add_proto qw/void aom_yv12_partial_coloc_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int hstart, int hend, int vstart, int vend"; @@ -47,7 +49,7 @@ add_proto qw/void aom_extend_frame_borders_plane_row/, "const struct yv12_buffer_config *ybf, int plane, int v_start, int v_end"; -add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; +add_proto qw/void aom_extend_frame_borders/, "struct yv12_buffer_config *ybf, int num_planes"; add_proto qw/void aom_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf, const int num_planes"; diff -Nru 
aom-3.8.2/aom_scale/generic/yv12config.c aom-3.9.0/aom_scale/generic/yv12config.c --- aom-3.8.2/aom_scale/generic/yv12config.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_scale/generic/yv12config.c 2024-05-07 19:57:02.636000000 +0000 @@ -11,9 +11,12 @@ #include +#include "config/aom_config.h" + +#include "aom/aom_image.h" #include "aom/internal/aom_image_internal.h" -#include "aom_dsp/pyramid.h" #include "aom_dsp/flow_estimation/corner_detect.h" +#include "aom_dsp/pyramid.h" #include "aom_mem/aom_mem.h" #include "aom_ports/mem.h" #include "aom_scale/yv12config.h" @@ -60,7 +63,7 @@ const uint64_t uvplane_size, const int aligned_width, const int aligned_height, const int uv_width, const int uv_height, const int uv_stride, const int uv_border_w, const int uv_border_h, - int num_pyramid_levels, int alloc_y_plane_only) { + bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { const int aom_byte_align = (byte_alignment == 0) ? 1 : byte_alignment; const uint64_t frame_size = @@ -71,8 +74,8 @@ #if CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER // We should only need an 8-bit version of the source frame if we are // encoding in non-realtime mode - (void)num_pyramid_levels; - assert(num_pyramid_levels == 0); + (void)alloc_pyramid; + assert(!alloc_pyramid); #endif // CONFIG_REALTIME_ONLY || !CONFIG_AV1_ENCODER #if defined AOM_MAX_ALLOCABLE_MEMORY @@ -80,9 +83,8 @@ uint64_t alloc_size = frame_size; #if CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY // The size of ybf->y_pyramid - if (num_pyramid_levels > 0) { - alloc_size += aom_get_pyramid_alloc_size( - width, height, num_pyramid_levels, use_highbitdepth); + if (alloc_pyramid) { + alloc_size += aom_get_pyramid_alloc_size(width, height, use_highbitdepth); alloc_size += av1_get_corner_list_size(); } #endif // CONFIG_AV1_ENCODER && !CONFIG_REALTIME_ONLY @@ -190,9 +192,8 @@ av1_free_corner_list(ybf->corners); ybf->corners = NULL; } - if (num_pyramid_levels > 0) { - ybf->y_pyramid = aom_alloc_pyramid(width, height, num_pyramid_levels, - use_highbitdepth); + if (alloc_pyramid) { + ybf->y_pyramid = aom_alloc_pyramid(width, height, use_highbitdepth); if (!ybf->y_pyramid) return AOM_CODEC_MEM_ERROR; ybf->corners = av1_alloc_corner_list(); if (!ybf->corners) return AOM_CODEC_MEM_ERROR; @@ -237,7 +238,7 @@ int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, - int num_pyramid_levels, int alloc_y_plane_only) { + bool alloc_pyramid, int alloc_y_plane_only) { #if CONFIG_SIZE_LIMIT if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT) return AOM_CODEC_MEM_ERROR; @@ -264,21 +265,20 @@ ybf, width, height, ss_x, ss_y, use_highbitdepth, border, byte_alignment, fb, cb, cb_priv, y_stride, yplane_size, uvplane_size, aligned_width, aligned_height, uv_width, uv_height, uv_stride, - uv_border_w, uv_border_h, num_pyramid_levels, alloc_y_plane_only); + uv_border_w, uv_border_h, alloc_pyramid, alloc_y_plane_only); } return AOM_CODEC_MEM_ERROR; } int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, - int byte_alignment, int num_pyramid_levels, + int byte_alignment, bool alloc_pyramid, int alloc_y_plane_only) { if (ybf) { aom_free_frame_buffer(ybf); - return aom_realloc_frame_buffer(ybf, width, height, ss_x, ss_y, - use_highbitdepth, border, byte_alignment, - NULL, NULL, NULL, num_pyramid_levels, - alloc_y_plane_only); + return aom_realloc_frame_buffer( + ybf, width, height, ss_x, ss_y, use_highbitdepth, border, + byte_alignment, 
NULL, NULL, NULL, alloc_pyramid, alloc_y_plane_only); } return AOM_CODEC_MEM_ERROR; } diff -Nru aom-3.8.2/aom_scale/generic/yv12extend.c aom-3.9.0/aom_scale/generic/yv12extend.c --- aom-3.8.2/aom_scale/generic/yv12extend.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_scale/generic/yv12extend.c 2024-05-07 19:57:02.637000000 +0000 @@ -302,8 +302,10 @@ } void aom_yv12_copy_y_c(const YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc) { + YV12_BUFFER_CONFIG *dst_ybc, int use_crop) { int row; + int width = use_crop ? src_ybc->y_crop_width : src_ybc->y_width; + int height = use_crop ? src_ybc->y_crop_height : src_ybc->y_height; const uint8_t *src = src_ybc->y_buffer; uint8_t *dst = dst_ybc->y_buffer; @@ -311,8 +313,8 @@ if (src_ybc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (row = 0; row < src_ybc->y_height; ++row) { - memcpy(dst16, src16, src_ybc->y_width * sizeof(uint16_t)); + for (row = 0; row < height; ++row) { + memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_ybc->y_stride; dst16 += dst_ybc->y_stride; } @@ -320,56 +322,60 @@ } #endif - for (row = 0; row < src_ybc->y_height; ++row) { - memcpy(dst, src, src_ybc->y_width); + for (row = 0; row < height; ++row) { + memcpy(dst, src, width); src += src_ybc->y_stride; dst += dst_ybc->y_stride; } } void aom_yv12_copy_u_c(const YV12_BUFFER_CONFIG *src_bc, - YV12_BUFFER_CONFIG *dst_bc) { + YV12_BUFFER_CONFIG *dst_bc, int use_crop) { int row; + int width = use_crop ? src_bc->uv_crop_width : src_bc->uv_width; + int height = use_crop ? src_bc->uv_crop_height : src_bc->uv_height; const uint8_t *src = src_bc->u_buffer; uint8_t *dst = dst_bc->u_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + for (row = 0; row < height; ++row) { + memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst, src, src_bc->uv_width); + for (row = 0; row < height; ++row) { + memcpy(dst, src, width); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } } void aom_yv12_copy_v_c(const YV12_BUFFER_CONFIG *src_bc, - YV12_BUFFER_CONFIG *dst_bc) { + YV12_BUFFER_CONFIG *dst_bc, int use_crop) { int row; + int width = use_crop ? src_bc->uv_crop_width : src_bc->uv_width; + int height = use_crop ? 
src_bc->uv_crop_height : src_bc->uv_height; const uint8_t *src = src_bc->v_buffer; uint8_t *dst = dst_bc->v_buffer; #if CONFIG_AV1_HIGHBITDEPTH if (src_bc->flags & YV12_FLAG_HIGHBITDEPTH) { const uint16_t *src16 = CONVERT_TO_SHORTPTR(src); uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst16, src16, src_bc->uv_width * sizeof(uint16_t)); + for (row = 0; row < height; ++row) { + memcpy(dst16, src16, width * sizeof(uint16_t)); src16 += src_bc->uv_stride; dst16 += dst_bc->uv_stride; } return; } #endif - for (row = 0; row < src_bc->uv_height; ++row) { - memcpy(dst, src, src_bc->uv_width); + for (row = 0; row < height; ++row) { + memcpy(dst, src, width); src += src_bc->uv_stride; dst += dst_bc->uv_stride; } @@ -491,8 +497,8 @@ } int aom_yv12_realloc_with_new_border_c(YV12_BUFFER_CONFIG *ybf, int new_border, - int byte_alignment, - int num_pyramid_levels, int num_planes) { + int byte_alignment, bool alloc_pyramid, + int num_planes) { if (ybf) { if (new_border == ybf->border) return 0; YV12_BUFFER_CONFIG new_buf; @@ -500,7 +506,7 @@ const int error = aom_alloc_frame_buffer( &new_buf, ybf->y_crop_width, ybf->y_crop_height, ybf->subsampling_x, ybf->subsampling_y, ybf->flags & YV12_FLAG_HIGHBITDEPTH, new_border, - byte_alignment, num_pyramid_levels, 0); + byte_alignment, alloc_pyramid, 0); if (error) return error; // Copy image buffer aom_yv12_copy_frame(ybf, &new_buf, num_planes); diff -Nru aom-3.8.2/aom_scale/yv12config.h aom-3.9.0/aom_scale/yv12config.h --- aom-3.8.2/aom_scale/yv12config.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_scale/yv12config.h 2024-05-07 19:57:02.637000000 +0000 @@ -16,6 +16,8 @@ extern "C" { #endif +#include + #include "config/aom_config.h" #include "aom/aom_codec.h" @@ -45,18 +47,29 @@ /*!\cond */ union { struct { + // The aligned frame width of luma. + // It is aligned to a multiple of 8: + // y_width = (y_crop_width + 7) & ~7 int y_width; + // The aligned frame width of chroma. + // uv_width = y_width >> subsampling_x int uv_width; }; int widths[2]; }; union { struct { + // The aligned frame height of luma. + // It is aligned to a multiple of 8: + // y_height = (y_crop_height + 7) & ~7 int y_height; + // The aligned frame height of chroma. + // uv_height = y_height >> subsampling_y int uv_height; }; int heights[2]; }; + // The frame size en/decoded by AV1 union { struct { int y_crop_width; @@ -139,7 +152,7 @@ // available return values. int aom_alloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, int ss_x, int ss_y, int use_highbitdepth, int border, - int byte_alignment, int num_pyramid_levels, + int byte_alignment, bool alloc_pyramid, int alloc_y_plane_only); // Updates the yv12 buffer config with the frame buffer. |byte_alignment| must @@ -149,15 +162,11 @@ // to decode the current frame. If cb is NULL, libaom will allocate memory // internally to decode the current frame. // -// If num_pyramid_levels > 0, then an image pyramid will be allocated with -// the specified number of levels. -// -// Any buffer which may become a source or ref frame buffer in the encoder -// must have num_pyramid_levels = cpi->image_pyramid_levels. This will cause -// an image pyramid to be allocated if one is needed. -// -// Any other buffers (in particular, any buffers inside the decoder) -// must have cpi->image_pyramid_levels = 0, as a pyramid is unneeded there. +// If alloc_pyramid is true, then an image pyramid will be allocated +// for use in global motion estimation. 
This is only needed if this frame +// buffer will be used to store a source frame or a reference frame in +// the encoder. Any other framebuffers (eg, intermediates for filtering, +// or any buffer in the decoder) can set alloc_pyramid = false. // // Returns 0 on success. Returns < 0 on failure. int aom_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, int width, int height, @@ -165,7 +174,7 @@ int border, int byte_alignment, aom_codec_frame_buffer_t *fb, aom_get_frame_buffer_cb_fn_t cb, void *cb_priv, - int num_pyramid_levels, int alloc_y_plane_only); + bool alloc_pyramid, int alloc_y_plane_only); int aom_free_frame_buffer(YV12_BUFFER_CONFIG *ybf); diff -Nru aom-3.8.2/aom_util/aom_pthread.h aom-3.9.0/aom_util/aom_pthread.h --- aom-3.8.2/aom_util/aom_pthread.h 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/aom_util/aom_pthread.h 2024-05-07 19:57:02.640000000 +0000 @@ -0,0 +1,171 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +// +// pthread.h wrapper + +#ifndef AOM_AOM_UTIL_AOM_PTHREAD_H_ +#define AOM_AOM_UTIL_AOM_PTHREAD_H_ + +#include "config/aom_config.h" + +#if CONFIG_MULTITHREAD + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(_WIN32) && !HAVE_PTHREAD_H +// Prevent leaking max/min macros. +#undef NOMINMAX +#define NOMINMAX +#undef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include // NOLINT +typedef HANDLE pthread_t; +typedef int pthread_attr_t; +typedef CRITICAL_SECTION pthread_mutex_t; + +#if _WIN32_WINNT < 0x0600 +#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. 
+#endif +typedef CONDITION_VARIABLE pthread_cond_t; + +#ifndef WINAPI_FAMILY_PARTITION +#define WINAPI_PARTITION_DESKTOP 1 +#define WINAPI_FAMILY_PARTITION(x) x +#endif + +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define USE_CREATE_THREAD +#endif + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +// _beginthreadex requires __stdcall +#if defined(__GNUC__) && \ + (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 2)) +#define THREADFN __attribute__((force_align_arg_pointer)) unsigned int __stdcall +#else +#define THREADFN unsigned int __stdcall +#endif +#define THREAD_EXIT_SUCCESS 0 + +static INLINE int pthread_attr_init(pthread_attr_t *attr) { + (void)attr; + return 0; +} + +static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { + (void)attr; + return 0; +} + +static INLINE int pthread_create(pthread_t *const thread, + const pthread_attr_t *attr, + unsigned int(__stdcall *start)(void *), + void *arg) { + (void)attr; +#ifdef USE_CREATE_THREAD + *thread = CreateThread(NULL, /* lpThreadAttributes */ + 0, /* dwStackSize */ + start, arg, 0, /* dwStackSize */ + NULL); /* lpThreadId */ +#else + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, arg, 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ +#endif + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static INLINE int pthread_join(pthread_t thread, void **value_ptr) { + (void)value_ptr; + return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != + WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, + void *mutexattr) { + (void)mutexattr; + InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); + return 0; +} + +static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { + return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; +} + +static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition +static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { + (void)condition; + return 0; +} + +static INLINE int pthread_cond_init(pthread_cond_t *const condition, + void *cond_attr) { + (void)cond_attr; + InitializeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { + WakeConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { + WakeAllConditionVariable(condition); + return 0; +} + +static INLINE int pthread_cond_wait(pthread_cond_t *const condition, + pthread_mutex_t *const mutex) { + int ok; + ok = SleepConditionVariableCS(condition, mutex, INFINITE); + return !ok; +} +#else // _WIN32 +#include // NOLINT +#define THREADFN void * +#define THREAD_EXIT_SUCCESS NULL +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // CONFIG_MULTITHREAD + +#endif // AOM_AOM_UTIL_AOM_PTHREAD_H_ diff -Nru aom-3.8.2/aom_util/aom_thread.c aom-3.9.0/aom_util/aom_thread.c --- aom-3.8.2/aom_util/aom_thread.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_util/aom_thread.c 2024-05-07 19:57:02.641000000 +0000 @@ -23,8 +23,11 @@ #include #include // for memset() +#include "config/aom_config.h" + #include "aom_mem/aom_mem.h" #include "aom_ports/sanitizer.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #if CONFIG_MULTITHREAD @@ -65,29 +68,30 @@ #endif pthread_mutex_lock(&worker->impl_->mutex_); for (;;) { - while (worker->status_ == OK) { // wait in idling mode + while (worker->status_ == AVX_WORKER_STATUS_OK) { // wait in idling mode pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } - if (worker->status_ == WORK) { - // When worker->status_ is WORK, the main thread doesn't change - // worker->status_ and will wait until the worker changes worker->status_ - // to OK. See change_state(). So the worker can safely call execute() - // without holding worker->impl_->mutex_. When the worker reacquires - // worker->impl_->mutex_, worker->status_ must still be WORK. + if (worker->status_ == AVX_WORKER_STATUS_WORKING) { + // When worker->status_ is AVX_WORKER_STATUS_WORKING, the main thread + // doesn't change worker->status_ and will wait until the worker changes + // worker->status_ to AVX_WORKER_STATUS_OK. See change_state(). So the + // worker can safely call execute() without holding worker->impl_->mutex_. + // When the worker reacquires worker->impl_->mutex_, worker->status_ must + // still be AVX_WORKER_STATUS_WORKING. 
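The comment above spells out the handshake under the renamed states: the main thread only moves a worker out of AVX_WORKER_STATUS_OK, and the worker thread only moves itself back to it. Below is a minimal sketch of how a caller drives that state machine through the AVxWorkerInterface vtable; the add_hook callback and run_one_job() wrapper are illustrative, and the interface, field and status names are assumed to match aom_util/aom_thread.h:

#include "aom_util/aom_thread.h"

// Illustrative hook: a non-zero return means success; zero marks the worker
// as having had an error, which sync() then reports.
static int add_hook(void *arg1, void *arg2) {
  int *sum = (int *)arg1;
  const int *addend = (const int *)arg2;
  *sum += *addend;
  return 1;
}

static int run_one_job(int *sum, int *addend) {
  const AVxWorkerInterface *const iface = aom_get_worker_interface();
  AVxWorker worker;
  iface->init(&worker);                  // AVX_WORKER_STATUS_NOT_OK
  if (!iface->reset(&worker)) return 0;  // thread spawned, ..._OK
  worker.hook = add_hook;
  worker.data1 = sum;
  worker.data2 = addend;
  iface->launch(&worker);                // ..._WORKING, hook runs on the worker
  const int ok = iface->sync(&worker);   // blocks until the worker is ..._OK
  iface->end(&worker);                   // joins the thread, back to ..._NOT_OK
  return ok;
}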
pthread_mutex_unlock(&worker->impl_->mutex_); execute(worker); pthread_mutex_lock(&worker->impl_->mutex_); - assert(worker->status_ == WORK); - worker->status_ = OK; + assert(worker->status_ == AVX_WORKER_STATUS_WORKING); + worker->status_ = AVX_WORKER_STATUS_OK; // signal to the main thread that we're done (for sync()) pthread_cond_signal(&worker->impl_->condition_); } else { - assert(worker->status_ == NOT_OK); // finish the worker + assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK); // finish the worker break; } } pthread_mutex_unlock(&worker->impl_->mutex_); - return THREAD_RETURN(NULL); // Thread is finished + return THREAD_EXIT_SUCCESS; // Thread is finished } // main thread state control @@ -98,13 +102,13 @@ if (worker->impl_ == NULL) return; pthread_mutex_lock(&worker->impl_->mutex_); - if (worker->status_ >= OK) { + if (worker->status_ >= AVX_WORKER_STATUS_OK) { // wait for the worker to finish - while (worker->status_ != OK) { + while (worker->status_ != AVX_WORKER_STATUS_OK) { pthread_cond_wait(&worker->impl_->condition_, &worker->impl_->mutex_); } // assign new status and release the working thread if needed - if (new_status != OK) { + if (new_status != AVX_WORKER_STATUS_OK) { worker->status_ = new_status; pthread_cond_signal(&worker->impl_->condition_); } @@ -118,21 +122,21 @@ static void init(AVxWorker *const worker) { memset(worker, 0, sizeof(*worker)); - worker->status_ = NOT_OK; + worker->status_ = AVX_WORKER_STATUS_NOT_OK; } static int sync(AVxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, OK); + change_state(worker, AVX_WORKER_STATUS_OK); #endif - assert(worker->status_ <= OK); + assert(worker->status_ <= AVX_WORKER_STATUS_OK); return !worker->had_error; } static int reset(AVxWorker *const worker) { int ok = 1; worker->had_error = 0; - if (worker->status_ < OK) { + if (worker->status_ < AVX_WORKER_STATUS_OK) { #if CONFIG_MULTITHREAD worker->impl_ = (AVxWorkerImpl *)aom_calloc(1, sizeof(*worker->impl_)); if (worker->impl_ == NULL) { @@ -164,7 +168,7 @@ #endif pthread_mutex_lock(&worker->impl_->mutex_); ok = !pthread_create(&worker->impl_->thread_, &attr, thread_loop, worker); - if (ok) worker->status_ = OK; + if (ok) worker->status_ = AVX_WORKER_STATUS_OK; pthread_mutex_unlock(&worker->impl_->mutex_); pthread_attr_destroy(&attr); if (!ok) { @@ -177,12 +181,12 @@ return 0; } #else - worker->status_ = OK; + worker->status_ = AVX_WORKER_STATUS_OK; #endif - } else if (worker->status_ > OK) { + } else if (worker->status_ > AVX_WORKER_STATUS_OK) { ok = sync(worker); } - assert(!ok || (worker->status_ == OK)); + assert(!ok || (worker->status_ == AVX_WORKER_STATUS_OK)); return ok; } @@ -194,7 +198,7 @@ static void launch(AVxWorker *const worker) { #if CONFIG_MULTITHREAD - change_state(worker, WORK); + change_state(worker, AVX_WORKER_STATUS_WORKING); #else execute(worker); #endif @@ -203,7 +207,7 @@ static void end(AVxWorker *const worker) { #if CONFIG_MULTITHREAD if (worker->impl_ != NULL) { - change_state(worker, NOT_OK); + change_state(worker, AVX_WORKER_STATUS_NOT_OK); pthread_join(worker->impl_->thread_, NULL); pthread_mutex_destroy(&worker->impl_->mutex_); pthread_cond_destroy(&worker->impl_->condition_); @@ -211,10 +215,10 @@ worker->impl_ = NULL; } #else - worker->status_ = NOT_OK; + worker->status_ = AVX_WORKER_STATUS_NOT_OK; assert(worker->impl_ == NULL); #endif - assert(worker->status_ == NOT_OK); + assert(worker->status_ == AVX_WORKER_STATUS_NOT_OK); } //------------------------------------------------------------------------------ diff -Nru 
aom-3.8.2/aom_util/aom_thread.h aom-3.9.0/aom_util/aom_thread.h --- aom-3.8.2/aom_util/aom_thread.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_util/aom_thread.h 2024-05-07 19:57:02.642000000 +0000 @@ -17,157 +17,17 @@ #ifndef AOM_AOM_UTIL_AOM_THREAD_H_ #define AOM_AOM_UTIL_AOM_THREAD_H_ -#include "config/aom_config.h" - #ifdef __cplusplus extern "C" { #endif #define MAX_NUM_THREADS 64 -#if CONFIG_MULTITHREAD - -#if defined(_WIN32) && !HAVE_PTHREAD_H -// Prevent leaking max/min macros. -#undef NOMINMAX -#define NOMINMAX -#undef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#include // NOLINT -#include // NOLINT -#include // NOLINT -typedef HANDLE pthread_t; -typedef int pthread_attr_t; -typedef CRITICAL_SECTION pthread_mutex_t; - -#if _WIN32_WINNT < 0x0600 -#error _WIN32_WINNT must target Windows Vista / Server 2008 or newer. -#endif -typedef CONDITION_VARIABLE pthread_cond_t; - -#ifndef WINAPI_FAMILY_PARTITION -#define WINAPI_PARTITION_DESKTOP 1 -#define WINAPI_FAMILY_PARTITION(x) x -#endif - -#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#define USE_CREATE_THREAD -#endif - -//------------------------------------------------------------------------------ -// simplistic pthread emulation layer - -// _beginthreadex requires __stdcall -#define THREADFN unsigned int __stdcall -#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) - -static INLINE int pthread_attr_init(pthread_attr_t *attr) { - (void)attr; - return 0; -} - -static INLINE int pthread_attr_destroy(pthread_attr_t *attr) { - (void)attr; - return 0; -} - -static INLINE int pthread_create(pthread_t *const thread, - const pthread_attr_t *attr, - unsigned int(__stdcall *start)(void *), - void *arg) { - (void)attr; -#ifdef USE_CREATE_THREAD - *thread = CreateThread(NULL, /* lpThreadAttributes */ - 0, /* dwStackSize */ - start, arg, 0, /* dwStackSize */ - NULL); /* lpThreadId */ -#else - *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ - 0, /* unsigned stack_size */ - start, arg, 0, /* unsigned initflag */ - NULL); /* unsigned *thrdaddr */ -#endif - if (*thread == NULL) return 1; - SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); - return 0; -} - -static INLINE int pthread_join(pthread_t thread, void **value_ptr) { - (void)value_ptr; - return (WaitForSingleObjectEx(thread, INFINITE, FALSE /*bAlertable*/) != - WAIT_OBJECT_0 || - CloseHandle(thread) == 0); -} - -// Mutex -static INLINE int pthread_mutex_init(pthread_mutex_t *const mutex, - void *mutexattr) { - (void)mutexattr; - InitializeCriticalSectionEx(mutex, 0 /*dwSpinCount*/, 0 /*Flags*/); - return 0; -} - -static INLINE int pthread_mutex_trylock(pthread_mutex_t *const mutex) { - return TryEnterCriticalSection(mutex) ? 
0 : EBUSY; -} - -static INLINE int pthread_mutex_lock(pthread_mutex_t *const mutex) { - EnterCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_unlock(pthread_mutex_t *const mutex) { - LeaveCriticalSection(mutex); - return 0; -} - -static INLINE int pthread_mutex_destroy(pthread_mutex_t *const mutex) { - DeleteCriticalSection(mutex); - return 0; -} - -// Condition -static INLINE int pthread_cond_destroy(pthread_cond_t *const condition) { - (void)condition; - return 0; -} - -static INLINE int pthread_cond_init(pthread_cond_t *const condition, - void *cond_attr) { - (void)cond_attr; - InitializeConditionVariable(condition); - return 0; -} - -static INLINE int pthread_cond_signal(pthread_cond_t *const condition) { - WakeConditionVariable(condition); - return 0; -} - -static INLINE int pthread_cond_broadcast(pthread_cond_t *const condition) { - WakeAllConditionVariable(condition); - return 0; -} - -static INLINE int pthread_cond_wait(pthread_cond_t *const condition, - pthread_mutex_t *const mutex) { - int ok; - ok = SleepConditionVariableCS(condition, mutex, INFINITE); - return !ok; -} -#else // _WIN32 -#include // NOLINT -#define THREADFN void * -#define THREAD_RETURN(val) val -#endif - -#endif // CONFIG_MULTITHREAD - // State of the worker thread object typedef enum { - NOT_OK = 0, // object is unusable - OK, // ready to work - WORK // busy finishing the current task + AVX_WORKER_STATUS_NOT_OK = 0, // object is unusable + AVX_WORKER_STATUS_OK, // ready to work + AVX_WORKER_STATUS_WORKING // busy finishing the current task } AVxWorkerStatus; // Function to be called by the worker thread. Takes two opaque pointers as diff -Nru aom-3.8.2/aom_util/aom_util.cmake aom-3.9.0/aom_util/aom_util.cmake --- aom-3.8.2/aom_util/aom_util.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_util/aom_util.cmake 2024-05-07 19:57:02.642000000 +0000 @@ -13,7 +13,8 @@ endif() # AOM_AOM_UTIL_AOM_UTIL_CMAKE_ set(AOM_AOM_UTIL_AOM_UTIL_CMAKE_ 1) -list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_thread.c" +list(APPEND AOM_UTIL_SOURCES "${AOM_ROOT}/aom_util/aom_pthread.h" + "${AOM_ROOT}/aom_util/aom_thread.c" "${AOM_ROOT}/aom_util/aom_thread.h" "${AOM_ROOT}/aom_util/endian_inl.h") diff -Nru aom-3.8.2/aom_util/debug_util.c aom-3.9.0/aom_util/debug_util.c --- aom-3.8.2/aom_util/debug_util.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/aom_util/debug_util.c 2024-05-07 19:57:02.642000000 +0000 @@ -108,7 +108,7 @@ static int frame_stride = MAX_FRAME_STRIDE; static int frame_height = MAX_FRAME_HEIGHT; static int frame_size = MAX_FRAME_STRIDE * MAX_FRAME_HEIGHT; -void mismatch_move_frame_idx_w() { +void mismatch_move_frame_idx_w(void) { frame_buf_idx_w = (frame_buf_idx_w + 1) % max_frame_buf_num; if (frame_buf_idx_w == frame_buf_idx_r) { printf("frame_buf overflow\n"); @@ -125,7 +125,7 @@ } } -void mismatch_move_frame_idx_r() { +void mismatch_move_frame_idx_r(void) { if (frame_buf_idx_w == frame_buf_idx_r) { printf("frame_buf underflow\n"); assert(0); diff -Nru aom-3.8.2/apps/aomdec.c aom-3.9.0/apps/aomdec.c --- aom-3.8.2/apps/aomdec.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/apps/aomdec.c 2024-05-07 19:57:02.645000000 +0000 @@ -834,6 +834,8 @@ dx_time += aom_usec_timer_elapsed(&timer); got_data = 0; + // TODO(aomedia:3519): Change the prototype of aom_codec_get_frame_fn_t to + // facilitate error handling. 
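With the Win32 emulation layer moved out of aom_thread.h and into the new aom_util/aom_pthread.h, which aom_util.cmake now lists, code that needs a raw mutex or condition variable includes the wrapper header directly instead of picking the types up transitively. A minimal consumer sketch, assuming CONFIG_MULTITHREAD is enabled; the JobCounter type is illustrative and not part of libaom:

#include "aom_util/aom_pthread.h"

typedef struct {
  pthread_mutex_t lock;  // CRITICAL_SECTION shim on Win32, real pthread elsewhere
  int jobs_done;
} JobCounter;

static void job_counter_init(JobCounter *c) {
  pthread_mutex_init(&c->lock, NULL);
  c->jobs_done = 0;
}

static void job_counter_bump(JobCounter *c) {
  pthread_mutex_lock(&c->lock);
  ++c->jobs_done;
  pthread_mutex_unlock(&c->lock);
}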
while ((img = aom_codec_get_frame(&decoder, &iter))) { ++frame_out; got_data = 1; diff -Nru aom-3.8.2/apps/aomenc.c aom-3.9.0/apps/aomenc.c --- aom-3.8.2/apps/aomenc.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/apps/aomenc.c 2024-05-07 19:57:02.647000000 +0000 @@ -1533,30 +1533,36 @@ if (stream->config.vmaf_model_path) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_VMAF_MODEL_PATH, stream->config.vmaf_model_path); + ctx_exit_on_error(&stream->encoder, "Failed to set vmaf model path"); } #endif if (stream->config.partition_info_path) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_PARTITION_INFO_PATH, stream->config.partition_info_path); + ctx_exit_on_error(&stream->encoder, "Failed to set partition info path"); } if (stream->config.enable_rate_guide_deltaq) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_ENABLE_RATE_GUIDE_DELTAQ, stream->config.enable_rate_guide_deltaq); + ctx_exit_on_error(&stream->encoder, "Failed to enable rate guide deltaq"); } if (stream->config.rate_distribution_info) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_RATE_DISTRIBUTION_INFO, stream->config.rate_distribution_info); + ctx_exit_on_error(&stream->encoder, "Failed to set rate distribution info"); } if (stream->config.film_grain_filename) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_FILM_GRAIN_TABLE, stream->config.film_grain_filename); + ctx_exit_on_error(&stream->encoder, "Failed to set film grain table"); } AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_COLOR_RANGE, stream->config.color_range); + ctx_exit_on_error(&stream->encoder, "Failed to set color range"); #if CONFIG_AV1_DECODER if (global->test_decode != TEST_DECODE_OFF) { @@ -2245,17 +2251,25 @@ AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X, input.y4m.dst_c_dec_h >> 1); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling x"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y, input.y4m.dst_c_dec_v >> 1); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling y"); } else if (input.bit_depth == 12 && input.file_type == FILE_TYPE_RAW) { AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_X, stream->chroma_subsampling_x); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling x"); AOM_CODEC_CONTROL_TYPECHECKED(&stream->encoder, AV1E_SET_CHROMA_SUBSAMPLING_Y, stream->chroma_subsampling_y); + ctx_exit_on_error(&stream->encoder, + "Failed to set chroma subsampling y"); } break; default: break; diff -Nru aom-3.8.2/av1/av1.cmake aom-3.9.0/av1/av1.cmake --- aom-3.8.2/av1/av1.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/av1.cmake 2024-05-07 19:57:02.653000000 +0000 @@ -262,7 +262,6 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSE2 "${AOM_ROOT}/av1/common/x86/av1_txfm_sse2.h" - "${AOM_ROOT}/av1/common/x86/cdef_block_sse2.c" "${AOM_ROOT}/av1/common/x86/cfl_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_2d_sse2.c" "${AOM_ROOT}/av1/common/x86/convolve_sse2.c" @@ -272,11 +271,14 @@ list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3 "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.c" "${AOM_ROOT}/av1/common/x86/av1_inv_txfm_ssse3.h" - "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c" "${AOM_ROOT}/av1/common/x86/cfl_ssse3.c" "${AOM_ROOT}/av1/common/x86/jnt_convolve_ssse3.c" "${AOM_ROOT}/av1/common/x86/resize_ssse3.c") +# Fallbacks to support Valgrind on 32-bit x86 +list(APPEND AOM_AV1_COMMON_INTRIN_SSSE3_X86 + "${AOM_ROOT}/av1/common/x86/cdef_block_ssse3.c") + 
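The aomenc.c hunks above add a ctx_exit_on_error() check after each AOM_CODEC_CONTROL_TYPECHECKED call, so a failed control no longer goes unnoticed. A library user gets the same effect by checking the return value of the public aom_codec_control(); the wrapper below is an illustrative sketch using AV1E_SET_COLOR_RANGE, one of the controls touched above, and is not aomenc code:

#include <stdio.h>
#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

// Returns 0 on success; aomenc itself exits via ctx_exit_on_error() instead.
static int set_color_range_checked(aom_codec_ctx_t *encoder, int color_range) {
  if (aom_codec_control(encoder, AV1E_SET_COLOR_RANGE, color_range) !=
      AOM_CODEC_OK) {
    fprintf(stderr, "Failed to set color range: %s\n",
            aom_codec_error(encoder));
    return 1;
  }
  return 0;
}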
list(APPEND AOM_AV1_COMMON_INTRIN_SSE4_1 "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c" "${AOM_ROOT}/av1/common/x86/av1_convolve_scale_sse4.c" @@ -355,11 +357,13 @@ "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/av1_highbd_quantize_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/av1_k_means_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/ml_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.c" + "${AOM_ROOT}/av1/encoder/arm/neon/pickrst_neon.h" "${AOM_ROOT}/av1/encoder/arm/neon/quantize_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c" "${AOM_ROOT}/av1/encoder/arm/neon/reconinter_enc_neon.c" @@ -369,6 +373,10 @@ list(APPEND AOM_AV1_ENCODER_INTRIN_NEON_DOTPROD "${AOM_ROOT}/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c") +list(APPEND AOM_AV1_ENCODER_INTRIN_SVE + "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_sve.c" + "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_sve.c") + list(APPEND AOM_AV1_ENCODER_INTRIN_ARM_CRC32 "${AOM_ROOT}/av1/encoder/arm/crc32/hash_arm_crc32.c") @@ -401,6 +409,7 @@ "${AOM_ROOT}/av1/common/arm/warp_plane_neon_i8mm.c") list(APPEND AOM_AV1_COMMON_INTRIN_SVE + "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_sve.c" "${AOM_ROOT}/av1/common/arm/warp_plane_sve.c") list(APPEND AOM_AV1_ENCODER_INTRIN_SSE4_2 @@ -471,6 +480,10 @@ "${AOM_ROOT}/av1/common/arm/highbd_warp_plane_neon.c" "${AOM_ROOT}/av1/common/arm/highbd_wiener_convolve_neon.c") + list(APPEND AOM_AV1_COMMON_INTRIN_SVE2 + "${AOM_ROOT}/av1/common/arm/highbd_compound_convolve_sve2.c" + "${AOM_ROOT}/av1/common/arm/highbd_convolve_sve2.c") + list(APPEND AOM_AV1_ENCODER_INTRIN_SSE2 "${AOM_ROOT}/av1/encoder/x86/highbd_block_error_intrin_sse2.c" "${AOM_ROOT}/av1/encoder/x86/highbd_temporal_filter_sse2.c") @@ -511,6 +524,9 @@ "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c" "${AOM_ROOT}/av1/encoder/x86/cnn_avx2.c") + list(REMOVE_ITEM AOM_AV1_ENCODER_INTRIN_NEON + "${AOM_ROOT}/av1/encoder/arm/neon/cnn_neon.c") + list(REMOVE_ITEM AOM_AV1_ENCODER_SOURCES "${AOM_ROOT}/av1/encoder/cnn.c" "${AOM_ROOT}/av1/encoder/cnn.h" @@ -596,6 +612,10 @@ require_compiler_flag_nomsvc("-mssse3" NO) add_intrinsics_object_library("-mssse3" "ssse3" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SSSE3") + if(AOM_ARCH_X86) + add_intrinsics_object_library("-mssse3" "ssse3_x86" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SSSE3_X86") + endif() if(CONFIG_AV1_DECODER) if(AOM_AV1_DECODER_INTRIN_SSSE3) @@ -688,6 +708,15 @@ if(HAVE_SVE) add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_common" "AOM_AV1_COMMON_INTRIN_SVE") + if(CONFIG_AV1_ENCODER) + add_intrinsics_object_library("${AOM_SVE_FLAG}" "sve" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SVE") + endif() + endif() + + if(HAVE_SVE2) + add_intrinsics_object_library("${AOM_SVE2_FLAG}" "sve2" "aom_av1_common" + "AOM_AV1_COMMON_INTRIN_SVE2") endif() if(HAVE_VSX) diff -Nru aom-3.8.2/av1/av1_cx_iface.c aom-3.9.0/av1/av1_cx_iface.c --- aom-3.8.2/av1/av1_cx_iface.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/av1_cx_iface.c 2024-05-07 19:57:02.654000000 +0000 @@ -9,21 +9,28 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ #include +#include #include #include -#include "aom_mem/aom_mem.h" #include "config/aom_config.h" #include "config/aom_version.h" -#include "aom_ports/mem_ops.h" - +#include "aom/aomcx.h" #include "aom/aom_encoder.h" +#include "aom/aom_external_partition.h" +#include "aom/aom_image.h" #include "aom/internal/aom_codec_internal.h" - #include "aom_dsp/flow_estimation/flow_estimation.h" +#include "aom_mem/aom_mem.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" +#include "av1/av1_cx_iface.h" #include "av1/av1_iface_common.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/enums.h" +#include "av1/common/scale.h" #include "av1/encoder/bitstream.h" #include "av1/encoder/encoder.h" #include "av1/encoder/encoder_alloc.h" @@ -31,6 +38,7 @@ #include "av1/encoder/ethread.h" #include "av1/encoder/external_partition.h" #include "av1/encoder/firstpass.h" +#include "av1/encoder/lookahead.h" #include "av1/encoder/rc_utils.h" #include "av1/arg_defs.h" @@ -564,7 +572,10 @@ ratio->den /= denom; } -// Called by encoder_encode() only. Must not be called by encoder_init(). +// Called by encoder_encode() only. Must not be called by encoder_init() +// because the `error` paramerer will be destroyed by aom_codec_enc_init_ver() +// after encoder_init() returns an error. See the "IMPORTANT" comment in +// aom_codec_enc_init_ver(). static aom_codec_err_t update_error_state( aom_codec_alg_priv_t *ctx, const struct aom_internal_error_info *error) { const aom_codec_err_t res = error->error_code; @@ -947,7 +958,7 @@ return AOM_CODEC_OK; } -int av1_get_image_bps(const aom_image_t *img) { +static int get_image_bps(const aom_image_t *img) { switch (img->fmt) { case AOM_IMG_FMT_YV12: case AOM_IMG_FMT_NV12: @@ -1608,11 +1619,26 @@ bool is_sb_size_changed = false; av1_change_config_seq(ctx->ppi, &ctx->oxcf, &is_sb_size_changed); for (int i = 0; i < ctx->ppi->num_fp_contexts; i++) { - av1_change_config(ctx->ppi->parallel_cpi[i], &ctx->oxcf, - is_sb_size_changed); + AV1_COMP *const cpi = ctx->ppi->parallel_cpi[i]; + struct aom_internal_error_info *const error = cpi->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; } if (ctx->ppi->cpi_lap != NULL) { - av1_change_config(ctx->ppi->cpi_lap, &ctx->oxcf, is_sb_size_changed); + AV1_COMP *const cpi_lap = ctx->ppi->cpi_lap; + struct aom_internal_error_info *const error = cpi_lap->common.error; + if (setjmp(error->jmp)) { + error->setjmp = 0; + return error->error_code; + } + error->setjmp = 1; + av1_change_config(cpi_lap, &ctx->oxcf, is_sb_size_changed); + error->setjmp = 0; } } return res; @@ -1817,6 +1843,11 @@ va_list args) { struct av1_extracfg extra_cfg = ctx->extra_cfg; extra_cfg.enable_qm = CAST(AV1E_SET_ENABLE_QM, args); +#if !CONFIG_QUANT_MATRIX + if (extra_cfg.enable_qm) { + ERROR("QM can't be enabled with CONFIG_QUANT_MATRIX=0."); + } +#endif return update_extra_cfg(ctx, &extra_cfg); } static aom_codec_err_t ctrl_set_qm_y(aom_codec_alg_priv_t *ctx, va_list args) { @@ -2603,10 +2634,22 @@ return AOM_CODEC_OK; } +static aom_codec_err_t ctrl_set_svc_frame_drop_mode(aom_codec_alg_priv_t *ctx, + va_list args) { + AV1_PRIMARY *const ppi = ctx->ppi; + AV1_COMP *const cpi = ppi->cpi; + cpi->svc.framedrop_mode = CAST(AV1E_SET_SVC_FRAME_DROP_MODE, args); + if (cpi->svc.framedrop_mode != AOM_LAYER_DROP && + cpi->svc.framedrop_mode != AOM_FULL_SUPERFRAME_DROP) + return AOM_CODEC_INVALID_PARAM; + else + 
return AOM_CODEC_OK; +} + #if !CONFIG_REALTIME_ONLY -aom_codec_err_t av1_create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, - STATS_BUFFER_CTX *stats_buf_context, - int num_lap_buffers) { +static aom_codec_err_t create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, + STATS_BUFFER_CTX *stats_buf_context, + int num_lap_buffers) { aom_codec_err_t res = AOM_CODEC_OK; int size = get_stats_buf_size(num_lap_buffers, MAX_LAG_BUFFERS); @@ -2763,8 +2806,8 @@ if (!priv->ppi) return AOM_CODEC_MEM_ERROR; #if !CONFIG_REALTIME_ONLY - res = av1_create_stats_buffer(&priv->frame_stats_buffer, - &priv->stats_buf_context, *num_lap_buffers); + res = create_stats_buffer(&priv->frame_stats_buffer, + &priv->stats_buf_context, *num_lap_buffers); if (res != AOM_CODEC_OK) return AOM_CODEC_MEM_ERROR; assert(MAX_LAP_BUFFERS >= MAX_LAG_BUFFERS); @@ -2813,8 +2856,8 @@ } } -void av1_destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, - FIRSTPASS_STATS *frame_stats_buffer) { +static void destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, + FIRSTPASS_STATS *frame_stats_buffer) { aom_free(stats_buf_context->total_left_stats); aom_free(stats_buf_context->total_stats); aom_free(frame_stats_buffer); @@ -2875,7 +2918,7 @@ } av1_remove_primary_compressor(ppi); } - av1_destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer); + destroy_stats_buffer(&ctx->stats_buf_context, ctx->frame_stats_buffer); aom_free(ctx); return AOM_CODEC_OK; } @@ -2943,8 +2986,7 @@ if (res == AOM_CODEC_OK) { const size_t uncompressed_frame_sz = ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_w, 5) * - ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * - av1_get_image_bps(img) / 8; + ALIGN_POWER_OF_TWO_UNSIGNED(ctx->cfg.g_h, 5) * get_image_bps(img) / 8; // Due to the presence of no-show frames, the ctx->cx_data buffer holds // compressed data corresponding to multiple frames. 
As no-show frames are @@ -3042,11 +3084,36 @@ ctx->pts_offset = ptsvol; ctx->pts_offset_initialized = 1; } + if (ptsvol < ctx->pts_offset) { + aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, + "pts is smaller than initial pts"); + } ptsvol -= ctx->pts_offset; + if (ptsvol > INT64_MAX / cpi_data.timestamp_ratio->num) { + aom_internal_error( + &ppi->error, AOM_CODEC_INVALID_PARAM, + "conversion of relative pts to ticks would overflow"); + } int64_t src_time_stamp = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol); +#if ULONG_MAX > INT64_MAX + if (duration > INT64_MAX) { + aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, + "duration is too big"); + } +#endif + if (ptsvol > INT64_MAX - (int64_t)duration) { + aom_internal_error(&ppi->error, AOM_CODEC_INVALID_PARAM, + "relative pts + duration is too big"); + } + aom_codec_pts_t pts_end = ptsvol + (int64_t)duration; + if (pts_end > INT64_MAX / cpi_data.timestamp_ratio->num) { + aom_internal_error( + &ppi->error, AOM_CODEC_INVALID_PARAM, + "conversion of relative pts + duration to ticks would overflow"); + } int64_t src_end_time_stamp = - timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + duration); + timebase_units_to_ticks(cpi_data.timestamp_ratio, pts_end); YV12_BUFFER_CONFIG sd; res = image2yuvconfig(img, &sd); @@ -3080,18 +3147,27 @@ subsampling_x, subsampling_y, use_highbitdepth, lag_in_frames, src_border_in_pixels, cpi->common.features.byte_alignment, ctx->num_lap_buffers, (cpi->oxcf.kf_cfg.key_freq_max == 0), - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } if (!ppi->lookahead) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate lag buffers"); for (int i = 0; i < ppi->num_fp_contexts; i++) { - av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, - subsampling_x, subsampling_y); + aom_codec_err_t err = + av1_check_initial_width(ppi->parallel_cpi[i], use_highbitdepth, + subsampling_x, subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, err, + "av1_check_initial_width() failed"); + } } if (cpi_lap != NULL) { - av1_check_initial_width(cpi_lap, use_highbitdepth, subsampling_x, - subsampling_y); + aom_codec_err_t err = av1_check_initial_width( + cpi_lap, use_highbitdepth, subsampling_x, subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(&ppi->error, err, + "av1_check_initial_width() failed"); + } } // Store the original flags in to the frame buffer. Will extract the @@ -3155,17 +3231,6 @@ av1_create_workers(ppi, num_workers); av1_init_tile_thread_data(ppi, cpi->oxcf.pass == AOM_RC_FIRST_PASS); } -#if CONFIG_MULTITHREAD - if (ppi->p_mt_info.num_workers > 1) { - for (int i = 0; i < ppi->num_fp_contexts; i++) { - av1_init_mt_sync(ppi->parallel_cpi[i], - ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS); - } - if (cpi_lap != NULL) { - av1_init_mt_sync(cpi_lap, 1); - } - } -#endif // CONFIG_MULTITHREAD // Re-allocate thread data if workers for encoder multi-threading stage // exceeds prev_num_enc_workers. 
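The new checks above validate the incoming pts and duration before they are scaled by the timebase ratio, reporting AOM_CODEC_INVALID_PARAM instead of letting the multiplication wrap. A stand-alone sketch of the same guard follows; the function and parameter names are illustrative, and libaom's timebase_units_to_ticks() takes the ratio as an aom_rational64_t rather than two integers:

#include <stdint.h>

// Returns 0 and writes *ticks on success, -1 if pts * num would overflow.
static int checked_units_to_ticks(int64_t pts, int64_t num, int64_t den,
                                  int64_t *ticks) {
  if (num <= 0 || den <= 0) return -1;
  if (pts > INT64_MAX / num || pts < INT64_MIN / num) return -1;
  *ticks = pts * num / den;
  return 0;
}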
@@ -3187,6 +3252,17 @@ if (cpi_lap != NULL) { av1_init_frame_mt(ppi, cpi_lap); } +#if CONFIG_MULTITHREAD + if (ppi->p_mt_info.num_workers > 1) { + for (int i = 0; i < ppi->num_fp_contexts; i++) { + av1_init_mt_sync(ppi->parallel_cpi[i], + ppi->parallel_cpi[i]->oxcf.pass == AOM_RC_FIRST_PASS); + } + if (cpi_lap != NULL) { + av1_init_mt_sync(cpi_lap, 1); + } + } +#endif // CONFIG_MULTITHREAD // Call for LAP stage if (cpi_lap != NULL) { @@ -3194,11 +3270,8 @@ cpi_lap_data.flush = !img; cpi_lap_data.timestamp_ratio = &ctx->timestamp_ratio; const int status = av1_get_compressed_data(cpi_lap, &cpi_lap_data); - if (status != -1) { - if (status != AOM_CODEC_OK) { - aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s", - cpi->common.error->detail); - } + if (status > AOM_CODEC_OK) { + aom_internal_error_copy(&ppi->error, cpi_lap->common.error); } av1_post_encode_updates(cpi_lap, &cpi_lap_data); } @@ -3239,16 +3312,20 @@ status = av1_get_compressed_data(cpi, &cpi_data); } else if (ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] == 1) { - status = av1_compress_parallel_frames(ppi, &cpi_data); + // In case of an error, longjmp() would be invoked and hence "status" + // is set to AOM_CODEC_OK here. + av1_compress_parallel_frames(ppi, &cpi_data); + status = AOM_CODEC_OK; } else { + // No possibility of failures from this function and hence "status" is + // set to AOM_CODEC_OK here. cpi = av1_get_parallel_frame_enc_data(ppi, &cpi_data); status = AOM_CODEC_OK; } } if (status == -1) break; if (status != AOM_CODEC_OK) { - aom_internal_error(&ppi->error, cpi->common.error->error_code, "%s", - cpi->common.error->detail); + aom_internal_error_copy(&ppi->error, cpi->common.error); } if (ppi->num_fp_contexts > 0 && frame_is_intra_only(&cpi->common)) { av1_init_sc_decisions(ppi); @@ -3603,8 +3680,8 @@ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; lc->max_q = params->max_quantizers[layer]; lc->min_q = params->min_quantizers[layer]; - lc->scaling_factor_num = params->scaling_factor_num[sl]; - lc->scaling_factor_den = params->scaling_factor_den[sl]; + lc->scaling_factor_num = AOMMAX(1, params->scaling_factor_num[sl]); + lc->scaling_factor_den = AOMMAX(1, params->scaling_factor_den[sl]); const int layer_target_bitrate = params->layer_target_bitrate[layer]; if (layer_target_bitrate > INT_MAX / 1000) { lc->layer_target_bitrate = INT_MAX; @@ -4398,6 +4475,7 @@ { AV1E_SET_QUANTIZER_ONE_PASS, ctrl_set_quantizer_one_pass }, { AV1E_SET_BITRATE_ONE_PASS_CBR, ctrl_set_bitrate_one_pass_cbr }, { AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, ctrl_set_max_consec_frame_drop_cbr }, + { AV1E_SET_SVC_FRAME_DROP_MODE, ctrl_set_svc_frame_drop_mode }, // Getters { AOME_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff -Nru aom-3.8.2/av1/av1_cx_iface.h aom-3.9.0/av1/av1_cx_iface.h --- aom-3.8.2/av1/av1_cx_iface.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/av1_cx_iface.h 2024-05-07 19:57:02.661000000 +0000 @@ -20,13 +20,6 @@ AV1EncoderConfig av1_get_encoder_config(const aom_codec_enc_cfg_t *cfg); -aom_codec_err_t av1_create_stats_buffer(FIRSTPASS_STATS **frame_stats_buffer, - STATS_BUFFER_CTX *stats_buf_context, - int num_lap_buffers); - -void av1_destroy_stats_buffer(STATS_BUFFER_CTX *stats_buf_context, - FIRSTPASS_STATS *frame_stats_buffer); - aom_codec_err_t av1_create_context_and_bufferpool(AV1_PRIMARY *ppi, AV1_COMP **p_cpi, BufferPool **p_buffer_pool, @@ -37,8 +30,6 @@ void av1_destroy_context_and_bufferpool(AV1_COMP *cpi, BufferPool **p_buffer_pool); -int av1_get_image_bps(const aom_image_t *img); - 
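AV1E_SET_SVC_FRAME_DROP_MODE, registered in the control table above, accepts only AOM_LAYER_DROP or AOM_FULL_SUPERFRAME_DROP; anything else is rejected with AOM_CODEC_INVALID_PARAM by ctrl_set_svc_frame_drop_mode(). A minimal caller sketch, assuming the two constants are exposed through aom/aomcx.h as the handler suggests:

#include "aom/aom_encoder.h"
#include "aom/aomcx.h"

// Ask the SVC encoder to drop whole superframes rather than individual
// spatial layers when rate control has to drop.
static aom_codec_err_t use_full_superframe_drop(aom_codec_ctx_t *encoder) {
  return aom_codec_control(encoder, AV1E_SET_SVC_FRAME_DROP_MODE,
                           AOM_FULL_SUPERFRAME_DROP);
}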
#ifdef __cplusplus } // extern "C" #endif diff -Nru aom-3.8.2/av1/av1_dx_iface.c aom-3.9.0/av1/av1_dx_iface.c --- aom-3.8.2/av1/av1_dx_iface.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/av1_dx_iface.c 2024-05-07 19:57:02.662000000 +0000 @@ -19,18 +19,23 @@ #include "aom/internal/aom_image_internal.h" #include "aom/aomdx.h" #include "aom/aom_decoder.h" +#include "aom/aom_image.h" #include "aom_dsp/bitreader_buffer.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" #include "av1/common/frame_buffers.h" #include "av1/common/enums.h" #include "av1/common/obu_util.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodeframe.h" +#include "av1/decoder/dthread.h" #include "av1/decoder/grain_synthesis.h" #include "av1/decoder/obu.h" @@ -814,102 +819,111 @@ // simply a pointer to an integer index uintptr_t *index = (uintptr_t *)iter; - if (ctx->frame_worker != NULL) { - const AVxWorkerInterface *const winterface = aom_get_worker_interface(); - AVxWorker *const worker = ctx->frame_worker; - FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; - AV1Decoder *const pbi = frame_worker_data->pbi; - AV1_COMMON *const cm = &pbi->common; - CommonTileParams *const tiles = &cm->tiles; - // Wait for the frame from worker thread. - if (winterface->sync(worker)) { - // Check if worker has received any frames. - if (frame_worker_data->received_frame == 1) { - frame_worker_data->received_frame = 0; - check_resync(ctx, frame_worker_data->pbi); + if (ctx->frame_worker == NULL) { + return NULL; + } + const AVxWorkerInterface *const winterface = aom_get_worker_interface(); + AVxWorker *const worker = ctx->frame_worker; + FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1; + AV1Decoder *const pbi = frame_worker_data->pbi; + pbi->error.error_code = AOM_CODEC_OK; + pbi->error.has_detail = 0; + AV1_COMMON *const cm = &pbi->common; + CommonTileParams *const tiles = &cm->tiles; + // Wait for the frame from worker thread. + if (!winterface->sync(worker)) { + // Decoding failed. Release the worker thread. + frame_worker_data->received_frame = 0; + ctx->need_resync = 1; + // TODO(aomedia:3519): Set an error code. Check if a different error code + // should be used if ctx->flushed != 1. + return NULL; + } + // Check if worker has received any frames. 
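After the restructuring above, decoder_get_frame() resets pbi->error and returns NULL early when the frame worker is missing or when sync() reports a failed decode, in addition to the normal end-of-iteration case. The application-side pattern is unchanged; a minimal sketch (drain_frames() is illustrative, not libaom code):

#include "aom/aom_decoder.h"

// Pulls every frame produced by the last aom_codec_decode() call and returns
// how many were seen.
static int drain_frames(aom_codec_ctx_t *decoder) {
  aom_codec_iter_t iter = NULL;
  aom_image_t *img;
  int frames = 0;
  while ((img = aom_codec_get_frame(decoder, &iter)) != NULL) {
    ++frames;  // consume img here (convert, display, write out, ...)
  }
  // Per the TODO(aomedia:3519) in these hunks, a decode failure also surfaces
  // as NULL, so NULL alone does not yet distinguish "done" from "error".
  return frames;
}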
+ if (frame_worker_data->received_frame == 1) { + frame_worker_data->received_frame = 0; + check_resync(ctx, frame_worker_data->pbi); + } + YV12_BUFFER_CONFIG *sd; + aom_film_grain_t *grain_params; + if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, &grain_params) != + 0) { + return NULL; + } + RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; + ctx->last_show_frame = output_frame_buf; + if (ctx->need_resync) return NULL; + aom_img_remove_metadata(&ctx->img); + yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); + move_decoder_metadata_to_img(pbi, &ctx->img); + + if (!pbi->ext_tile_debug && tiles->large_scale) { + *index += 1; // Advance the iterator to point to the next image + aom_img_remove_metadata(&ctx->img); + yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL); + move_decoder_metadata_to_img(pbi, &ctx->img); + img = &ctx->img; + return img; + } + + const int num_planes = av1_num_planes(cm); + if (pbi->ext_tile_debug && tiles->single_tile_decoding && + pbi->dec_tile_row >= 0) { + int tile_width, tile_height; + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + return NULL; + } + const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1); + const int mi_row = tile_row * tile_height; + const int ssy = ctx->img.y_chroma_shift; + int plane; + ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += + mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; } - YV12_BUFFER_CONFIG *sd; - aom_film_grain_t *grain_params; - if (av1_get_raw_frame(frame_worker_data->pbi, *index, &sd, - &grain_params) == 0) { - RefCntBuffer *const output_frame_buf = pbi->output_frames[*index]; - ctx->last_show_frame = output_frame_buf; - if (ctx->need_resync) return NULL; - aom_img_remove_metadata(&ctx->img); - yuvconfig2image(&ctx->img, sd, frame_worker_data->user_priv); - move_decoder_metadata_to_img(pbi, &ctx->img); - - if (!pbi->ext_tile_debug && tiles->large_scale) { - *index += 1; // Advance the iterator to point to the next image - aom_img_remove_metadata(&ctx->img); - yuvconfig2image(&ctx->img, &pbi->tile_list_outbuf, NULL); - move_decoder_metadata_to_img(pbi, &ctx->img); - img = &ctx->img; - return img; - } - - const int num_planes = av1_num_planes(cm); - if (pbi->ext_tile_debug && tiles->single_tile_decoding && - pbi->dec_tile_row >= 0) { - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_row = AOMMIN(pbi->dec_tile_row, tiles->rows - 1); - const int mi_row = tile_row * tile_height; - const int ssy = ctx->img.y_chroma_shift; - int plane; - ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0]; - if (num_planes > 1) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += - mi_row * (MI_SIZE >> ssy) * ctx->img.stride[plane]; - } - } - ctx->img.d_h = - AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE; - } - - if (pbi->ext_tile_debug && tiles->single_tile_decoding && - pbi->dec_tile_col >= 0) { - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1); - const int mi_col = tile_col * tile_width; - const int ssx = ctx->img.x_chroma_shift; - const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
1 : 0; - int plane; - ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); - if (num_planes > 1) { - for (plane = 1; plane < MAX_MB_PLANE; ++plane) { - ctx->img.planes[plane] += - mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); - } - } - ctx->img.d_w = - AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE; - } + } + ctx->img.d_h = + AOMMIN(tile_height, cm->mi_params.mi_rows - mi_row) * MI_SIZE; + } - ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; - img = &ctx->img; - img->temporal_id = output_frame_buf->temporal_id; - img->spatial_id = output_frame_buf->spatial_id; - if (pbi->skip_film_grain) grain_params->apply_grain = 0; - aom_image_t *res = - add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); - if (!res) { - aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, - "Grain systhesis failed\n"); - } - *index += 1; // Advance the iterator to point to the next image - return res; + if (pbi->ext_tile_debug && tiles->single_tile_decoding && + pbi->dec_tile_col >= 0) { + int tile_width, tile_height; + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + return NULL; + } + const int tile_col = AOMMIN(pbi->dec_tile_col, tiles->cols - 1); + const int mi_col = tile_col * tile_width; + const int ssx = ctx->img.x_chroma_shift; + const int is_hbd = (ctx->img.fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 1 : 0; + int plane; + ctx->img.planes[0] += mi_col * MI_SIZE * (1 + is_hbd); + if (num_planes > 1) { + for (plane = 1; plane < MAX_MB_PLANE; ++plane) { + ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx) * (1 + is_hbd); } - } else { - // Decoding failed. Release the worker thread. - frame_worker_data->received_frame = 0; - ctx->need_resync = 1; - if (ctx->flushed != 1) return NULL; } + ctx->img.d_w = AOMMIN(tile_width, cm->mi_params.mi_cols - mi_col) * MI_SIZE; + } + + ctx->img.fb_priv = output_frame_buf->raw_frame_buffer.priv; + img = &ctx->img; + img->temporal_id = output_frame_buf->temporal_id; + img->spatial_id = output_frame_buf->spatial_id; + if (pbi->skip_film_grain) grain_params->apply_grain = 0; + aom_image_t *res = + add_grain_if_needed(ctx, img, &ctx->image_with_grain, grain_params); + if (!res) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + pbi->error.has_detail = 1; + snprintf(pbi->error.detail, sizeof(pbi->error.detail), + "Grain synthesis failed\n"); + return res; } - return NULL; + *index += 1; // Advance the iterator to point to the next image + return res; } static aom_codec_err_t decoder_set_fb_fn( @@ -917,16 +931,17 @@ aom_release_frame_buffer_cb_fn_t cb_release, void *cb_priv) { if (cb_get == NULL || cb_release == NULL) { return AOM_CODEC_INVALID_PARAM; - } else if (ctx->frame_worker == NULL) { + } + if (ctx->frame_worker != NULL) { // If the decoder has already been initialized, do not accept changes to // the frame buffer functions. 
- ctx->get_ext_fb_cb = cb_get; - ctx->release_ext_fb_cb = cb_release; - ctx->ext_priv = cb_priv; - return AOM_CODEC_OK; + return AOM_CODEC_ERROR; } - return AOM_CODEC_ERROR; + ctx->get_ext_fb_cb = cb_get; + ctx->release_ext_fb_cb = cb_release; + ctx->ext_priv = cb_priv; + return AOM_CODEC_OK; } static aom_codec_err_t ctrl_set_reference(aom_codec_alg_priv_t *ctx, @@ -1422,7 +1437,9 @@ (FrameWorkerData *)worker->data1; const AV1_COMMON *const cm = &frame_worker_data->pbi->common; int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + return AOM_CODEC_CORRUPT_FRAME; + } *tile_size = ((tile_width * MI_SIZE) << 16) + tile_height * MI_SIZE; return AOM_CODEC_OK; } else { diff -Nru aom-3.8.2/av1/common/alloccommon.c aom-3.9.0/av1/common/alloccommon.c --- aom-3.8.2/av1/common/alloccommon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/alloccommon.c 2024-05-07 19:57:02.665000000 +0000 @@ -13,6 +13,8 @@ #include "config/aom_config.h" #include "aom_mem/aom_mem.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" @@ -20,6 +22,8 @@ #include "av1/common/cdef_block.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/enums.h" +#include "av1/common/restoration.h" #include "av1/common/thread_common.h" int av1_get_MBs(int width, int height) { @@ -99,10 +103,14 @@ if (*cdef_row_mt == NULL) return; #if CONFIG_MULTITHREAD for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { - pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); - pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); - aom_free((*cdef_row_mt)[row_idx].row_mutex_); - aom_free((*cdef_row_mt)[row_idx].row_cond_); + if ((*cdef_row_mt)[row_idx].row_mutex_ != NULL) { + pthread_mutex_destroy((*cdef_row_mt)[row_idx].row_mutex_); + aom_free((*cdef_row_mt)[row_idx].row_mutex_); + } + if ((*cdef_row_mt)[row_idx].row_cond_ != NULL) { + pthread_cond_destroy((*cdef_row_mt)[row_idx].row_cond_); + aom_free((*cdef_row_mt)[row_idx].row_cond_); + } } #else (void)num_mi_rows; @@ -167,7 +175,7 @@ if (*cdef_row_mt != NULL) return; CHECK_MEM_ERROR(cm, *cdef_row_mt, - aom_malloc(sizeof(**cdef_row_mt) * num_mi_rows)); + aom_calloc(num_mi_rows, sizeof(**cdef_row_mt))); #if CONFIG_MULTITHREAD for (int row_idx = 0; row_idx < num_mi_rows; row_idx++) { CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_mutex_, @@ -177,8 +185,6 @@ CHECK_MEM_ERROR(cm, (*cdef_row_mt)[row_idx].row_cond_, aom_malloc(sizeof(*(*cdef_row_mt)[row_idx].row_cond_))); pthread_cond_init((*cdef_row_mt)[row_idx].row_cond_, NULL); - - (*cdef_row_mt)[row_idx].is_row_done = 0; } #endif // CONFIG_MULTITHREAD } @@ -198,7 +204,7 @@ const int is_num_workers_changed = cdef_info->allocated_num_workers != num_workers; const int is_cdef_enabled = - cm->seq_params->enable_cdef && !cm->tiles.large_scale; + cm->seq_params->enable_cdef && !cm->tiles.single_tile_decoding; // num-bufs=3 represents ping-pong buffers for top linebuf, // followed by bottom linebuf. 
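// Note on the aom_calloc() / NULL-checked teardown pairing in the hunks
// above: zero-initializing the cdef_row_mt array means a failure partway
// through creating the per-row mutexes and condition variables leaves every
// untouched slot NULL, so the free path can skip them (and the explicit
// is_row_done = 0 becomes unnecessary). A minimal sketch of the pattern,
// assuming plain pthread and malloc/free rather than libaom's wrappers
// (names are illustrative):
#include <pthread.h>
#include <stdlib.h>

typedef struct {
  pthread_mutex_t *row_mutex;
} row_sync_sketch;

static row_sync_sketch *alloc_rows_sketch(int num_rows) {
  // calloc() guarantees every row_mutex pointer starts out NULL.
  row_sync_sketch *rows = calloc(num_rows, sizeof(*rows));
  if (rows == NULL) return NULL;
  for (int i = 0; i < num_rows; i++) {
    rows[i].row_mutex = malloc(sizeof(*rows[i].row_mutex));
    if (rows[i].row_mutex == NULL) return rows;  // Partially initialized.
    pthread_mutex_init(rows[i].row_mutex, NULL);
  }
  return rows;
}

static void free_rows_sketch(row_sync_sketch *rows, int num_rows) {
  if (rows == NULL) return;
  for (int i = 0; i < num_rows; i++) {
    if (rows[i].row_mutex != NULL) {  // Skip rows that were never set up.
      pthread_mutex_destroy(rows[i].row_mutex);
      free(rows[i].row_mutex);
    }
  }
  free(rows);
}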
@@ -466,11 +472,11 @@ mi_params->mi_grid_base = (MB_MODE_INFO **)aom_calloc( mi_grid_size, sizeof(*mi_params->mi_grid_base)); if (!mi_params->mi_grid_base) return 1; - mi_params->mi_grid_size = mi_grid_size; mi_params->tx_type_map = aom_calloc(mi_grid_size, sizeof(*mi_params->tx_type_map)); if (!mi_params->tx_type_map) return 1; + mi_params->mi_grid_size = mi_grid_size; } return 0; diff -Nru aom-3.8.2/av1/common/arm/blend_a64_hmask_neon.c aom-3.9.0/av1/common/arm/blend_a64_hmask_neon.c --- aom-3.8.2/av1/common/arm/blend_a64_hmask_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/blend_a64_hmask_neon.c 2024-05-07 19:57:02.672000000 +0000 @@ -73,7 +73,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -88,7 +88,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m0, s0, s1); - store_unaligned_u8_2x2(dst, dst_stride, blend); + store_u8x2_strided_x2(dst, dst_stride, blend); src0 += 2 * src0_stride; src1 += 2 * src1_stride; diff -Nru aom-3.8.2/av1/common/arm/blend_a64_vmask_neon.c aom-3.9.0/av1/common/arm/blend_a64_vmask_neon.c --- aom-3.8.2/av1/common/arm/blend_a64_vmask_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/blend_a64_vmask_neon.c 2024-05-07 19:57:02.672000000 +0000 @@ -78,7 +78,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); - store_unaligned_u8_4x2(dst, dst_stride, blend); + store_u8x4_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; @@ -97,7 +97,7 @@ uint8x8_t blend = alpha_blend_a64_u8x8(m, s0, s1); - store_unaligned_u8_2x2(dst, dst_stride, blend); + store_u8x2_strided_x2(dst, dst_stride, blend); mask += 2; src0 += 2 * src0_stride; diff -Nru aom-3.8.2/av1/common/arm/cdef_block_neon.c aom-3.9.0/av1/common/arm/cdef_block_neon.c --- aom-3.8.2/av1/common/arm/cdef_block_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/cdef_block_neon.c 2024-05-07 19:57:02.672000000 +0000 @@ -9,124 +9,69 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ -#include "aom_dsp/aom_simd.h" -#include "aom_dsp/arm/mem_neon.h" +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" -#define SIMD_FUNC(name) name##_neon -#include "av1/common/cdef_block_simd.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/cdef_block.h" void cdef_copy_rect8_8bit_to_16bit_neon(uint16_t *dst, int dstride, const uint8_t *src, int sstride, int width, int height) { - int j; - for (int i = 0; i < height; i++) { - for (j = 0; j < (width & ~0x7); j += 8) { - v64 row = v64_load_unaligned(&src[i * sstride + j]); - v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); + do { + const uint8_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 16) { + uint8x16_t row = vld1q_u8(src_ptr + w); + uint8x16x2_t row_u16 = { { row, vdupq_n_u8(0) } }; + vst2q_u8((uint8_t *)(dst_ptr + w), row_u16); + + w += 16; } - for (; j < width; j++) { - dst[i * dstride + j] = src[i * sstride + j]; + if (width - w >= 8) { + uint8x8_t row = vld1_u8(src_ptr + w); + vst1q_u16(dst_ptr + w, vmovl_u8(row)); + w += 8; + } + if (width - w == 4) { + for (int i = w; i < w + 4; i++) { + dst_ptr[i] = src_ptr[i]; + } } - } -} -static INLINE int16x8_t v128_from_64_neon(int64_t a, int64_t b) { - return vreinterpretq_s16_s64(vcombine_s64(vcreate_s64(a), vcreate_s64(b))); + src += sstride; + dst += dstride; + } while (--height != 0); } -#define SHL_HIGH_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - 0, vget_lane_u64(vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \ - (n - 8) * 8), \ - 0)); \ - } - -#define SHL_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - 0, vget_lane_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), 0)); \ - } - -#define SHL_LOW_NEON(n) \ - static INLINE int16x8_t v128_shl_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64( \ - vshl_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), 0), \ - vget_lane_u64( \ - vorr_u64( \ - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \ - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), \ - (8 - n) * 8)), \ - 0)); \ - } - -SHL_HIGH_NEON(14) -SHL_HIGH_NEON(12) -SHL_HIGH_NEON(10) -SHL_NEON(8) -SHL_LOW_NEON(6) -SHL_LOW_NEON(4) -SHL_LOW_NEON(2) - -#define v128_shl_n_byte_neon(a, n) v128_shl_##n##_byte_neon(a) - -#define SHR_HIGH_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64(vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \ - (n - 8) * 8), \ - 0), \ - 0); \ - } - -#define SHR_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), 0), 0); \ - } - -#define SHR_LOW_NEON(n) \ - static INLINE int16x8_t v128_shr_##n##_byte_neon(int16x8_t a) { \ - int64x2_t a_s64 = vreinterpretq_s64_s16(a); \ - return v128_from_64_neon( \ - vget_lane_u64( \ - vorr_u64( \ - vshr_n_u64(vreinterpret_u64_s64(vget_low_s64(a_s64)), n * 8), \ - vshl_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), \ - (8 - n) * 8)), \ - 0), \ - vget_lane_u64( \ - 
vshr_n_u64(vreinterpret_u64_s64(vget_high_s64(a_s64)), n * 8), \ - 0)); \ - } - -SHR_HIGH_NEON(14) -SHR_HIGH_NEON(12) -SHR_HIGH_NEON(10) -SHR_NEON(8) -SHR_LOW_NEON(6) -SHR_LOW_NEON(4) -SHR_LOW_NEON(2) - -#define v128_shr_n_byte_neon(a, n) v128_shr_##n##_byte_neon(a) - -static INLINE uint32x4_t v128_madd_s16_neon(int16x8_t a, int16x8_t b) { - uint32x4_t t1 = - vreinterpretq_u32_s32(vmull_s16(vget_low_s16(a), vget_low_s16(b))); - uint32x4_t t2 = - vreinterpretq_u32_s32(vmull_s16(vget_high_s16(a), vget_high_s16(b))); -#if AOM_ARCH_AARCH64 - return vpaddq_u32(t1, t2); -#else - return vcombine_u32(vpadd_u32(vget_low_u32(t1), vget_high_u32(t1)), - vpadd_u32(vget_low_u32(t2), vget_high_u32(t2))); -#endif +void cdef_copy_rect8_16bit_to_16bit_neon(uint16_t *dst, int dstride, + const uint16_t *src, int sstride, + int width, int height) { + do { + const uint16_t *src_ptr = src; + uint16_t *dst_ptr = dst; + + int w = 0; + while (width - w >= 8) { + uint16x8_t row = vld1q_u16(src_ptr + w); + vst1q_u16(dst_ptr + w, row); + + w += 8; + } + if (width - w == 4) { + uint16x4_t row = vld1_u16(src_ptr + w); + vst1_u16(dst_ptr + w, row); + } + + src += sstride; + dst += dstride; + } while (--height != 0); } // partial A is a 16-bit vector of the form: @@ -139,8 +84,8 @@ int16x8_t partialb, uint32x4_t const1, uint32x4_t const2) { - int16x8_t tmp; // Reverse partial B. + // pattern = { 12 13 10 11 8 9 6 7 4 5 2 3 0 1 14 15 }. uint8x16_t pattern = vreinterpretq_u8_u64( vcombine_u64(vcreate_u64((uint64_t)0x07060908 << 32 | 0x0b0a0d0c), vcreate_u64((uint64_t)0x0f0e0100 << 32 | 0x03020504))); @@ -156,98 +101,100 @@ partialb = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif - // Interleave the x and y values of identical indices and pair x8 with 0. - tmp = partiala; - partiala = vzipq_s16(partiala, partialb).val[0]; - partialb = vzipq_s16(tmp, partialb).val[1]; // Square and add the corresponding x and y values. - uint32x4_t partiala_u32 = v128_madd_s16_neon(partiala, partiala); - uint32x4_t partialb_u32 = v128_madd_s16_neon(partialb, partialb); + int32x4_t cost_lo = vmull_s16(vget_low_s16(partiala), vget_low_s16(partiala)); + cost_lo = vmlal_s16(cost_lo, vget_low_s16(partialb), vget_low_s16(partialb)); + int32x4_t cost_hi = + vmull_s16(vget_high_s16(partiala), vget_high_s16(partiala)); + cost_hi = + vmlal_s16(cost_hi, vget_high_s16(partialb), vget_high_s16(partialb)); // Multiply by constant. - partiala_u32 = vmulq_u32(partiala_u32, const1); - partialb_u32 = vmulq_u32(partialb_u32, const2); - - // Sum all results. 
- partiala_u32 = vaddq_u32(partiala_u32, partialb_u32); - return partiala_u32; -} - -static INLINE uint64x2_t ziplo_u64(uint32x4_t a, uint32x4_t b) { - return vcombine_u64(vget_low_u64(vreinterpretq_u64_u32(a)), - vget_low_u64(vreinterpretq_u64_u32(b))); -} - -static INLINE uint64x2_t ziphi_u64(uint32x4_t a, uint32x4_t b) { - return vcombine_u64(vget_high_u64(vreinterpretq_u64_u32(a)), - vget_high_u64(vreinterpretq_u64_u32(b))); -} - -static INLINE uint32x4_t hsum4_neon(uint32x4_t x0, uint32x4_t x1, uint32x4_t x2, - uint32x4_t x3) { - uint32x4_t t0, t1, t2, t3; - t0 = vzipq_u32(x0, x1).val[0]; - t1 = vzipq_u32(x2, x3).val[0]; - t2 = vzipq_u32(x0, x1).val[1]; - t3 = vzipq_u32(x2, x3).val[1]; - x0 = vreinterpretq_u32_u64(ziplo_u64(t0, t1)); - x1 = vreinterpretq_u32_u64(ziphi_u64(t0, t1)); - x2 = vreinterpretq_u32_u64(ziplo_u64(t2, t3)); - x3 = vreinterpretq_u32_u64(ziphi_u64(t2, t3)); - return vaddq_u32(vaddq_u32(x0, x1), vaddq_u32(x2, x3)); -} - -static INLINE uint32x4_t compute_directions_neon(int16x8_t lines[8], - uint32_t cost[4]) { - int16x8_t partial4a, partial4b, partial5a, partial5b, partial6, partial7a, - partial7b; - int16x8_t tmp; + uint32x4_t cost = vmulq_u32(vreinterpretq_u32_s32(cost_lo), const1); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(cost_hi), const2); + return cost; +} + +// This function computes the cost along directions 4, 5, 6, 7. (4 is diagonal +// down-right, 6 is vertical). +// +// For each direction the lines are shifted so that we can perform a +// basic sum on each vector element. For example, direction 5 is "south by +// southeast", so we need to add the pixels along each line i below: +// +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// For this to fit nicely in vectors, the lines need to be shifted like so: +// 0 1 2 3 4 5 6 7 +// 0 1 2 3 4 5 6 7 +// 8 0 1 2 3 4 5 6 +// 8 0 1 2 3 4 5 6 +// 9 8 0 1 2 3 4 5 +// 9 8 0 1 2 3 4 5 +// 10 9 8 0 1 2 3 4 +// 10 9 8 0 1 2 3 4 +// +// In this configuration we can now perform SIMD additions to get the cost +// along direction 5. Since this won't fit into a single 128-bit vector, we use +// two of them to compute each half of the new configuration, and pad the empty +// spaces with zeros. Similar shifting is done for other directions, except +// direction 6 which is straightforward as it's the vertical direction. +static INLINE uint32x4_t compute_vert_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); // Partial sums for lines 0 and 1. 
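  // Illustrative note on the vextq_s16() idiom used below, which replaces the
  // former v128_shl/shr_n_byte_neon byte-shift helpers: vextq_s16(zero, v, n)
  // keeps the low n lanes of v and moves them into the top n lanes (a left
  // shift by 8 - n lanes), while vextq_s16(v, zero, n) drops the low n lanes
  // of v (a right shift by n lanes). For example, with
  // v = { 1, 2, 3, 4, 5, 6, 7, 8 }:
  //   vextq_s16(vdupq_n_s16(0), v, 1) -> { 0, 0, 0, 0, 0, 0, 0, 1 }
  //   vextq_s16(v, vdupq_n_s16(0), 1) -> { 2, 3, 4, 5, 6, 7, 8, 0 }
  // so the old v128_shl_n_byte_neon(v, 14) (14 bytes = 7 lanes) becomes
  // vextq_s16(zero, v, 1).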
- partial4a = v128_shl_n_byte_neon(lines[0], 14); - partial4b = v128_shr_n_byte_neon(lines[0], 2); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[1], 12)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[1], 4)); - tmp = vaddq_s16(lines[0], lines[1]); - partial5a = v128_shl_n_byte_neon(tmp, 10); - partial5b = v128_shr_n_byte_neon(tmp, 6); - partial7a = v128_shl_n_byte_neon(tmp, 4); - partial7b = v128_shr_n_byte_neon(tmp, 12); - partial6 = tmp; + int16x8_t partial4a = vextq_s16(zero, lines[0], 1); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[1], 2)); + int16x8_t partial4b = vextq_s16(lines[0], zero, 1); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[1], zero, 2)); + int16x8_t tmp = vaddq_s16(lines[0], lines[1]); + int16x8_t partial5a = vextq_s16(zero, tmp, 3); + int16x8_t partial5b = vextq_s16(tmp, zero, 3); + int16x8_t partial7a = vextq_s16(zero, tmp, 6); + int16x8_t partial7b = vextq_s16(tmp, zero, 6); + int16x8_t partial6 = tmp; // Partial sums for lines 2 and 3. - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[2], 10)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[2], 6)); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[3], 8)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[3], 8)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[2], 3)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[3], 4)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[2], zero, 3)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[3], zero, 4)); tmp = vaddq_s16(lines[2], lines[3]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 8)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 8)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 6)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 10)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 4)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 4)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 5)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 5)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 4 and 5. - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[4], 6)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[4], 10)); - partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[5], 4)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[5], 12)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[4], 5)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[5], 6)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[4], zero, 5)); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[5], zero, 6)); tmp = vaddq_s16(lines[4], lines[5]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 6)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 10)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 8)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 8)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 5)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 5)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 4)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 4)); partial6 = vaddq_s16(partial6, tmp); // Partial sums for lines 6 and 7. 
- partial4a = vaddq_s16(partial4a, v128_shl_n_byte_neon(lines[6], 2)); - partial4b = vaddq_s16(partial4b, v128_shr_n_byte_neon(lines[6], 14)); + partial4a = vaddq_s16(partial4a, vextq_s16(zero, lines[6], 7)); partial4a = vaddq_s16(partial4a, lines[7]); + partial4b = vaddq_s16(partial4b, vextq_s16(lines[6], zero, 7)); tmp = vaddq_s16(lines[6], lines[7]); - partial5a = vaddq_s16(partial5a, v128_shl_n_byte_neon(tmp, 4)); - partial5b = vaddq_s16(partial5b, v128_shr_n_byte_neon(tmp, 12)); - partial7a = vaddq_s16(partial7a, v128_shl_n_byte_neon(tmp, 10)); - partial7b = vaddq_s16(partial7b, v128_shr_n_byte_neon(tmp, 6)); + partial5a = vaddq_s16(partial5a, vextq_s16(zero, tmp, 6)); + partial5b = vaddq_s16(partial5b, vextq_s16(tmp, zero, 6)); + partial7a = vaddq_s16(partial7a, vextq_s16(zero, tmp, 3)); + partial7b = vaddq_s16(partial7b, vextq_s16(tmp, zero, 3)); partial6 = vaddq_s16(partial6, tmp); uint32x4_t const0 = vreinterpretq_u32_u64( @@ -263,74 +210,173 @@ vcreate_u64((uint64_t)105 << 32 | 105))); // Compute costs in terms of partial sums. - uint32x4_t partial4a_u32 = - fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); - uint32x4_t partial7a_u32 = - fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); - uint32x4_t partial5a_u32 = - fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); - uint32x4_t partial6_u32 = v128_madd_s16_neon(partial6, partial6); - partial6_u32 = vmulq_u32(partial6_u32, vdupq_n_u32(105)); - - partial4a_u32 = - hsum4_neon(partial4a_u32, partial5a_u32, partial6_u32, partial7a_u32); - vst1q_u32(cost, partial4a_u32); - return partial4a_u32; -} - -static INLINE int64x2_t ziplo_s64(int32x4_t a, int32x4_t b) { - return vcombine_s64(vget_low_s64(vreinterpretq_s64_s32(a)), - vget_low_s64(vreinterpretq_s64_s32(b))); -} - -static INLINE int64x2_t ziphi_s64(int32x4_t a, int32x4_t b) { - return vcombine_s64(vget_high_s64(vreinterpretq_s64_s32(a)), - vget_high_s64(vreinterpretq_s64_s32(b))); -} - -// Transpose and reverse the order of the lines -- equivalent to a 90-degree -// counter-clockwise rotation of the pixels. 
-static INLINE void array_reverse_transpose_8x8_neon(int16x8_t *in, - int16x8_t *res) { - const int32x4_t tr0_0 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[0]); - const int32x4_t tr0_1 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[0]); - const int32x4_t tr0_2 = vreinterpretq_s32_s16(vzipq_s16(in[0], in[1]).val[1]); - const int32x4_t tr0_3 = vreinterpretq_s32_s16(vzipq_s16(in[2], in[3]).val[1]); - const int32x4_t tr0_4 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[0]); - const int32x4_t tr0_5 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[0]); - const int32x4_t tr0_6 = vreinterpretq_s32_s16(vzipq_s16(in[4], in[5]).val[1]); - const int32x4_t tr0_7 = vreinterpretq_s32_s16(vzipq_s16(in[6], in[7]).val[1]); - - const int32x4_t tr1_0 = vzipq_s32(tr0_0, tr0_1).val[0]; - const int32x4_t tr1_1 = vzipq_s32(tr0_4, tr0_5).val[0]; - const int32x4_t tr1_2 = vzipq_s32(tr0_0, tr0_1).val[1]; - const int32x4_t tr1_3 = vzipq_s32(tr0_4, tr0_5).val[1]; - const int32x4_t tr1_4 = vzipq_s32(tr0_2, tr0_3).val[0]; - const int32x4_t tr1_5 = vzipq_s32(tr0_6, tr0_7).val[0]; - const int32x4_t tr1_6 = vzipq_s32(tr0_2, tr0_3).val[1]; - const int32x4_t tr1_7 = vzipq_s32(tr0_6, tr0_7).val[1]; - - res[7] = vreinterpretq_s16_s64(ziplo_s64(tr1_0, tr1_1)); - res[6] = vreinterpretq_s16_s64(ziphi_s64(tr1_0, tr1_1)); - res[5] = vreinterpretq_s16_s64(ziplo_s64(tr1_2, tr1_3)); - res[4] = vreinterpretq_s16_s64(ziphi_s64(tr1_2, tr1_3)); - res[3] = vreinterpretq_s16_s64(ziplo_s64(tr1_4, tr1_5)); - res[2] = vreinterpretq_s16_s64(ziphi_s64(tr1_4, tr1_5)); - res[1] = vreinterpretq_s16_s64(ziplo_s64(tr1_6, tr1_7)); - res[0] = vreinterpretq_s16_s64(ziphi_s64(tr1_6, tr1_7)); -} - -static INLINE uint32_t compute_best_dir(uint8x16_t a) { - uint8x16_t idx = - vandq_u8(a, vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))); + int32x4_t partial6_s32 = + vmull_s16(vget_low_s16(partial6), vget_low_s16(partial6)); + partial6_s32 = + vmlal_s16(partial6_s32, vget_high_s16(partial6), vget_high_s16(partial6)); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial4a, partial4b, const0, const1); + costs[1] = fold_mul_and_sum_neon(partial5a, partial5b, const2, const3); + costs[2] = vmulq_n_u32(vreinterpretq_u32_s32(partial6_s32), 105); + costs[3] = fold_mul_and_sum_neon(partial7a, partial7b, const2, const3); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; +} + +static INLINE uint32x4_t fold_mul_and_sum_pairwise_neon(int16x8_t partiala, + int16x8_t partialb, + int16x8_t partialc, + uint32x4_t const0) { + // Reverse partial c. + // pattern = { 10 11 8 9 6 7 4 5 2 3 0 1 12 13 14 15 }. 
+ uint8x16_t pattern = vreinterpretq_u8_u64( + vcombine_u64(vcreate_u64((uint64_t)0x05040706 << 32 | 0x09080b0a), + vcreate_u64((uint64_t)0x0f0e0d0c << 32 | 0x01000302))); + #if AOM_ARCH_AARCH64 - return vaddv_u8(vget_low_u8(idx)) + (vaddv_u8(vget_high_u8(idx)) << 8); + partialc = + vreinterpretq_s16_s8(vqtbl1q_s8(vreinterpretq_s8_s16(partialc), pattern)); #else - uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(idx))); - uint8x16_t s = vreinterpretq_u8_u64(m); - return vget_lane_u32( - vreinterpret_u32_u8(vzip_u8(vget_low_u8(s), vget_high_u8(s)).val[0]), 0); + int8x8x2_t p = { { vget_low_s8(vreinterpretq_s8_s16(partialc)), + vget_high_s8(vreinterpretq_s8_s16(partialc)) } }; + int8x8_t shuffle_hi = vtbl2_s8(p, vget_high_s8(vreinterpretq_s8_u8(pattern))); + int8x8_t shuffle_lo = vtbl2_s8(p, vget_low_s8(vreinterpretq_s8_u8(pattern))); + partialc = vreinterpretq_s16_s8(vcombine_s8(shuffle_lo, shuffle_hi)); #endif + + int32x4_t partiala_s32 = vpaddlq_s16(partiala); + int32x4_t partialb_s32 = vpaddlq_s16(partialb); + int32x4_t partialc_s32 = vpaddlq_s16(partialc); + + partiala_s32 = vmulq_s32(partiala_s32, partiala_s32); + partialb_s32 = vmulq_s32(partialb_s32, partialb_s32); + partialc_s32 = vmulq_s32(partialc_s32, partialc_s32); + + partiala_s32 = vaddq_s32(partiala_s32, partialc_s32); + + uint32x4_t cost = vmulq_n_u32(vreinterpretq_u32_s32(partialb_s32), 105); + cost = vmlaq_u32(cost, vreinterpretq_u32_s32(partiala_s32), const0); + return cost; +} + +// This function computes the cost along directions 0, 1, 2, 3. (0 means +// 45-degree up-right, 2 is horizontal). +// +// For direction 1 and 3 ("east northeast" and "east southeast") the shifted +// lines need three vectors instead of two. For direction 1 for example, we need +// to compute the sums along the line i below: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Which means we need the following configuration: +// 0 0 1 1 2 2 3 3 +// 1 1 2 2 3 3 4 4 +// 2 2 3 3 4 4 5 5 +// 3 3 4 4 5 5 6 6 +// 4 4 5 5 6 6 7 7 +// 5 5 6 6 7 7 8 8 +// 6 6 7 7 8 8 9 9 +// 7 7 8 8 9 9 10 10 +// +// Three vectors are needed to compute this, as well as some extra pairwise +// additions. +static uint32x4_t compute_horiz_directions_neon(int16x8_t lines[8], + uint32_t cost[4]) { + const int16x8_t zero = vdupq_n_s16(0); + + // Compute diagonal directions (1, 2, 3). + // Partial sums for lines 0 and 1. + int16x8_t partial0a = lines[0]; + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[1], 7)); + int16x8_t partial0b = vextq_s16(lines[1], zero, 7); + int16x8_t partial1a = vaddq_s16(lines[0], vextq_s16(zero, lines[1], 6)); + int16x8_t partial1b = vextq_s16(lines[1], zero, 6); + int16x8_t partial3a = vextq_s16(lines[0], zero, 2); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[1], zero, 4)); + int16x8_t partial3b = vextq_s16(zero, lines[0], 2); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[1], 4)); + + // Partial sums for lines 2 and 3. 
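  // Illustrative note: in the direction 1 and 3 layouts sketched above, each
  // diagonal index occupies two adjacent lanes (e.g. "0 0 1 1 2 2 3 3"), so
  // fold_mul_and_sum_pairwise_neon() folds lane pairs with vpaddlq_s16()
  // before squaring:
  //   vpaddlq_s16({ a, b, c, d, e, f, g, h }) -> { a+b, c+d, e+f, g+h }
  // (widened to 32 bits), turning the three shifted accumulators into the
  // per-index partial sums that are then squared and weighted.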
+ partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[2], 6)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[3], 5)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[2], zero, 6)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[3], zero, 5)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[2], 4)); + partial1a = vaddq_s16(partial1a, vextq_s16(zero, lines[3], 2)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[2], zero, 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(lines[3], zero, 2)); + partial3a = vaddq_s16(partial3a, vextq_s16(lines[2], zero, 6)); + partial3b = vaddq_s16(partial3b, vextq_s16(zero, lines[2], 6)); + partial3b = vaddq_s16(partial3b, lines[3]); + + // Partial sums for lines 4 and 5. + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[4], 4)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[5], 3)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[4], zero, 4)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[5], zero, 3)); + partial1b = vaddq_s16(partial1b, lines[4]); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[5], 6)); + int16x8_t partial1c = vextq_s16(lines[5], zero, 6); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[4], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[5], zero, 4)); + int16x8_t partial3c = vextq_s16(zero, lines[4], 2); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[5], 4)); + + // Partial sums for lines 6 and 7. + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[6], 2)); + partial0a = vaddq_s16(partial0a, vextq_s16(zero, lines[7], 1)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[6], zero, 2)); + partial0b = vaddq_s16(partial0b, vextq_s16(lines[7], zero, 1)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[6], 4)); + partial1b = vaddq_s16(partial1b, vextq_s16(zero, lines[7], 2)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[6], zero, 4)); + partial1c = vaddq_s16(partial1c, vextq_s16(lines[7], zero, 2)); + partial3b = vaddq_s16(partial3b, vextq_s16(lines[6], zero, 6)); + partial3c = vaddq_s16(partial3c, vextq_s16(zero, lines[6], 6)); + partial3c = vaddq_s16(partial3c, lines[7]); + + // Special case for direction 2 as it's just a sum along each line. 
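  // Illustrative note: each line's partial sum for the horizontal direction
  // is simply its row sum, so the cost reduces to
  //   cost[2] = 105 * sum_{i=0..7} (sum_{j=0..7} lines[i][j])^2
  // which is what the horizontal_add_4d_s16x8() / square / multiply-by-105
  // sequence below computes, four rows at a time.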
+ int16x8_t lines03[4] = { lines[0], lines[1], lines[2], lines[3] }; + int16x8_t lines47[4] = { lines[4], lines[5], lines[6], lines[7] }; + int32x4_t partial2a = horizontal_add_4d_s16x8(lines03); + int32x4_t partial2b = horizontal_add_4d_s16x8(lines47); + + uint32x4_t partial2a_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2a, partial2a)); + uint32x4_t partial2b_u32 = + vreinterpretq_u32_s32(vmulq_s32(partial2b, partial2b)); + + uint32x4_t const0 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)420 << 32 | 840), + vcreate_u64((uint64_t)210 << 32 | 280))); + uint32x4_t const1 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)140 << 32 | 168), + vcreate_u64((uint64_t)105 << 32 | 120))); + uint32x4_t const2 = vreinterpretq_u32_u64( + vcombine_u64(vcreate_u64((uint64_t)210 << 32 | 420), + vcreate_u64((uint64_t)105 << 32 | 140))); + + uint32x4_t costs[4]; + costs[0] = fold_mul_and_sum_neon(partial0a, partial0b, const0, const1); + costs[1] = + fold_mul_and_sum_pairwise_neon(partial1a, partial1b, partial1c, const2); + costs[2] = vaddq_u32(partial2a_u32, partial2b_u32); + costs[2] = vmulq_n_u32(costs[2], 105); + costs[3] = + fold_mul_and_sum_pairwise_neon(partial3c, partial3b, partial3a, const2); + + costs[0] = horizontal_add_4d_u32x4(costs); + vst1q_u32(cost, costs[0]); + return costs[0]; } int cdef_find_dir_neon(const uint16_t *img, int stride, int32_t *var, @@ -346,22 +392,40 @@ } // Compute "mostly vertical" directions. - uint32x4_t cost47 = compute_directions_neon(lines, cost + 4); - - array_reverse_transpose_8x8_neon(lines, lines); + uint32x4_t cost47 = compute_vert_directions_neon(lines, cost + 4); // Compute "mostly horizontal" directions. - uint32x4_t cost03 = compute_directions_neon(lines, cost); + uint32x4_t cost03 = compute_horiz_directions_neon(lines, cost); - uint32x4_t max_cost = vmaxq_u32(cost03, cost47); - max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 2)); - max_cost = vmaxq_u32(max_cost, vextq_u32(max_cost, max_cost, 1)); - best_cost = vgetq_lane_u32(max_cost, 0); - uint16x8_t idx = vcombine_u16(vqmovn_u32(vceqq_u32(max_cost, cost03)), - vqmovn_u32(vceqq_u32(max_cost, cost47))); - uint8x16_t idx_u8 = vcombine_u8(vqmovn_u16(idx), vqmovn_u16(idx)); - best_dir = compute_best_dir(idx_u8); - best_dir = get_msb(best_dir ^ (best_dir - 1)); // Count trailing zeros + // Find max cost as well as its index to get best_dir. + // The max cost needs to be propagated in the whole vector to find its + // position in the original cost vectors cost03 and cost47. + uint32x4_t cost07 = vmaxq_u32(cost03, cost47); +#if AOM_ARCH_AARCH64 + best_cost = vmaxvq_u32(cost07); + uint32x4_t max_cost = vdupq_n_u32(best_cost); + uint8x16x2_t costs = { { vreinterpretq_u8_u32(vceqq_u32(max_cost, cost03)), + vreinterpretq_u8_u32( + vceqq_u32(max_cost, cost47)) } }; + // idx = { 28, 24, 20, 16, 12, 8, 4, 0 }; + uint8x8_t idx = vreinterpret_u8_u64(vcreate_u64(0x0004080c1014181cULL)); + // Get the lowest 8 bit of each 32-bit elements and reverse them. 
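  // Illustrative note: each vceqq_u32() lane is all-ones where that
  // direction's cost equals the maximum. The table lookup below gathers one
  // byte per direction in reverse order (direction 7 into the lowest byte of
  // the 64-bit value, direction 0 into the highest), so counting leading zero
  // bits with aom_clzll() and dividing by 8 yields the smallest direction
  // index whose cost equals best_cost, matching the removed
  // get_msb(x ^ (x - 1)) lowest-set-bit trick.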
+ uint8x8_t tbl = vqtbl2_u8(costs, idx); + uint64_t a = vget_lane_u64(vreinterpret_u64_u8(tbl), 0); + best_dir = aom_clzll(a) >> 3; +#else + uint32x2_t cost64 = vpmax_u32(vget_low_u32(cost07), vget_high_u32(cost07)); + cost64 = vpmax_u32(cost64, cost64); + uint32x4_t max_cost = vcombine_u32(cost64, cost64); + best_cost = vget_lane_u32(cost64, 0); + uint16x8_t costs = vcombine_u16(vmovn_u32(vceqq_u32(max_cost, cost03)), + vmovn_u32(vceqq_u32(max_cost, cost47))); + uint8x8_t idx = + vand_u8(vmovn_u16(costs), + vreinterpret_u8_u64(vcreate_u64(0x8040201008040201ULL))); + int sum = horizontal_add_u8x8(idx); + best_dir = get_msb(sum ^ (sum - 1)); +#endif // Difference between the optimal variance and the variance along the // orthogonal direction. Again, the sum(x^2) terms cancel out. @@ -386,81 +450,64 @@ // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) static INLINE int16x8_t constrain16(uint16x8_t a, uint16x8_t b, unsigned int threshold, int adjdamp) { - int16x8_t diff = vreinterpretq_s16_u16(vsubq_u16(a, b)); - const int16x8_t sign = vshrq_n_s16(diff, 15); - diff = vabsq_s16(diff); - const uint16x8_t s = - vqsubq_u16(vdupq_n_u16(threshold), - vreinterpretq_u16_s16(vshlq_s16(diff, vdupq_n_s16(-adjdamp)))); - return veorq_s16(vaddq_s16(sign, vminq_s16(diff, vreinterpretq_s16_u16(s))), - sign); -} - -static INLINE uint16x8_t get_max_primary(const int is_lowbd, uint16x8_t *tap, - uint16x8_t max, - uint16x8_t cdef_large_value_mask) { - if (is_lowbd) { - uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3])); - /* The source is 16 bits, however, we only really care about the lower - 8 bits. The upper 8 bits contain the "large" flag. After the final - primary max has been calculated, zero out the upper 8 bits. Use this - to find the "16 bit" max. */ - max = vmaxq_u16( - max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask)); - } else { - /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ - max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask)); - } - return max; -} - -static INLINE uint16x8_t get_max_secondary(const int is_lowbd, uint16x8_t *tap, - uint16x8_t max, - uint16x8_t cdef_large_value_mask) { - if (is_lowbd) { - uint8x16_t max_u8 = vreinterpretq_u8_u16(tap[0]); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[1])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[2])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[3])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[4])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[5])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[6])); - max_u8 = vmaxq_u8(max_u8, vreinterpretq_u8_u16(tap[7])); - /* The source is 16 bits, however, we only really care about the lower - 8 bits. The upper 8 bits contain the "large" flag. After the final - primary max has been calculated, zero out the upper 8 bits. Use this - to find the "16 bit" max. */ - max = vmaxq_u16( - max, vandq_u16(vreinterpretq_u16_u8(max_u8), cdef_large_value_mask)); - } else { - /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ - max = vmaxq_u16(max, vandq_u16(tap[0], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[1], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[2], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[3], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[4], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[5], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[6], cdef_large_value_mask)); - max = vmaxq_u16(max, vandq_u16(tap[7], cdef_large_value_mask)); - } - return max; -} - -static INLINE void filter_block_4x4(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, int pri_damping, - int sec_damping, int coeff_shift, - int height, int enable_primary, - int enable_secondary) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; - const int clipping_required = enable_primary && enable_secondary; + uint16x8_t diff = vabdq_u16(a, b); + const uint16x8_t a_gt_b = vcgtq_u16(a, b); + const uint16x8_t s = vqsubq_u16(vdupq_n_u16(threshold), + vshlq_u16(diff, vdupq_n_s16(-adjdamp))); + const int16x8_t clip = vreinterpretq_s16_u16(vminq_u16(diff, s)); + return vbslq_s16(a_gt_b, clip, vnegq_s16(clip)); +} + +static INLINE void primary_filter(uint16x8_t s, uint16x8_t tap[4], + const int *pri_taps, int pri_strength, + int pri_damping, int16x8_t *sum) { + // Near taps + int16x8_t n0 = constrain16(tap[0], s, pri_strength, pri_damping); + int16x8_t n1 = constrain16(tap[1], s, pri_strength, pri_damping); + // sum += pri_taps[0] * (n0 + n1) + n0 = vaddq_s16(n0, n1); + *sum = vmlaq_n_s16(*sum, n0, pri_taps[0]); + + // Far taps + int16x8_t f0 = constrain16(tap[2], s, pri_strength, pri_damping); + int16x8_t f1 = constrain16(tap[3], s, pri_strength, pri_damping); + // sum += pri_taps[1] * (f0 + f1) + f0 = vaddq_s16(f0, f1); + *sum = vmlaq_n_s16(*sum, f0, pri_taps[1]); +} + +static INLINE void secondary_filter(uint16x8_t s, uint16x8_t tap[8], + const int *sec_taps, int sec_strength, + int sec_damping, int16x8_t *sum) { + // Near taps + int16x8_t s0 = constrain16(tap[0], s, sec_strength, sec_damping); + int16x8_t s1 = constrain16(tap[1], s, sec_strength, sec_damping); + int16x8_t s2 = constrain16(tap[2], s, sec_strength, sec_damping); + int16x8_t s3 = constrain16(tap[3], s, sec_strength, sec_damping); + + // sum += sec_taps[0] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[0]); + + // Far taps + s0 = constrain16(tap[4], s, sec_strength, sec_damping); + s1 = constrain16(tap[5], s, sec_strength, sec_damping); + s2 = constrain16(tap[6], s, sec_strength, sec_damping); + s3 = constrain16(tap[7], s, sec_strength, sec_damping); + + // sum += sec_taps[1] * (p0 + p1 + p2 + p3) + s0 = vaddq_s16(s0, s1); + s2 = vaddq_s16(s2, s3); + s0 = vaddq_s16(s0, s2); + *sum = vmlaq_n_s16(*sum, s0, sec_taps[1]); +} + +void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { uint16x8_t max, min; const uint16x8_t cdef_large_value_mask = vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); @@ -473,355 +520,378 @@ const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; const int *sec_taps = cdef_sec_taps; - if (enable_primary && pri_strength) { + if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } - if 
(enable_secondary && sec_strength) { + if (sec_strength) { sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } - int h = height; - do { - int16x8_t sum = vdupq_n_s16(0); - uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); - max = min = s; + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; - if (enable_primary) { - uint16x8_t tap[4]; + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; // Primary near taps - tap[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); - tap[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); - int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping); - int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping); - - // sum += pri_taps[0] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0])); + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); // Primary far taps - tap[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); - tap[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); - p0 = constrain16(tap[2], s, pri_strength, pri_damping); - p1 = constrain16(tap[3], s, pri_strength, pri_damping); - - // sum += pri_taps[1] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1])); - - if (clipping_required) { - max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - } - } + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); - if (enable_secondary) { - uint16x8_t tap[8]; + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + uint16x8_t sec_src[8]; // Secondary near taps - tap[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); - tap[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); - tap[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); - tap[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); - int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping); - int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping); - int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping); - int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping); - - // sum += sec_taps[0] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0])); + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); // Secondary far taps - tap[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); - tap[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); - tap[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); - tap[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); - p0 = constrain16(tap[4], s, sec_strength, sec_damping); - p1 = constrain16(tap[5], s, sec_strength, sec_damping); - p2 = constrain16(tap[6], s, sec_strength, sec_damping); - p3 = constrain16(tap[7], s, sec_strength, sec_damping); - - // sum += sec_taps[1] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1])); - - if (clipping_required) { - max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - min = vminq_u16(min, tap[4]); - min = vminq_u16(min, tap[5]); - min = vminq_u16(min, tap[6]); - min = vminq_u16(min, tap[7]); - } - } + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); + } else { + uint8_t *dst8 = (uint8_t *)dest; - // res = row + ((sum - (sum < 0) + 8) >> 4) - sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); - int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8)); - res = vshrq_n_s16(res, 4); - res = vaddq_s16(vreinterpretq_s16_u16(s), res); + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; - if (clipping_required) { - res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), - vreinterpretq_s16_u16(max)); - } + uint16x8_t pri_src[4]; - if (is_lowbd) { - const uint8x8_t res_128 = vqmovun_s16(res); - store_unaligned_u8_4x2(dst8, dstride, res_128); - } else { - store_unaligned_u16_4x2(dst16, dstride, vreinterpretq_u16_s16(res)); - } + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. 
+ uint8x16_t pri_max0 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[0]), + vreinterpretq_u8_u16(pri_src[1])); + uint8x16_t pri_max1 = vmaxq_u8(vreinterpretq_u8_u16(pri_src[2]), + vreinterpretq_u8_u16(pri_src[3])); + pri_max0 = vmaxq_u8(pri_max0, pri_max1); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(pri_max0), + cdef_large_value_mask)); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // The source is 16 bits, however, we only really care about the lower + // 8 bits. The upper 8 bits contain the "large" flag. After the final + // primary max has been calculated, zero out the upper 8 bits. Use this + // to find the "16 bit" max. + uint8x16_t sec_max0 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[0]), + vreinterpretq_u8_u16(sec_src[1])); + uint8x16_t sec_max1 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[2]), + vreinterpretq_u8_u16(sec_src[3])); + uint8x16_t sec_max2 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[4]), + vreinterpretq_u8_u16(sec_src[5])); + uint8x16_t sec_max3 = vmaxq_u8(vreinterpretq_u8_u16(sec_src[6]), + vreinterpretq_u8_u16(sec_src[7])); + sec_max0 = vmaxq_u8(sec_max0, sec_max1); + sec_max2 = vmaxq_u8(sec_max2, sec_max3); + sec_max0 = vmaxq_u8(sec_max0, sec_max2); + max = vmaxq_u16(max, vandq_u16(vreinterpretq_u16_u8(sec_max0), + cdef_large_value_mask)); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res_s16 = vminq_s16(vmaxq_s16(res_s16, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); + } +} + +void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in, + int pri_strength, int sec_strength, int dir, + int pri_damping, int sec_damping, int coeff_shift, + int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; - in += 2 * CDEF_BSTRIDE; - dst8 += 2 * dstride; - dst16 += 2 * dstride; - h -= 2; - } while (h != 0); -} - -static INLINE void filter_block_8x8(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int pri_strength, - int sec_strength, int dir, int pri_damping, - int sec_damping, int 
coeff_shift, - int height, int enable_primary, - int enable_secondary) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; - const int clipping_required = enable_primary && enable_secondary; - uint16x8_t max, min; - const uint16x8_t cdef_large_value_mask = - vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); const int po1 = cdef_directions[dir][0]; const int po2 = cdef_directions[dir][1]; - const int s1o1 = cdef_directions[dir + 2][0]; - const int s1o2 = cdef_directions[dir + 2][1]; - const int s2o1 = cdef_directions[dir - 2][0]; - const int s2o2 = cdef_directions[dir - 2][1]; const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; - const int *sec_taps = cdef_sec_taps; - if (enable_primary && pri_strength) { + if (pri_strength) { pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); } - if (enable_secondary && sec_strength) { - sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); - } - int h = height; - do { - int16x8_t sum = vdupq_n_s16(0); - uint16x8_t s = vld1q_u16(in); - max = min = s; + if (block_width == 8) { + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); - if (enable_primary) { uint16x8_t tap[4]; // Primary near taps tap[0] = vld1q_u16(in + po1); tap[1] = vld1q_u16(in - po1); - int16x8_t p0 = constrain16(tap[0], s, pri_strength, pri_damping); - int16x8_t p1 = constrain16(tap[1], s, pri_strength, pri_damping); - - // sum += pri_taps[0] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[0])); // Primary far taps tap[2] = vld1q_u16(in + po2); - p0 = constrain16(tap[2], s, pri_strength, pri_damping); tap[3] = vld1q_u16(in - po2); - p1 = constrain16(tap[3], s, pri_strength, pri_damping); - // sum += pri_taps[1] * (p0 + p1) - p0 = vaddq_s16(p0, p1); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(pri_taps[1])); - if (clipping_required) { - max = get_max_primary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - } - } + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); - if (enable_secondary) { - uint16x8_t tap[8]; + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); - // Secondary near taps - tap[0] = vld1q_u16(in + s1o1); - tap[1] = vld1q_u16(in - s1o1); - tap[2] = vld1q_u16(in + s2o1); - tap[3] = vld1q_u16(in - s2o1); - int16x8_t p0 = constrain16(tap[0], s, sec_strength, sec_damping); - int16x8_t p1 = constrain16(tap[1], s, sec_strength, sec_damping); - int16x8_t p2 = constrain16(tap[2], s, sec_strength, sec_damping); - int16x8_t p3 = constrain16(tap[3], s, sec_strength, sec_damping); - - // sum += sec_taps[0] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[0])); - - // Secondary far taps - tap[4] = vld1q_u16(in + s1o2); - tap[5] = vld1q_u16(in - s1o2); - tap[6] = vld1q_u16(in + s2o2); - tap[7] = vld1q_u16(in - s2o2); - p0 = constrain16(tap[4], s, sec_strength, sec_damping); - p1 = constrain16(tap[5], s, sec_strength, sec_damping); - p2 = constrain16(tap[6], s, sec_strength, sec_damping); - p3 = 
constrain16(tap[7], s, sec_strength, sec_damping); - - // sum += sec_taps[1] * (p0 + p1 + p2 + p3) - p0 = vaddq_s16(p0, p1); - p2 = vaddq_s16(p2, p3); - p0 = vaddq_s16(p0, p2); - sum = vmlaq_s16(sum, p0, vdupq_n_s16(sec_taps[1])); - - if (clipping_required) { - max = get_max_secondary(is_lowbd, tap, max, cdef_large_value_mask); - - min = vminq_u16(min, tap[0]); - min = vminq_u16(min, tap[1]); - min = vminq_u16(min, tap[2]); - min = vminq_u16(min, tap[3]); - min = vminq_u16(min, tap[4]); - min = vminq_u16(min, tap[5]); - min = vminq_u16(min, tap[6]); - min = vminq_u16(min, tap[7]); - } - } - - // res = row + ((sum - (sum < 0) + 8) >> 4) - sum = vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); - int16x8_t res = vaddq_s16(sum, vdupq_n_s16(8)); - res = vshrq_n_s16(res, 4); - res = vaddq_s16(vreinterpretq_s16_u16(s), res); - if (clipping_required) { - res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), - vreinterpretq_s16_u16(max)); - } - - if (is_lowbd) { - const uint8x8_t res_128 = vqmovun_s16(res); - vst1_u8(dst8, res_128); - } else { - vst1q_u16(dst16, vreinterpretq_u16_s16(res)); - } - - in += CDEF_BSTRIDE; - dst8 += dstride; - dst16 += dstride; - } while (--h != 0); -} + } else { + uint8_t *dst8 = (uint8_t *)dest; -static INLINE void copy_block_4xh(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int height) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); - int h = height; - do { - const uint16x8_t row = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); - if (is_lowbd) { - const uint8x8_t res_128 = vqmovn_u16(row); - store_unaligned_u8_4x2(dst8, dstride, res_128); - } else { - store_unaligned_u16_4x2(dst16, dstride, row); - } + uint16x8_t pri_src[4]; - in += 2 * CDEF_BSTRIDE; - dst8 += 2 * dstride; - dst16 += 2 * dstride; - h -= 2; - } while (h != 0); -} - -static INLINE void copy_block_8xh(const int is_lowbd, void *dest, int dstride, - const uint16_t *in, int height) { - uint8_t *dst8 = (uint8_t *)dest; - uint16_t *dst16 = (uint16_t *)dest; + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); - int h = height; - do { - const uint16x8_t row = vld1q_u16(in); - if (is_lowbd) { - const uint8x8_t res_128 = vqmovn_u16(row); - vst1_u8(dst8, res_128); - } else { - vst1q_u16(dst16, row); - } + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); - in += CDEF_BSTRIDE; - dst8 += dstride; - dst16 += dstride; - } while (--h != 0); -} + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); -void cdef_filter_8_0_neon(void *dest, int dstride, const uint16_t *in, - int pri_strength, int sec_strength, int dir, - int pri_damping, int sec_damping, int coeff_shift, - int block_width, int block_height) { - if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); - } else { - filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, 
vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } -void cdef_filter_8_1_neon(void *dest, int dstride, const uint16_t *in, +void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in, int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { - if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); - } else { - filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); } -} -void cdef_filter_8_2_neon(void *dest, int dstride, const uint16_t *in, - int pri_strength, int sec_strength, int dir, - int pri_damping, int sec_damping, int coeff_shift, - int block_width, int block_height) { if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + vst1_u8(dst8, res_u8); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/1, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = 
load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res_s16 = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + const uint8x8_t res_u8 = vqmovun_s16(res_s16); + store_u8x4_strided_x2(dst8, dstride, res_u8); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -837,9 +907,30 @@ (void)coeff_shift; (void)block_width; if (block_width == 8) { - copy_block_8xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + const uint8x8_t res = vqmovn_u16(s); + vst1_u8(dst8, res); + + in += CDEF_BSTRIDE; + dst8 += dstride; + } while (--h != 0); } else { - copy_block_4xh(/*is_lowbd=*/1, dest, dstride, in, block_height); + uint8_t *dst8 = (uint8_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + const uint8x8_t res = vqmovn_u16(s); + store_u8x4_strided_x2(dst8, dstride, res); + + in += 2 * CDEF_BSTRIDE; + dst8 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -847,16 +938,213 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + uint16x8_t max, min; + const uint16x8_t cdef_large_value_mask = + vdupq_n_u16(((uint16_t)~CDEF_VERY_LARGE)); + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + const int *sec_taps = cdef_sec_taps; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = vld1q_u16(in + po1); + pri_src[1] = vld1q_u16(in - po1); + + // Primary far taps + pri_src[2] = vld1q_u16(in + po2); + pri_src[3] = vld1q_u16(in - po2); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min0 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min1 = vminq_u16(pri_src[2], pri_src[3]); + pri_min0 = vminq_u16(pri_min0, pri_min1); + min = vminq_u16(min, pri_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. 
*/ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + max = min = s; + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + uint16x8_t pri_min1 = vminq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_min2 = vminq_u16(pri_src[2], 
pri_src[3]); + pri_min1 = vminq_u16(pri_min1, pri_min2); + min = vminq_u16(min, pri_min1); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + pri_src[0] = vandq_u16(pri_src[0], cdef_large_value_mask); + pri_src[1] = vandq_u16(pri_src[1], cdef_large_value_mask); + pri_src[2] = vandq_u16(pri_src[2], cdef_large_value_mask); + pri_src[3] = vandq_u16(pri_src[3], cdef_large_value_mask); + uint16x8_t pri_max0 = vmaxq_u16(pri_src[0], pri_src[1]); + uint16x8_t pri_max1 = vmaxq_u16(pri_src[2], pri_src[3]); + pri_max0 = vmaxq_u16(pri_max0, pri_max1); + max = vmaxq_u16(max, pri_max0); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + uint16x8_t sec_min0 = vminq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_min1 = vminq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_min2 = vminq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_min3 = vminq_u16(sec_src[6], sec_src[7]); + sec_min0 = vminq_u16(sec_min0, sec_min1); + sec_min2 = vminq_u16(sec_min2, sec_min3); + sec_min0 = vminq_u16(sec_min0, sec_min2); + min = vminq_u16(min, sec_min0); + + /* Convert CDEF_VERY_LARGE to 0 before calculating max. */ + sec_src[0] = vandq_u16(sec_src[0], cdef_large_value_mask); + sec_src[1] = vandq_u16(sec_src[1], cdef_large_value_mask); + sec_src[2] = vandq_u16(sec_src[2], cdef_large_value_mask); + sec_src[3] = vandq_u16(sec_src[3], cdef_large_value_mask); + sec_src[4] = vandq_u16(sec_src[4], cdef_large_value_mask); + sec_src[5] = vandq_u16(sec_src[5], cdef_large_value_mask); + sec_src[6] = vandq_u16(sec_src[6], cdef_large_value_mask); + sec_src[7] = vandq_u16(sec_src[7], cdef_large_value_mask); + + uint16x8_t sec_max0 = vmaxq_u16(sec_src[0], sec_src[1]); + uint16x8_t sec_max1 = vmaxq_u16(sec_src[2], sec_src[3]); + uint16x8_t sec_max2 = vmaxq_u16(sec_src[4], sec_src[5]); + uint16x8_t sec_max3 = vmaxq_u16(sec_src[6], sec_src[7]); + sec_max0 = vmaxq_u16(sec_max0, sec_max1); + sec_max2 = vmaxq_u16(sec_max2, sec_max3); + sec_max0 = vmaxq_u16(sec_max0, sec_max2); + max = vmaxq_u16(max, sec_max0); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + res = vminq_s16(vmaxq_s16(res, vreinterpretq_s16_u16(min)), + vreinterpretq_s16_u16(max)); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -864,16 +1152,78 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + (void)sec_strength; + (void)sec_damping; + + const int po1 = cdef_directions[dir][0]; + const int po2 = cdef_directions[dir][1]; + const int *pri_taps = cdef_pri_taps[(pri_strength >> coeff_shift) & 1]; + + if (pri_strength) { + pri_damping = AOMMAX(0, pri_damping - get_msb(pri_strength)); + } + if (block_width == 8) { - 
filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t tap[4]; + + // Primary near taps + tap[0] = vld1q_u16(in + po1); + tap[1] = vld1q_u16(in - po1); + + // Primary far taps + tap[2] = vld1q_u16(in + po2); + tap[3] = vld1q_u16(in - po2); + + primary_filter(s, tap, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/1, - /*enable_secondary=*/0); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t pri_src[4]; + + // Primary near taps + pri_src[0] = load_unaligned_u16_4x2(in + po1, CDEF_BSTRIDE); + pri_src[1] = load_unaligned_u16_4x2(in - po1, CDEF_BSTRIDE); + + // Primary far taps + pri_src[2] = load_unaligned_u16_4x2(in + po2, CDEF_BSTRIDE); + pri_src[3] = load_unaligned_u16_4x2(in - po2, CDEF_BSTRIDE); + + primary_filter(s, pri_src, pri_taps, pri_strength, pri_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -881,16 +1231,89 @@ int pri_strength, int sec_strength, int dir, int pri_damping, int sec_damping, int coeff_shift, int block_width, int block_height) { + (void)pri_strength; + (void)pri_damping; + (void)coeff_shift; + + const int s1o1 = cdef_directions[dir + 2][0]; + const int s1o2 = cdef_directions[dir + 2][1]; + const int s2o1 = cdef_directions[dir - 2][0]; + const int s2o2 = cdef_directions[dir - 2][1]; + const int *sec_taps = cdef_sec_taps; + + if (sec_strength) { + sec_damping = AOMMAX(0, sec_damping - get_msb(sec_strength)); + } + if (block_width == 8) { - filter_block_8x8(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = vld1q_u16(in); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = vld1q_u16(in + s1o1); + sec_src[1] = vld1q_u16(in - s1o1); + sec_src[2] = vld1q_u16(in + s2o1); + sec_src[3] = vld1q_u16(in - s2o1); + + // Secondary far taps + sec_src[4] = vld1q_u16(in + s1o2); + sec_src[5] = vld1q_u16(in - s1o2); + sec_src[6] = vld1q_u16(in + s2o2); + sec_src[7] = vld1q_u16(in - s2o2); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = 
vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + vst1q_u16(dst16, vreinterpretq_u16_s16(res)); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - filter_block_4x4(/*is_lowbd=*/0, dest, dstride, in, pri_strength, - sec_strength, dir, pri_damping, sec_damping, coeff_shift, - block_height, /*enable_primary=*/0, - /*enable_secondary=*/1); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + int16x8_t sum = vdupq_n_s16(0); + uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + + uint16x8_t sec_src[8]; + + // Secondary near taps + sec_src[0] = load_unaligned_u16_4x2(in + s1o1, CDEF_BSTRIDE); + sec_src[1] = load_unaligned_u16_4x2(in - s1o1, CDEF_BSTRIDE); + sec_src[2] = load_unaligned_u16_4x2(in + s2o1, CDEF_BSTRIDE); + sec_src[3] = load_unaligned_u16_4x2(in - s2o1, CDEF_BSTRIDE); + + // Secondary far taps + sec_src[4] = load_unaligned_u16_4x2(in + s1o2, CDEF_BSTRIDE); + sec_src[5] = load_unaligned_u16_4x2(in - s1o2, CDEF_BSTRIDE); + sec_src[6] = load_unaligned_u16_4x2(in + s2o2, CDEF_BSTRIDE); + sec_src[7] = load_unaligned_u16_4x2(in - s2o2, CDEF_BSTRIDE); + + secondary_filter(s, sec_src, sec_taps, sec_strength, sec_damping, &sum); + + // res = s + ((sum - (sum < 0) + 8) >> 4) + sum = + vaddq_s16(sum, vreinterpretq_s16_u16(vcltq_s16(sum, vdupq_n_s16(0)))); + const int16x8_t res = vrsraq_n_s16(vreinterpretq_s16_u16(s), sum, 4); + + store_u16x4_strided_x2(dst16, dstride, vreinterpretq_u16_s16(res)); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } @@ -906,8 +1329,27 @@ (void)coeff_shift; (void)block_width; if (block_width == 8) { - copy_block_8xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = vld1q_u16(in); + vst1q_u16(dst16, s); + + in += CDEF_BSTRIDE; + dst16 += dstride; + } while (--h != 0); } else { - copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); + uint16_t *dst16 = (uint16_t *)dest; + + int h = block_height; + do { + const uint16x8_t s = load_unaligned_u16_4x2(in, CDEF_BSTRIDE); + store_u16x4_strided_x2(dst16, dstride, s); + + in += 2 * CDEF_BSTRIDE; + dst16 += 2 * dstride; + h -= 2; + } while (h != 0); } } diff -Nru aom-3.8.2/av1/common/arm/compound_convolve_neon.c aom-3.9.0/av1/common/arm/compound_convolve_neon.c --- aom-3.8.2/av1/common/arm/compound_convolve_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/compound_convolve_neon.c 2024-05-07 19:57:02.677000000 +0000 @@ -336,10 +336,8 @@ dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); - store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; @@ -425,10 +423,8 @@ vreinterpretq_s16_u16(round_offset_vec), &d01, &d23); - store_u8_4x1(dst8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(dst8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(dst8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(dst8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(dst8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(dst8 + 2 * dst8_stride, dst8_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; @@ -647,7 +643,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, 
fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(dst8_ptr, d01, 0); + store_u8_4x1(dst8_ptr, d01); src_ptr += src_stride; dst_ptr += dst_stride; @@ -860,7 +856,7 @@ uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(dst8_ptr, d01, 0); + store_u8_4x1(dst8_ptr, d01); src_ptr += src_stride; dst_ptr += dst_stride; @@ -1321,10 +1317,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -1348,7 +1342,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; @@ -1540,10 +1534,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -1566,7 +1558,7 @@ uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; @@ -1998,10 +1990,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -2029,7 +2019,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; @@ -2278,10 +2268,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01, &d23); - store_u8_4x1(d_u8 + 0 * dst8_stride, d01, 0); - store_u8_4x1(d_u8 + 1 * dst8_stride, d01, 1); - store_u8_4x1(d_u8 + 2 * dst8_stride, d23, 0); - store_u8_4x1(d_u8 + 3 * dst8_stride, d23, 1); + store_u8x4_strided_x2(d_u8 + 0 * dst8_stride, dst8_stride, d01); + store_u8x4_strided_x2(d_u8 + 2 * dst8_stride, dst8_stride, d23); s0 = s4; s1 = s5; @@ -2308,7 +2296,7 @@ uint8x8_t d01; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01); - store_u8_4x1(d_u8, d01, 0); + store_u8_4x1(d_u8, d01); s0 = s1; s1 = s2; diff -Nru aom-3.8.2/av1/common/arm/compound_convolve_neon.h aom-3.9.0/av1/common/arm/compound_convolve_neon.h --- aom-3.8.2/av1/common/arm/compound_convolve_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/compound_convolve_neon.h 2024-05-07 19:57:02.679000000 +0000 @@ -282,10 +282,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, 
d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -308,7 +306,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; @@ -437,10 +435,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -462,7 +458,7 @@ uint8x8_t d01_u8; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; @@ -761,10 +757,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -789,7 +783,7 @@ compute_dist_wtd_avg_4x1(dd0, d0, fwd_offset, bck_offset, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; @@ -924,10 +918,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); dst8_ptr += 4 * dst8_stride; s0 = s4; @@ -951,7 +943,7 @@ uint8x8_t d01_u8; compute_basic_avg_4x1(dd0, d0, vget_low_s16(round_offset_vec), &d01_u8); - store_u8_4x1(dst8_ptr, d01_u8, 0); + store_u8_4x1(dst8_ptr, d01_u8); dst8_ptr += dst8_stride; s0 = s1; diff -Nru aom-3.8.2/av1/common/arm/compound_convolve_neon_dotprod.c aom-3.9.0/av1/common/arm/compound_convolve_neon_dotprod.c --- aom-3.8.2/av1/common/arm/compound_convolve_neon_dotprod.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/compound_convolve_neon_dotprod.c 2024-05-07 19:57:02.680000000 +0000 @@ -380,10 +380,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + 
store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -503,10 +501,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; diff -Nru aom-3.8.2/av1/common/arm/compound_convolve_neon_i8mm.c aom-3.9.0/av1/common/arm/compound_convolve_neon_i8mm.c --- aom-3.8.2/av1/common/arm/compound_convolve_neon_i8mm.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/compound_convolve_neon_i8mm.c 2024-05-07 19:57:02.682000000 +0000 @@ -335,10 +335,8 @@ compute_dist_wtd_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, fwd_offset, bck_offset, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; @@ -450,10 +448,8 @@ compute_basic_avg_4x4(dd0, dd1, dd2, dd3, d0, d1, d2, d3, round_offset_vec, &d01_u8, &d23_u8); - store_u8_4x1(dst8_ptr + 0 * dst8_stride, d01_u8, 0); - store_u8_4x1(dst8_ptr + 1 * dst8_stride, d01_u8, 1); - store_u8_4x1(dst8_ptr + 2 * dst8_stride, d23_u8, 0); - store_u8_4x1(dst8_ptr + 3 * dst8_stride, d23_u8, 1); + store_u8x4_strided_x2(dst8_ptr + 0 * dst8_stride, dst8_stride, d01_u8); + store_u8x4_strided_x2(dst8_ptr + 2 * dst8_stride, dst8_stride, d23_u8); src_ptr += 4 * src_stride; dst_ptr += 4 * dst_stride; diff -Nru aom-3.8.2/av1/common/arm/convolve_neon.c aom-3.9.0/av1/common/arm/convolve_neon.c --- aom-3.8.2/av1/common/arm/convolve_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/convolve_neon.c 2024-05-07 19:57:02.682000000 +0000 @@ -121,10 +121,8 @@ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - store_u8_4x1(d + 0 * dst_stride, d01, 0); - store_u8_4x1(d + 1 * dst_stride, d01, 1); - store_u8_4x1(d + 2 * dst_stride, d23, 0); - store_u8_4x1(d + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(d, dst_stride, d01); + store_u8x4_strided_x2(d + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -178,7 +176,7 @@ uint8x8_t dd0 = vqmovun_s16(vcombine_s16(d0, vdup_n_s16(0))); - store_u8_4x1(d, dd0, 0); + store_u8_4x1(d, dd0); s += 4; d += 4; @@ -276,7 +274,7 @@ uint8x8_t d0 = convolve4_4_x(s0, s1, s2, s3, x_filter, vget_low_s16(horiz_const)); - store_u8_4x1(dst, d0, 0); + store_u8_4x1(dst, d0); src += src_stride; dst += dst_stride; @@ -479,10 +477,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * 
dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -501,7 +497,7 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -665,10 +661,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -689,7 +683,7 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, vdup_n_s16(0)), FILTER_BITS - 1); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -885,10 +879,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -1431,11 +1423,11 @@ uint8x8_t d1 = vrhadd_u8(s1_0, s1_1); if (w == 2) { - store_u8_2x1(dst + 0 * dst_stride, d0, 0); - store_u8_2x1(dst + 1 * dst_stride, d1, 0); + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); } else { - store_u8_4x1(dst + 0 * dst_stride, d0, 0); - store_u8_4x1(dst + 1 * dst_stride, d1, 0); + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); } src += 2 * src_stride; @@ -1502,11 +1494,11 @@ uint8x8_t d1 = vrhadd_u8(s1, s2); if (w == 2) { - store_u8_2x1(dst + 0 * dst_stride, d0, 0); - store_u8_2x1(dst + 1 * dst_stride, d1, 0); + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); } else { - store_u8_4x1(dst + 0 * dst_stride, d0, 0); - store_u8_4x1(dst + 1 * dst_stride, d1, 0); + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); } src += 2 * src_stride; @@ -1626,14 +1618,15 @@ uint16x4_t sum0 = vadd_u16(s0, s1); uint16x4_t sum1 = vadd_u16(s1, s2); - uint8x8_t d01 = vqrshrn_n_u16(vcombine_u16(sum0, sum1), 2); + uint8x8_t d0 = vqrshrn_n_u16(vcombine_u16(sum0, vdup_n_u16(0)), 2); + uint8x8_t d1 = vqrshrn_n_u16(vcombine_u16(sum1, vdup_n_u16(0)), 2); if (w == 2) { - store_u8_2x1(dst + 0 * dst_stride, d01, 0); - store_u8_2x1(dst + 1 * dst_stride, d01, 2); + store_u8_2x1(dst + 0 * dst_stride, d0); + store_u8_2x1(dst + 1 * dst_stride, d1); } else { - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); + store_u8_4x1(dst + 0 * dst_stride, d0); + store_u8_4x1(dst + 1 * dst_stride, d1); } im += 2 * im_stride; diff -Nru aom-3.8.2/av1/common/arm/convolve_neon.h aom-3.9.0/av1/common/arm/convolve_neon.h --- aom-3.8.2/av1/common/arm/convolve_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/convolve_neon.h 2024-05-07 19:57:02.684000000 +0000 @@ -127,10 +127,8 @@ uint8x8_t d01 = vqmovun_s16(dd01); uint8x8_t d23 = vqmovun_s16(dd23); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 
1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -282,10 +280,8 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -303,7 +299,7 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; @@ -452,10 +448,8 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, d1), sub_const)); uint8x8_t d23 = vqmovun_s16(vsubq_s16(vcombine_s16(d2, d3), sub_const)); - store_u8_4x1(dst_ptr + 0 * dst_stride, d01, 0); - store_u8_4x1(dst_ptr + 1 * dst_stride, d01, 1); - store_u8_4x1(dst_ptr + 2 * dst_stride, d23, 0); - store_u8_4x1(dst_ptr + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23); s0 = s4; s1 = s5; @@ -471,7 +465,7 @@ uint8x8_t d01 = vqmovun_s16(vsubq_s16(vcombine_s16(d0, vdup_n_s16(0)), sub_const)); - store_u8_4x1(dst_ptr, d01, 0); + store_u8_4x1(dst_ptr, d01); s0 = s1; s1 = s2; diff -Nru aom-3.8.2/av1/common/arm/convolve_neon_dotprod.c aom-3.9.0/av1/common/arm/convolve_neon_dotprod.c --- aom-3.8.2/av1/common/arm/convolve_neon_dotprod.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/convolve_neon_dotprod.c 2024-05-07 19:57:02.685000000 +0000 @@ -128,7 +128,7 @@ do { uint8x8_t d0 = vld1_u8(s); if (w == 4) { - store_u8_4x1(d, d0, 0); + store_u8_4x1(d, d0); } else { vst1_u8(d, d0); } @@ -158,10 +158,8 @@ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); dst += 4 * dst_stride; src += 4 * src_stride; @@ -314,10 +312,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; diff -Nru aom-3.8.2/av1/common/arm/convolve_neon_i8mm.c aom-3.9.0/av1/common/arm/convolve_neon_i8mm.c --- aom-3.8.2/av1/common/arm/convolve_neon_i8mm.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/convolve_neon_i8mm.c 2024-05-07 19:57:02.686000000 +0000 @@ -107,7 +107,7 @@ do { uint8x8_t d0 = vld1_u8(s); if (w == 4) { - store_u8_4x1(d, d0, 0); + store_u8_4x1(d, d0); } else { vst1_u8(d, d0); } @@ -140,10 +140,8 @@ uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1)); uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3)); - 
store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); dst += 4 * dst_stride; src += 4 * src_stride; @@ -271,10 +269,8 @@ uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS - 1); uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS - 1); - store_u8_4x1(dst + 0 * dst_stride, d01, 0); - store_u8_4x1(dst + 1 * dst_stride, d01, 1); - store_u8_4x1(dst + 2 * dst_stride, d23, 0); - store_u8_4x1(dst + 3 * dst_stride, d23, 1); + store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01); + store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23); src += 4 * src_stride; dst += 4 * dst_stride; diff -Nru aom-3.8.2/av1/common/arm/highbd_compound_convolve_neon.c aom-3.9.0/av1/common/arm/highbd_compound_convolve_neon.c --- aom-3.8.2/av1/common/arm/highbd_compound_convolve_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_compound_convolve_neon.c 2024-05-07 19:57:02.689000000 +0000 @@ -20,266 +20,9 @@ #include "aom_ports/mem.h" #include "av1/common/convolve.h" #include "av1/common/filter.h" +#include "av1/common/arm/highbd_compound_convolve_neon.h" #include "av1/common/arm/highbd_convolve_neon.h" -#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS - -static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, - int src_stride, uint16_t *dst_ptr, - int dst_stride, int w, int h, - ConvolveParams *conv_params, - const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint16x4_t offset_vec = vdup_n_u16(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint16x4_t avg = vhadd_u16(src, ref); - int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint16x8_t avg = vhaddq_u16(s, r); - int32x4_t d0_lo = - vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); - int32x4_t d0_hi = - vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); - - uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), - vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); - d0 = vminq_u16(d0, max); - vst1q_u16(dst, d0); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } -} - -static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, - uint16_t *dst_ptr, int dst_stride, - int w, int h, - ConvolveParams *conv_params, - const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint16x4_t offset_vec = vdup_n_u16(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - - if (w == 4) { - 
do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint16x4_t avg = vhadd_u16(src, ref); - int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint16x8_t avg = vhaddq_u16(s, r); - int32x4_t d0_lo = - vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); - int32x4_t d0_hi = - vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); - - uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), - vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); - d0 = vminq_u16(d0, max); - vst1q_u16(dst, d0); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - - src_ptr += src_stride; - ref_ptr += ref_stride; - dst_ptr += dst_stride; - } while (--h != 0); - } -} - -static INLINE void highbd_12_dist_wtd_comp_avg_neon( - const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, - int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint32x4_t offset_vec = vdupq_n_u32(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); - uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); - - // Weighted averaging - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); - wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); - wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); - wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); - wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); - - uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); - wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); - wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); - int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); - - uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), - vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); - d01 = vminq_u16(d01, max); - vst1q_u16(dst, d01); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } -} - -static INLINE void highbd_dist_wtd_comp_avg_neon( - const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int 
dst_stride, - int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { - CONV_BUF_TYPE *ref_ptr = conv_params->dst; - const int ref_stride = conv_params->dst_stride; - const uint32x4_t offset_vec = vdupq_n_u32(offset); - const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); - uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); - uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); - - // Weighted averaging - if (w == 4) { - do { - const uint16x4_t src = vld1_u16(src_ptr); - const uint16x4_t ref = vld1_u16(ref_ptr); - - uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); - wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); - wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); - - uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); - d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); - - vst1_u16(dst_ptr, d0_u16); - - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } else { - do { - int width = w; - const uint16_t *src = src_ptr; - const uint16_t *ref = ref_ptr; - uint16_t *dst = dst_ptr; - do { - const uint16x8_t s = vld1q_u16(src); - const uint16x8_t r = vld1q_u16(ref); - - uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); - wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); - wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); - int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); - - uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); - wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); - wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); - int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); - - uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), - vqrshrun_n_s32(d1, ROUND_SHIFT)); - d01 = vminq_u16(d01, max); - vst1q_u16(dst, d01); - - src += 8; - ref += 8; - dst += 8; - width -= 8; - } while (width != 0); - src_ptr += src_stride; - dst_ptr += dst_stride; - ref_ptr += ref_stride; - } while (--h != 0); - } -} - static INLINE uint16x4_t highbd_12_convolve6_4( const int16x4_t s0, const int16x4_t s1, const int16x4_t s2, const int16x4_t s3, const int16x4_t s4, const int16x4_t s5, @@ -1235,7 +978,7 @@ uint16x4_t d = vshl_u16(s, round_shift_s16); d = vadd_u16(d, offset_u16); if (w == 2) { - store_u16_2x1(dst_ptr + y * dst_stride, d, 0); + store_u16_2x1(dst_ptr + y * dst_stride, d); } else { vst1_u16(dst_ptr + y * dst_stride, d); } diff -Nru aom-3.8.2/av1/common/arm/highbd_compound_convolve_neon.h aom-3.9.0/av1/common/arm/highbd_compound_convolve_neon.h --- aom-3.8.2/av1/common/arm/highbd_compound_convolve_neon.h 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_compound_convolve_neon.h 2024-05-07 19:57:02.690000000 +0000 @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" + +#define ROUND_SHIFT 2 * FILTER_BITS - ROUND0_BITS - COMPOUND_ROUND1_BITS + +static INLINE void highbd_12_comp_avg_neon(const uint16_t *src_ptr, + int src_stride, uint16_t *dst_ptr, + int dst_stride, int w, int h, + ConvolveParams *conv_params, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT - 2), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT - 2)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_comp_avg_neon(const uint16_t *src_ptr, int src_stride, + uint16_t *dst_ptr, int dst_stride, + int w, int h, + ConvolveParams *conv_params, + const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint16x4_t offset_vec = vdup_n_u16(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint16x4_t avg = vhadd_u16(src, ref); + int32x4_t d0 = vreinterpretq_s32_u32(vsubl_u16(avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint16x8_t avg = vhaddq_u16(s, r); + int32x4_t d0_lo = + vreinterpretq_s32_u32(vsubl_u16(vget_low_u16(avg), offset_vec)); + int32x4_t d0_hi = + vreinterpretq_s32_u32(vsubl_u16(vget_high_u16(avg), offset_vec)); + + uint16x8_t d0 = vcombine_u16(vqrshrun_n_s32(d0_lo, ROUND_SHIFT), + vqrshrun_n_s32(d0_hi, ROUND_SHIFT)); + d0 = vminq_u16(d0, max); + vst1q_u16(dst, d0); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + + src_ptr += src_stride; + ref_ptr += ref_stride; + dst_ptr += dst_stride; + } while (--h != 0); + } +} + +static 
INLINE void highbd_12_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT - 2); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT - 2), + vqrshrun_n_s32(d1, ROUND_SHIFT - 2)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} + +static INLINE void highbd_dist_wtd_comp_avg_neon( + const uint16_t *src_ptr, int src_stride, uint16_t *dst_ptr, int dst_stride, + int w, int h, ConvolveParams *conv_params, const int offset, const int bd) { + CONV_BUF_TYPE *ref_ptr = conv_params->dst; + const int ref_stride = conv_params->dst_stride; + const uint32x4_t offset_vec = vdupq_n_u32(offset); + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x4_t fwd_offset = vdup_n_u16(conv_params->fwd_offset); + uint16x4_t bck_offset = vdup_n_u16(conv_params->bck_offset); + + // Weighted averaging + if (w == 4) { + do { + const uint16x4_t src = vld1_u16(src_ptr); + const uint16x4_t ref = vld1_u16(ref_ptr); + + uint32x4_t wtd_avg = vmull_u16(ref, fwd_offset); + wtd_avg = vmlal_u16(wtd_avg, src, bck_offset); + wtd_avg = vshrq_n_u32(wtd_avg, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg, offset_vec)); + + uint16x4_t d0_u16 = vqrshrun_n_s32(d0, ROUND_SHIFT); + d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); + + vst1_u16(dst_ptr, d0_u16); + + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } else { + do { + int width = w; + const uint16_t *src = src_ptr; + const uint16_t *ref = ref_ptr; + uint16_t *dst = dst_ptr; + do { + const uint16x8_t s = vld1q_u16(src); + const uint16x8_t r = 
vld1q_u16(ref); + + uint32x4_t wtd_avg0 = vmull_u16(vget_low_u16(r), fwd_offset); + wtd_avg0 = vmlal_u16(wtd_avg0, vget_low_u16(s), bck_offset); + wtd_avg0 = vshrq_n_u32(wtd_avg0, DIST_PRECISION_BITS); + int32x4_t d0 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg0, offset_vec)); + + uint32x4_t wtd_avg1 = vmull_u16(vget_high_u16(r), fwd_offset); + wtd_avg1 = vmlal_u16(wtd_avg1, vget_high_u16(s), bck_offset); + wtd_avg1 = vshrq_n_u32(wtd_avg1, DIST_PRECISION_BITS); + int32x4_t d1 = vreinterpretq_s32_u32(vsubq_u32(wtd_avg1, offset_vec)); + + uint16x8_t d01 = vcombine_u16(vqrshrun_n_s32(d0, ROUND_SHIFT), + vqrshrun_n_s32(d1, ROUND_SHIFT)); + d01 = vminq_u16(d01, max); + vst1q_u16(dst, d01); + + src += 8; + ref += 8; + dst += 8; + width -= 8; + } while (width != 0); + src_ptr += src_stride; + dst_ptr += dst_stride; + ref_ptr += ref_stride; + } while (--h != 0); + } +} diff -Nru aom-3.8.2/av1/common/arm/highbd_compound_convolve_sve2.c aom-3.9.0/av1/common/arm/highbd_compound_convolve_sve2.c --- aom-3.8.2/av1/common/arm/highbd_compound_convolve_sve2.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_compound_convolve_sve2.c 2024-05-07 19:57:02.690000000 +0000 @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_neon_sve2_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" +#include "av1/common/filter.h" +#include "av1/common/arm/highbd_compound_convolve_neon.h" +#include "av1/common/arm/highbd_convolve_neon.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, +}; + +static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, int32x4_t shift) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + sum0123 = vshlq_s32(sum0123, shift); + sum4567 = vshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + +static INLINE void highbd_dist_wtd_convolve_x_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const 
int16_t *x_filter_ptr, + ConvolveParams *conv_params, const int offset) { + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + const int64x2_t offset_vec = vdupq_n_s64(offset); + + const int64x2_t offset_lo = + vcombine_s64(vget_low_s64(offset_vec), vdup_n_s64(0)); + const int16x8_t filter = vld1q_s16(x_filter_ptr); + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, shift); + uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, shift); + uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, shift); + uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, shift); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, int32x4_t shift, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + return vqmovun_s32(sum0123); +} + +static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, + int64x2_t offset, int32x4_t shift, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + sum0415 = vshlq_s32(sum0415, shift); + + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + sum2637 = vshlq_s32(sum2637, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0415), vqmovun_s32(sum2637)); + return aom_tbl_u16(res, tbl); +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE void highbd_dist_wtd_convolve_x_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, const int offset) { + // This shim allows to do only one rounding shift instead of two. 
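+  // Concretely (see the offsets computed in av1_highbd_dist_wtd_convolve_x_sve2
+  // below): offset bundles the rounding constant (1 << (round_0 - 1)) with the
+  // compound bias (1 << (bd + FILTER_BITS)) + (1 << (bd + FILTER_BITS - 1)), so
+  // the single truncating shift by round_0 both rounds the filter sum and keeps
+  // the intermediate value non-negative for the uint16_t CONV_BUF_TYPE buffer;
+  // the highbd_*comp_avg_neon helpers subtract the matching offset_avg later.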
+ const int64x2_t offset_s64 = vdupq_n_s64(offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = convolve4_4_x(s0, filter, offset_s64, shift, permute_tbl); + uint16x4_t d1 = convolve4_4_x(s1, filter, offset_s64, shift, permute_tbl); + uint16x4_t d2 = convolve4_4_x(s2, filter, offset_s64, shift, permute_tbl); + uint16x4_t d3 = convolve4_4_x(s3, filter, offset_s64, shift, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = convolve4_8_x(s0, filter, offset_s64, shift, idx); + uint16x8_t d1 = convolve4_8_x(s1, filter, offset_s64, shift, idx); + uint16x8_t d2 = convolve4_8_x(s2, filter, offset_s64, shift, idx); + uint16x8_t d3 = convolve4_8_x(s3, filter, offset_s64, shift, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_dist_wtd_convolve_x_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, + int h, const InterpFilterParams *filter_params_x, const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + CONV_BUF_TYPE *dst16 = conv_params->dst; + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + + if (x_filter_taps == 6) { + av1_highbd_dist_wtd_convolve_x_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, + conv_params, bd); + return; + } + + int dst16_stride = conv_params->dst_stride; + const int im_stride = MAX_SB_SIZE; + const int horiz_offset = filter_params_x->taps / 2 - 1; + assert(FILTER_BITS == COMPOUND_ROUND1_BITS); + const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + const int offset_avg = (1 << (offset_bits - conv_params->round_1)) + + (1 << (offset_bits - conv_params->round_1 - 1)); + const int offset_convolve = (1 << (conv_params->round_0 - 1)) + + (1 << (bd + FILTER_BITS)) + + (1 << (bd + FILTER_BITS - 1)); + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + if (conv_params->do_average) { + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, im_block, + im_stride, w, h, x_filter_ptr, + conv_params, offset_convolve); + } else { + highbd_dist_wtd_convolve_x_sve2(src, src_stride, im_block, im_stride, w, + h, x_filter_ptr, conv_params, + offset_convolve); + } + + if (conv_params->use_dist_wtd_comp_avg) { + if (bd == 12) { + 
highbd_12_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, + w, h, conv_params, offset_avg, bd); + + } else { + highbd_dist_wtd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, + h, conv_params, offset_avg, bd); + } + + } else { + if (bd == 12) { + highbd_12_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, offset_avg, bd); + + } else { + highbd_comp_avg_neon(im_block, im_stride, dst, dst_stride, w, h, + conv_params, offset_avg, bd); + } + } + } else { + if (x_filter_taps <= 4) { + highbd_dist_wtd_convolve_x_4tap_sve2(src + 2, src_stride, dst16, + dst16_stride, w, h, x_filter_ptr, + conv_params, offset_convolve); + } else { + highbd_dist_wtd_convolve_x_sve2(src, src_stride, dst16, dst16_stride, w, + h, x_filter_ptr, conv_params, + offset_convolve); + } + } +} diff -Nru aom-3.8.2/av1/common/arm/highbd_convolve_horiz_rs_neon.c aom-3.9.0/av1/common/arm/highbd_convolve_horiz_rs_neon.c --- aom-3.8.2/av1/common/arm/highbd_convolve_horiz_rs_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_convolve_horiz_rs_neon.c 2024-05-07 19:57:02.691000000 +0000 @@ -142,9 +142,9 @@ d0 = vmin_u16(d0, max); if (w == 2) { - store_u16_2x1(d + 0 * dst_stride, d0, 0); + store_u16_2x1(d, d0); } else { - vst1_u16(d + 0 * dst_stride, d0); + vst1_u16(d, d0); } src_ptr += src_stride; diff -Nru aom-3.8.2/av1/common/arm/highbd_convolve_neon.c aom-3.9.0/av1/common/arm/highbd_convolve_neon.c --- aom-3.8.2/av1/common/arm/highbd_convolve_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_convolve_neon.c 2024-05-07 19:57:02.691000000 +0000 @@ -1927,7 +1927,7 @@ uint16x4_t d0 = vrhadd_u16(s0, s1); if (w == 2) { - store_u16_2x1(dst, d0, 0); + store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } @@ -1978,7 +1978,7 @@ uint16x4_t d0 = vrhadd_u16(s0, s1); if (w == 2) { - store_u16_2x1(dst, d0, 0); + store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } @@ -2086,7 +2086,7 @@ d0 = vhadd_u16(d0, vget_low_u16(vert_offset)); if (w == 2) { - store_u16_2x1(dst, d0, 0); + store_u16_2x1(dst, d0); } else { vst1_u16(dst, d0); } diff -Nru aom-3.8.2/av1/common/arm/highbd_convolve_scale_neon.c aom-3.9.0/av1/common/arm/highbd_convolve_scale_neon.c --- aom-3.8.2/av1/common/arm/highbd_convolve_scale_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_convolve_scale_neon.c 2024-05-07 19:57:02.694000000 +0000 @@ -51,7 +51,7 @@ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); if (w == 2) { - store_u16_2x1(dst_ptr, d0_u16, 0); + store_u16_2x1(dst_ptr, d0_u16); } else { vst1_u16(dst_ptr, d0_u16); } @@ -123,7 +123,7 @@ d0_u16 = vmin_u16(d0_u16, vget_low_u16(max)); if (w == 2) { - store_u16_2x1(dst_ptr, d0_u16, 0); + store_u16_2x1(dst_ptr, d0_u16); } else { vst1_u16(dst_ptr, d0_u16); } @@ -260,9 +260,9 @@ s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32); if (w == 2) { - store_u16_2x1(d + 0 * dst_stride, d0, 0); + store_u16_2x1(d, d0); } else { - vst1_u16(d + 0 * dst_stride, d0); + vst1_u16(d, d0); } src_ptr += src_stride; @@ -398,7 +398,7 @@ offset_s32, vdupq_n_s32(0)); if (w == 2) { - store_u16_2x1(d, d0, 0); + store_u16_2x1(d, d0); } else { vst1_u16(d, d0); } @@ -458,7 +458,7 @@ uint16x4_t d = vqmovun_s32(d0); d = vmin_u16(d, vget_low_u16(max)); if (w == 2) { - store_u16_2x1(dst_ptr + y * dst_stride, d, 0); + store_u16_2x1(dst_ptr + y * dst_stride, d); } else { vst1_u16(dst_ptr + y * dst_stride, d); } diff -Nru aom-3.8.2/av1/common/arm/highbd_convolve_sve2.c aom-3.9.0/av1/common/arm/highbd_convolve_sve2.c --- 
aom-3.8.2/av1/common/arm/highbd_convolve_sve2.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_convolve_sve2.c 2024-05-07 19:57:02.695000000 +0000 @@ -0,0 +1,1798 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/aom_neon_sve2_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/convolve.h" +#include "av1/common/filter.h" + +DECLARE_ALIGNED(16, static const uint16_t, kDotProdTbl[32]) = { + 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, + 4, 5, 6, 7, 5, 6, 7, 0, 6, 7, 0, 1, 7, 0, 1, 2, +}; + +static INLINE uint16x4_t convolve12_4_x( + int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, + const int64x2_t offset, uint16x8x4_t permute_tbl, uint16x4_t max) { + int16x8_t permuted_samples[6]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int32x4_t res0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(res0123, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE uint16x8_t convolve12_8_x(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t filter_0_7, + int16x8_t filter_4_11, int64x2_t offset, + uint16x8x4_t permute_tbl, + uint16x8_t max) { + int16x8_t permuted_samples[8]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]); + permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = 
aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int64x2_t sum45 = + aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1); + + int64x2_t sum67 = + aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_x_sr_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int bd) { + // This shim allows to do only one rounding shift instead of two. + const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1)); + + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64( + vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL))); + permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0); + + uint16x8_t correction1 = vreinterpretq_u16_u64( + vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL), + vdup_n_u64(svcnth() * 0x0001000100010000ULL))); + permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + const int16_t *s = (const int16_t *)src; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6); + load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7); + + uint16x4_t d0 = convolve12_4_x(s0, s1, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x4_t d1 = convolve12_4_x(s2, s3, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x4_t d2 = convolve12_4_x(s4, s5, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x4_t d3 = convolve12_4_x(s6, s7, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; + load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9); + load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10); + load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11); + + uint16x8_t d0 = convolve12_8_x(s0, s1, s2, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x8_t d1 = convolve12_8_x(s3, s4, s5, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x8_t d2 = convolve12_8_x(s6, s7, s8, y_filter_0_7, y_filter_4_11, + offset, permute_tbl, max); + uint16x8_t d3 
= convolve12_8_x(s9, s10, s11, y_filter_0_7, + y_filter_4_11, offset, permute_tbl, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +static INLINE uint16x8_t convolve8_8_x(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, uint16x8_t max) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_x_sr_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int bd) { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + // This shim allows to do only one rounding shift instead of two. + const int64_t offset = 1 << (conv_params->round_0 - 1); + const int64x2_t offset_lo = vcombine_s64((int64x1_t)(offset), vdup_n_s64(0)); + + const int16x8_t filter = vld1q_s16(y_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve8_8_x(s0, filter, offset_lo, max); + uint16x8_t d1 = convolve8_8_x(s1, filter, offset_lo, max); + uint16x8_t d2 = convolve8_8_x(s2, filter, offset_lo, max); + uint16x8_t d3 = convolve8_8_x(s3, filter, offset_lo, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDeinterleaveTbl[8]) = { + 0, 2, 4, 6, 1, 3, 5, 7, +}; +// clang-format on + +static INLINE uint16x4_t convolve4_4_x(int16x8_t s0, int16x8_t filter, + int64x2_t offset, + uint16x8x2_t permute_tbl, + uint16x4_t max) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + + return vmin_u16(res, max); +} + 
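+// Each 64-bit dot product in convolve4_8_x below yields two outputs four
+// columns apart (sum04 = {o0, o4}, sum15 = {o1, o5}, ...), so the narrowed
+// result is interleaved as {o0, o4, o1, o5, o2, o6, o3, o7}. The final
+// aom_tbl_u16 with kDeinterleaveTbl restores natural order; a scalar model:
+//   uint16_t in[8] = { o0, o4, o1, o5, o2, o6, o3, o7 };
+//   uint16_t out[8];
+//   for (int i = 0; i < 8; ++i) out[i] = in[kDeinterleaveTbl[i]];
+//   // out == { o0, o1, o2, o3, o4, o5, o6, o7 }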
+static INLINE uint16x8_t convolve4_8_x(int16x8_t s0[4], int16x8_t filter, + int64x2_t offset, uint16x8_t tbl, + uint16x8_t max) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0415 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum2637 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0415, FILTER_BITS), + vqrshrun_n_s32(sum2637, FILTER_BITS)); + res = aom_tbl_u16(res, tbl); + + return vminq_u16(res, max); +} + +static INLINE void highbd_convolve_x_sr_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, int bd) { + // This shim allows to do only one rounding shift instead of two. + const int64x2_t offset = vdupq_n_s64(1 << (conv_params->round_0 - 1)); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + const int16_t *s = (const int16_t *)(src); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = convolve4_4_x(s0, filter, offset, permute_tbl, max); + uint16x4_t d1 = convolve4_4_x(s1, filter, offset, permute_tbl, max); + uint16x4_t d2 = convolve4_4_x(s2, filter, offset, permute_tbl, max); + uint16x4_t d3 = convolve4_4_x(s3, filter, offset, permute_tbl, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[4], s1[4], s2[4], s3[4]; + load_s16_8x4(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3]); + load_s16_8x4(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3]); + load_s16_8x4(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3]); + load_s16_8x4(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3]); + + uint16x8_t d0 = convolve4_8_x(s0, filter, offset, idx, max); + uint16x8_t d1 = convolve4_8_x(s1, filter, offset, idx, max); + uint16x8_t d2 = convolve4_8_x(s2, filter, offset, idx, max); + uint16x8_t d3 = convolve4_8_x(s3, filter, offset, idx, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } +} + +void av1_highbd_convolve_x_sr_sve2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const int subpel_x_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_x_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, bd); + return; + } + + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + + if (x_filter_taps == 6) { + av1_highbd_convolve_x_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_x, subpel_x_qn, conv_params, + bd); + return; + } + + const int horiz_offset = 
filter_params_x->taps / 2 - 1; + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + + src -= horiz_offset; + + if (x_filter_taps == 12) { + highbd_convolve_x_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + + if (x_filter_taps == 8) { + highbd_convolve_x_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); + return; + } + + highbd_convolve_x_sr_4tap_sve2(src + 2, src_stride, dst, dst_stride, w, h, + x_filter_ptr, conv_params, bd); +} + +// clang-format off +DECLARE_ALIGNED(16, static const uint16_t, kDotProdMergeBlockTbl[24]) = { + // Shift left and insert new last column in transposed 4x4 block. + 1, 2, 3, 0, 5, 6, 7, 4, + // Shift left and insert two new columns in transposed 4x4 block. + 2, 3, 0, 1, 6, 7, 4, 5, + // Shift left and insert three new columns in transposed 4x4 block. + 3, 0, 1, 2, 7, 4, 5, 6, +}; +// clang-format on + +static INLINE void transpose_concat_4x4(int16x4_t s0, int16x4_t s1, + int16x4_t s2, int16x4_t s3, + int16x8_t res[2]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03 + // s1: 10, 11, 12, 13 + // s2: 20, 21, 22, 23 + // s3: 30, 31, 32, 33 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + + int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); + int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); + int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); + int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); + + int32x4_t s01 = vreinterpretq_s32_s16(vzip1q_s16(s0q, s1q)); + int32x4_t s23 = vreinterpretq_s32_s16(vzip1q_s16(s2q, s3q)); + + int32x4x2_t s0123 = vzipq_s32(s01, s23); + + res[0] = vreinterpretq_s16_s32(s0123.val[0]); + res[1] = vreinterpretq_s16_s32(s0123.val[1]); +} + +static INLINE void transpose_concat_8x4(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t s3, + int16x8_t res[4]) { + // Transpose 16-bit elements and concatenate result rows as follows: + // s0: 00, 01, 02, 03, 04, 05, 06, 07 + // s1: 10, 11, 12, 13, 14, 15, 16, 17 + // s2: 20, 21, 22, 23, 24, 25, 26, 27 + // s3: 30, 31, 32, 33, 34, 35, 36, 37 + // + // res[0]: 00 10 20 30 01 11 21 31 + // res[1]: 02 12 22 32 03 13 23 33 + // res[2]: 04 14 24 34 05 15 25 35 + // res[3]: 06 16 26 36 07 17 27 37 + + int16x8x2_t tr01_16 = vzipq_s16(s0, s1); + int16x8x2_t tr23_16 = vzipq_s16(s2, s3); + + int32x4x2_t tr01_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[0]), + vreinterpretq_s32_s16(tr23_16.val[0])); + int32x4x2_t tr23_32 = vzipq_s32(vreinterpretq_s32_s16(tr01_16.val[1]), + vreinterpretq_s32_s16(tr23_16.val[1])); + + res[0] = vreinterpretq_s16_s32(tr01_32.val[0]); + res[1] = vreinterpretq_s16_s32(tr01_32.val[1]); + res[2] = vreinterpretq_s16_s32(tr23_32.val[0]); + res[3] = vreinterpretq_s16_s32(tr23_32.val[1]); +} + +static INLINE void aom_tbl2x4_s16(int16x8_t t0[4], int16x8_t t1[4], + uint16x8_t tbl, int16x8_t res[4]) { + res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); + res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); + res[2] = aom_tbl2_s16(t0[2], t1[2], tbl); + res[3] = aom_tbl2_s16(t0[3], t1[3], tbl); +} + +static INLINE void aom_tbl2x2_s16(int16x8_t t0[2], int16x8_t t1[2], + uint16x8_t tbl, int16x8_t res[2]) { + res[0] = aom_tbl2_s16(t0[0], t1[0], tbl); + res[1] = aom_tbl2_s16(t0[1], t1[1], tbl); +} + +static INLINE uint16x4_t highbd_convolve12_4_y(int16x8_t s0[2], int16x8_t s1[2], + int16x8_t s2[2], + int16x8_t filter_0_7, + int16x8_t filter_4_11, + uint16x4_t max) { + int64x2_t 
sum[2]; + + sum[0] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[0], filter_0_7, 0); + sum[0] = aom_svdot_lane_s16(sum[0], s1[0], filter_0_7, 1); + sum[0] = aom_svdot_lane_s16(sum[0], s2[0], filter_4_11, 1); + + sum[1] = aom_svdot_lane_s16(vdupq_n_s64(0), s0[1], filter_0_7, 0); + sum[1] = aom_svdot_lane_s16(sum[1], s1[1], filter_0_7, 1); + sum[1] = aom_svdot_lane_s16(sum[1], s2[1], filter_4_11, 1); + + int32x4_t res_s32 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[1])); + + uint16x4_t res = vqrshrun_n_s32(res_s32, FILTER_BITS); + + return vmin_u16(res, max); +} + +static INLINE void highbd_convolve_y_sr_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, int bd) { + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + do { + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + int h = height; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_4x4(s4, s5, s6, s7, s4567); + transpose_concat_4x4(s5, s6, s7, s8, s5678); + transpose_concat_4x4(s6, s7, s8, s9, s6789); + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_4x4(sB, sC, sD, sE, sBCDE); + + // Use the above transpose and reuse data from the previous loop to get + // the rest. + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); + + uint16x4_t d0 = highbd_convolve12_4_y(s0123, s4567, s89AB, y_filter_0_7, + y_filter_4_11, max); + uint16x4_t d1 = highbd_convolve12_4_y(s1234, s5678, s9ABC, y_filter_0_7, + y_filter_4_11, max); + uint16x4_t d2 = highbd_convolve12_4_y(s2345, s6789, sABCD, y_filter_0_7, + y_filter_4_11, max); + uint16x4_t d3 = highbd_convolve12_4_y(s3456, s789A, sBCDE, y_filter_0_7, + y_filter_4_11, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
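+      // highbd_convolve12_4_y consumes three transposed blocks per output
+      // (taps 0-3, 4-7 and 8-11), so eight blocks stay live across iterations
+      // and only sBCDE needs a fresh transpose; everything else just shifts
+      // down by four rows here.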
+ s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 4; + dst += 4; + width -= 4; + } while (width != 0); +} + +static INLINE uint16x4_t highbd_convolve8_4_y(int16x8_t samples_lo[2], + int16x8_t samples_hi[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_y(int16x8_t samples_lo[4], + int16x8_t samples_hi[4], + int16x8_t filter, + uint16x8_t max) { + int64x2_t sum01 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = + aom_svdot_lane_s16(vdupq_n_s64(0), samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +void highbd_convolve_y_sr_8tap_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + int width, int height, + const int16_t *filter_y, int bd) { + assert(w >= 4 && h >= 4); + + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. + uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. 
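+    // After transpose_concat_4x4 each int16x8_t holds four consecutive rows
+    // of one column per 64-bit lane, which is the layout aom_svdot_lane_s16
+    // expects: filter lane 0 applies taps 0-3 to s0123, filter lane 1 applies
+    // taps 4-7 to s4567. In the loop below only four new rows are transposed;
+    // the kDotProdMergeBlockTbl lookups rebuild the in-between blocks from the
+    // previous iteration, e.g. merge_block_tbl.val[0], after the svcnth()
+    // correction above, reads { 1, 2, 3, svcnth(), 5, 6, 7, svcnth() + 4 } and
+    // takes rows 4-6 from s3456 plus row 7 from the new s789A to form s4567.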
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = highbd_convolve8_4_y(s0123, s4567, y_filter, max); + uint16x4_t d1 = highbd_convolve8_4_y(s1234, s5678, y_filter, max); + uint16x4_t d2 = highbd_convolve8_4_y(s2345, s6789, y_filter, max); + uint16x4_t d3 = highbd_convolve8_4_y(s3456, s789A, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = highbd_convolve8_8_y(s0123, s4567, y_filter, max); + uint16x8_t d1 = highbd_convolve8_8_y(s1234, s5678, y_filter, max); + uint16x8_t d2 = highbd_convolve8_8_y(s2345, s6789, y_filter, max); + uint16x8_t d3 = highbd_convolve8_8_y(s3456, s789A, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_y(int16x8_t samples[2], + int16x8_t filter, + uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + uint16x4_t res = vqrshrun_n_s32(sum0123, FILTER_BITS); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_y(int16x8_t samples[4], + int16x8_t filter, + uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[1], filter, 0); + int64x2_t sum45 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[2], filter, 0); + int64x2_t sum67 = aom_svdot_lane_s16(vdupq_n_s64(0), samples[3], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + uint16x8_t res = vcombine_u16(vqrshrun_n_s32(sum0123, FILTER_BITS), + vqrshrun_n_s32(sum4567, FILTER_BITS)); + return vminq_u16(res, max); +} + +void highbd_convolve_y_sr_4tap_sve2(const uint16_t *src, ptrdiff_t src_stride, + uint16_t *dst, ptrdiff_t dst_stride, + int width, int height, + const int16_t *filter_y, int bd) { + assert(w >= 4 && h >= 4); + + const int16x8_t y_filter = + vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + uint16x4_t d0 = highbd_convolve4_4_y(s0123, y_filter, max); + uint16x4_t d1 = highbd_convolve4_4_y(s1234, y_filter, max); + uint16x4_t d2 = highbd_convolve4_4_y(s2345, y_filter, max); + uint16x4_t d3 = highbd_convolve4_4_y(s3456, y_filter, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample + // permute required before computing the dot product. 
+ int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + uint16x8_t d0 = highbd_convolve4_8_y(s0123, y_filter, max); + uint16x8_t d1 = highbd_convolve4_8_y(s1234, y_filter, max); + uint16x8_t d2 = highbd_convolve4_8_y(s2345, y_filter, max); + uint16x8_t d3 = highbd_convolve4_8_y(s3456, y_filter, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_highbd_convolve_y_sr_sve2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_y, + const int subpel_y_qn, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (y_filter_taps == 6) { + av1_highbd_convolve_y_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_y, subpel_y_qn, bd); + return; + } + + const int vert_offset = filter_params_y->taps / 2 - 1; + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + + src -= vert_offset * src_stride; + + if (y_filter_taps > 8) { + highbd_convolve_y_sr_12tap_sve2(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); + return; + } + + if (y_filter_taps == 4) { + highbd_convolve_y_sr_4tap_sve2(src + 2 * src_stride, src_stride, dst, + dst_stride, w, h, y_filter_ptr, bd); + return; + } + + highbd_convolve_y_sr_8tap_sve2(src, src_stride, dst, dst_stride, w, h, + y_filter_ptr, bd); +} + +static INLINE uint16x4_t convolve12_4_2d_h( + int16x8_t s0, int16x8_t s1, int16x8_t filter_0_7, int16x8_t filter_4_11, + const int64x2_t offset, int32x4_t shift, uint16x8x4_t permute_tbl) { + int16x8_t permuted_samples[6]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vqrshlq_s32(sum0123, shift); + return vqmovun_s32(sum0123); +} + +static INLINE uint16x8_t convolve12_8_2d_h(int16x8_t s0, int16x8_t s1, + int16x8_t s2, int16x8_t filter_0_7, + int16x8_t filter_4_11, + int64x2_t offset, int32x4_t shift, + uint16x8x4_t permute_tbl) { + int16x8_t permuted_samples[8]; + permuted_samples[0] = aom_tbl_s16(s0, permute_tbl.val[0]); + permuted_samples[1] = aom_tbl_s16(s0, permute_tbl.val[1]); + permuted_samples[2] = 
aom_tbl2_s16(s0, s1, permute_tbl.val[2]); + permuted_samples[3] = aom_tbl2_s16(s0, s1, permute_tbl.val[3]); + permuted_samples[4] = aom_tbl_s16(s1, permute_tbl.val[0]); + permuted_samples[5] = aom_tbl_s16(s1, permute_tbl.val[1]); + permuted_samples[6] = aom_tbl2_s16(s1, s2, permute_tbl.val[2]); + permuted_samples[7] = aom_tbl2_s16(s1, s2, permute_tbl.val[3]); + + int64x2_t sum01 = + aom_svdot_lane_s16(offset, permuted_samples[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[2], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, permuted_samples[4], filter_4_11, 1); + + int64x2_t sum23 = + aom_svdot_lane_s16(offset, permuted_samples[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[3], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, permuted_samples[5], filter_4_11, 1); + + int64x2_t sum45 = + aom_svdot_lane_s16(offset, permuted_samples[2], filter_0_7, 0); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[4], filter_0_7, 1); + sum45 = aom_svdot_lane_s16(sum45, permuted_samples[6], filter_4_11, 1); + + int64x2_t sum67 = + aom_svdot_lane_s16(offset, permuted_samples[3], filter_0_7, 0); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[5], filter_0_7, 1); + sum67 = aom_svdot_lane_s16(sum67, permuted_samples[7], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + sum0123 = vqrshlq_s32(sum0123, shift); + sum4567 = vqrshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + +static INLINE void highbd_convolve_2d_sr_horiz_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, const int x_offset) { + const int64x2_t offset = vdupq_n_s64(x_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x4_t permute_tbl = vld1q_u16_x4(kDotProdTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
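+  // With an SVE two-vector TBL, indices 0 .. svcnth()-1 select from the first
+  // source vector and svcnth() .. 2*svcnth()-1 from the second, so the entries
+  // of kDotProdTbl that wrapped around to 0, 1, 2 are bumped by svcnth() at
+  // run time. The 64-bit constants below simply place svcnth() in the right
+  // 16-bit lanes, e.g. for permute_tbl.val[3]:
+  //   { 6, 7, 0, 1, 7, 0, 1, 2 }
+  //     -> { 6, 7, svcnth(), svcnth() + 1,
+  //          7, svcnth(), svcnth() + 1, svcnth() + 2 }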
+ uint16x8_t correction0 = vreinterpretq_u16_u64(vcombine_u64( + vdup_n_u64(0), vdup_n_u64(svcnth() * 0x0001000000000000ULL))); + permute_tbl.val[2] = vaddq_u16(permute_tbl.val[2], correction0); + + uint16x8_t correction1 = vreinterpretq_u16_u64( + vcombine_u64(vdup_n_u64(svcnth() * 0x0001000100000000ULL), + vdup_n_u64(svcnth() * 0x0001000100010000ULL))); + permute_tbl.val[3] = vaddq_u16(permute_tbl.val[3], correction1); + + if (width == 4) { + const int16_t *s = (const int16_t *)src; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7; + load_s16_8x4(s, src_stride, &s0, &s2, &s4, &s6); + load_s16_8x4(s + 8, src_stride, &s1, &s3, &s5, &s7); + + uint16x4_t d0 = convolve12_4_2d_h(s0, s1, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + uint16x4_t d1 = convolve12_4_2d_h(s2, s3, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + uint16x4_t d2 = convolve12_4_2d_h(s4, s5, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + uint16x4_t d3 = convolve12_4_2d_h(s6, s7, y_filter_0_7, y_filter_4_11, + offset, shift, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + dst += 4 * dst_stride; + s += 4 * src_stride; + height -= 4; + } while (height > 0); + } else { + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11; + load_s16_8x4(s, src_stride, &s0, &s3, &s6, &s9); + load_s16_8x4(s + 8, src_stride, &s1, &s4, &s7, &s10); + load_s16_8x4(s + 16, src_stride, &s2, &s5, &s8, &s11); + + uint16x8_t d0 = + convolve12_8_2d_h(s0, s1, s2, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + uint16x8_t d1 = + convolve12_8_2d_h(s3, s4, s5, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + uint16x8_t d2 = + convolve12_8_2d_h(s6, s7, s8, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + uint16x8_t d3 = + convolve12_8_2d_h(s9, s10, s11, y_filter_0_7, y_filter_4_11, offset, + shift, permute_tbl); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +static INLINE uint16x8_t convolve8_8_2d_h(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, int32x4_t shift) { + int64x2_t sum[8]; + sum[0] = aom_sdotq_s16(offset, s0[0], filter); + sum[1] = aom_sdotq_s16(offset, s0[1], filter); + sum[2] = aom_sdotq_s16(offset, s0[2], filter); + sum[3] = aom_sdotq_s16(offset, s0[3], filter); + sum[4] = aom_sdotq_s16(offset, s0[4], filter); + sum[5] = aom_sdotq_s16(offset, s0[5], filter); + sum[6] = aom_sdotq_s16(offset, s0[6], filter); + sum[7] = aom_sdotq_s16(offset, s0[7], filter); + + sum[0] = vpaddq_s64(sum[0], sum[1]); + sum[2] = vpaddq_s64(sum[2], sum[3]); + sum[4] = vpaddq_s64(sum[4], sum[5]); + sum[6] = vpaddq_s64(sum[6], sum[7]); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum[0]), vmovn_s64(sum[2])); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum[4]), vmovn_s64(sum[6])); + + sum0123 = vqrshlq_s32(sum0123, shift); + sum4567 = vqrshlq_s32(sum4567, shift); + + return vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); +} + +static INLINE void highbd_convolve_2d_sr_horiz_8tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, const int x_offset) { + const int64x2_t offset = vdupq_n_s64(x_offset); + const int64x2_t offset_lo = vcombine_s64(vget_low_s64(offset), vdup_n_s64(0)); + 
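+  // aom_sdotq_s16 accumulates a 4-element dot product into each 64-bit lane,
+  // and convolve8_8_2d_h then pair-adds the two lanes into one 8-tap sum per
+  // pixel, so keeping x_offset in the low lane only (zero in the high lane)
+  // adds the offset exactly once per output after the vpaddq_s64 reduction.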
const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x8_t filter = vld1q_s16(y_filter_ptr); + + do { + const int16_t *s = (const int16_t *)src; + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve8_8_2d_h(s0, filter, offset_lo, shift); + uint16x8_t d1 = convolve8_8_2d_h(s1, filter, offset_lo, shift); + uint16x8_t d2 = convolve8_8_2d_h(s2, filter, offset_lo, shift); + uint16x8_t d3 = convolve8_8_2d_h(s3, filter, offset_lo, shift); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); +} + +static INLINE uint16x4_t convolve4_4_2d_h(int16x8_t s0, int16x8_t filter, + int64x2_t offset, int32x4_t shift, + uint16x8x2_t permute_tbl) { + int16x8_t permuted_samples0 = aom_tbl_s16(s0, permute_tbl.val[0]); + int16x8_t permuted_samples1 = aom_tbl_s16(s0, permute_tbl.val[1]); + + int64x2_t sum01 = aom_svdot_lane_s16(offset, permuted_samples0, filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, permuted_samples1, filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vqrshlq_s32(sum0123, shift); + return vqmovun_s32(sum0123); +} + +static INLINE uint16x8_t convolve4_8_2d_h(int16x8_t s0[8], int16x8_t filter, + int64x2_t offset, int32x4_t shift, + uint16x8_t tbl) { + int64x2_t sum04 = aom_svdot_lane_s16(offset, s0[0], filter, 0); + int64x2_t sum15 = aom_svdot_lane_s16(offset, s0[1], filter, 0); + int64x2_t sum26 = aom_svdot_lane_s16(offset, s0[2], filter, 0); + int64x2_t sum37 = aom_svdot_lane_s16(offset, s0[3], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum04), vmovn_s64(sum15)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum26), vmovn_s64(sum37)); + + sum0123 = vqrshlq_s32(sum0123, shift); + sum4567 = vqrshlq_s32(sum4567, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + return aom_tbl_u16(res, tbl); +} + +static INLINE void highbd_convolve_2d_sr_horiz_4tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *x_filter_ptr, + ConvolveParams *conv_params, const int x_offset) { + const int64x2_t offset = vdupq_n_s64(x_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_0); + + const int16x4_t x_filter = vld1_s16(x_filter_ptr + 2); + const int16x8_t filter = vcombine_s16(x_filter, vdup_n_s16(0)); + + if (width == 4) { + const int16_t *s = (const int16_t *)(src); + + uint16x8x2_t permute_tbl = vld1q_u16_x2(kDotProdTbl); + + do { + int16x8_t s0, s1, s2, s3; + load_s16_8x4(s, src_stride, &s0, &s1, &s2, &s3); + + uint16x4_t d0 = convolve4_4_2d_h(s0, filter, offset, shift, permute_tbl); + uint16x4_t d1 = convolve4_4_2d_h(s1, filter, offset, shift, permute_tbl); + uint16x4_t d2 = convolve4_4_2d_h(s2, filter, offset, shift, permute_tbl); + uint16x4_t d3 = convolve4_4_2d_h(s3, filter, offset, shift, permute_tbl); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + s += 4 * src_stride; + dst += 4 * 
dst_stride; + height -= 4; + } while (height > 0); + } else { + uint16x8_t idx = vld1q_u16(kDeinterleaveTbl); + + do { + const int16_t *s = (const int16_t *)(src); + uint16_t *d = dst; + int w = width; + + do { + int16x8_t s0[8], s1[8], s2[8], s3[8]; + load_s16_8x8(s + 0 * src_stride, 1, &s0[0], &s0[1], &s0[2], &s0[3], + &s0[4], &s0[5], &s0[6], &s0[7]); + load_s16_8x8(s + 1 * src_stride, 1, &s1[0], &s1[1], &s1[2], &s1[3], + &s1[4], &s1[5], &s1[6], &s1[7]); + load_s16_8x8(s + 2 * src_stride, 1, &s2[0], &s2[1], &s2[2], &s2[3], + &s2[4], &s2[5], &s2[6], &s2[7]); + load_s16_8x8(s + 3 * src_stride, 1, &s3[0], &s3[1], &s3[2], &s3[3], + &s3[4], &s3[5], &s3[6], &s3[7]); + + uint16x8_t d0 = convolve4_8_2d_h(s0, filter, offset, shift, idx); + uint16x8_t d1 = convolve4_8_2d_h(s1, filter, offset, shift, idx); + uint16x8_t d2 = convolve4_8_2d_h(s2, filter, offset, shift, idx); + uint16x8_t d3 = convolve4_8_2d_h(s3, filter, offset, shift, idx); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + s += 8; + d += 8; + w -= 8; + } while (w != 0); + src += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height > 0); + } +} + +static INLINE uint16x4_t highbd_convolve12_4_2d_v( + int16x8_t s0[2], int16x8_t s1[2], int16x8_t s2[2], int16x8_t filter_0_7, + int16x8_t filter_4_11, int32x4_t shift, int64x2_t offset, uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, s0[0], filter_0_7, 0); + sum01 = aom_svdot_lane_s16(sum01, s1[0], filter_0_7, 1); + sum01 = aom_svdot_lane_s16(sum01, s2[0], filter_4_11, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, s0[1], filter_0_7, 0); + sum23 = aom_svdot_lane_s16(sum23, s1[1], filter_0_7, 1); + sum23 = aom_svdot_lane_s16(sum23, s2[1], filter_4_11, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + uint16x4_t res = vqmovun_s32(sum0123); + + return vmin_u16(res, max); +} + +static INLINE void highbd_convolve_2d_sr_vert_12tap_sve2( + const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, + int width, int height, const int16_t *y_filter_ptr, + ConvolveParams *conv_params, int bd, const int y_offset) { + const int64x2_t offset = vdupq_n_s64(y_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); + + const int16x8_t y_filter_0_7 = vld1q_s16(y_filter_ptr); + const int16x8_t y_filter_4_11 = vld1q_s16(y_filter_ptr + 4); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
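
The correction constants that follow rely on multiplying 0x0001000000000000, 0x0001000100000000 or 0x0001000100010000 by svcnth(), which bumps the last one, two or three 16-bit entries of every 64-bit group by the vector length in halfwords, so that table entries meant to pick from the second operand of a two-vector TBL still do so when SVE vectors are wider than 128 bits. A minimal standalone sketch of that arithmetic; the base table values here are hypothetical and are not the real kDotProdMergeBlockTbl contents:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint16_t tbl[8] = { 1, 2, 3, 0, 5, 6, 7, 4 };  // hypothetical base table
  for (unsigned vl_h = 8; vl_h <= 32; vl_h *= 2) {     // 128-, 256-, 512-bit SVE
    const uint64_t correction = (uint64_t)vl_h * 0x0001000000000000ULL;
    printf("VL=%2u halfwords:", vl_h);
    for (int i = 0; i < 8; ++i) {
      // Only the top 16-bit lane of each 64-bit group is non-zero, so entries
      // 3 and 7 gain vl_h and now select from the second TBL operand.
      const uint16_t add = (uint16_t)(correction >> (16 * (i % 4)));
      printf(" %u", (unsigned)(tbl[i] + add));
    }
    printf("\n");
  }
  return 0;
}
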
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + + do { + int16_t *s = (int16_t *)src; + uint16_t *d = (uint16_t *)dst; + int h = height; + + int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA; + load_s16_4x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8, + &s9, &sA); + s += 11 * src_stride; + + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2], s4567[2], s5678[2], + s6789[2], s789A[2]; + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + transpose_concat_4x4(s4, s5, s6, s7, s4567); + transpose_concat_4x4(s5, s6, s7, s8, s5678); + transpose_concat_4x4(s6, s7, s8, s9, s6789); + transpose_concat_4x4(s7, s8, s9, sA, s789A); + + do { + int16x4_t sB, sC, sD, sE; + load_s16_4x4(s, src_stride, &sB, &sC, &sD, &sE); + + int16x8_t s89AB[2], s9ABC[2], sABCD[2], sBCDE[2]; + transpose_concat_4x4(sB, sC, sD, sE, sBCDE); + + // Use the above transpose and reuse data from the previous loop to get + // the rest. + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[0], s89AB); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[1], s9ABC); + aom_tbl2x2_s16(s789A, sBCDE, merge_block_tbl.val[2], sABCD); + + uint16x4_t d0 = highbd_convolve12_4_2d_v( + s0123, s4567, s89AB, y_filter_0_7, y_filter_4_11, shift, offset, max); + uint16x4_t d1 = highbd_convolve12_4_2d_v( + s1234, s5678, s9ABC, y_filter_0_7, y_filter_4_11, shift, offset, max); + uint16x4_t d2 = highbd_convolve12_4_2d_v( + s2345, s6789, sABCD, y_filter_0_7, y_filter_4_11, shift, offset, max); + uint16x4_t d3 = highbd_convolve12_4_2d_v( + s3456, s789A, sBCDE, y_filter_0_7, y_filter_4_11, shift, offset, max); + + store_u16_4x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. 
+ s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s4567[0] = s89AB[0]; + s4567[1] = s89AB[1]; + s5678[0] = s9ABC[0]; + s5678[1] = s9ABC[1]; + s6789[0] = sABCD[0]; + s6789[1] = sABCD[1]; + s789A[0] = sBCDE[0]; + s789A[1] = sBCDE[1]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 4; + dst += 4; + width -= 4; + } while (width != 0); +} + +static INLINE uint16x4_t highbd_convolve8_4_2d_v( + int16x8_t samples_lo[2], int16x8_t samples_hi[2], int16x8_t filter, + int32x4_t shift, int64x2_t offset, uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + uint16x4_t res = vqmovun_s32(sum0123); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve8_8_2d_v( + int16x8_t samples_lo[4], int16x8_t samples_hi[4], int16x8_t filter, + int32x4_t shift, int64x2_t offset, uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples_lo[0], filter, 0); + sum01 = aom_svdot_lane_s16(sum01, samples_hi[0], filter, 1); + + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples_lo[1], filter, 0); + sum23 = aom_svdot_lane_s16(sum23, samples_hi[1], filter, 1); + + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples_lo[2], filter, 0); + sum45 = aom_svdot_lane_s16(sum45, samples_hi[2], filter, 1); + + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples_lo[3], filter, 0); + sum67 = aom_svdot_lane_s16(sum67, samples_hi[3], filter, 1); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + sum0123 = vshlq_s32(sum0123, shift); + sum4567 = vshlq_s32(sum4567, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + return vminq_u16(res, max); +} + +void highbd_convolve_2d_sr_vert_8tap_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, + int height, const int16_t *filter_y, + ConvolveParams *conv_params, int bd, + const int y_offset) { + assert(w >= 4 && h >= 4); + const int64x2_t offset = vdupq_n_s64(y_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); + const int16x8_t y_filter = vld1q_s16(filter_y); + + uint16x8x3_t merge_block_tbl = vld1q_u16_x3(kDotProdMergeBlockTbl); + // Scale indices by size of the true vector length to avoid reading from an + // 'undefined' portion of a vector on a system with SVE vectors > 128-bit. 
+ uint16x8_t correction0 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000000000000ULL)); + merge_block_tbl.val[0] = vaddq_u16(merge_block_tbl.val[0], correction0); + + uint16x8_t correction1 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100000000ULL)); + merge_block_tbl.val[1] = vaddq_u16(merge_block_tbl.val[1], correction1); + + uint16x8_t correction2 = + vreinterpretq_u16_u64(vdupq_n_u64(svcnth() * 0x0001000100010000ULL)); + merge_block_tbl.val[2] = vaddq_u16(merge_block_tbl.val[2], correction2); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)src; + + int16x4_t s0, s1, s2, s3, s4, s5, s6; + load_s16_4x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + do { + int16x4_t s7, s8, s9, s10; + load_s16_4x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[2], s5678[2], s6789[2], s789A[2]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_4x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x2_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x4_t d0 = + highbd_convolve8_4_2d_v(s0123, s4567, y_filter, shift, offset, max); + uint16x4_t d1 = + highbd_convolve8_4_2d_v(s1234, s5678, y_filter, shift, offset, max); + uint16x4_t d2 = + highbd_convolve8_4_2d_v(s2345, s6789, y_filter, shift, offset, max); + uint16x4_t d3 = + highbd_convolve8_4_2d_v(s3456, s789A, y_filter, shift, offset, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)src; + uint16_t *d = dst; + + int16x8_t s0, s1, s2, s3, s4, s5, s6; + load_s16_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6); + s += 7 * src_stride; + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + do { + int16x8_t s7, s8, s9, s10; + load_s16_8x4(s, src_stride, &s7, &s8, &s9, &s10); + + int16x8_t s4567[4], s5678[4], s6789[4], s789A[4]; + // Transpose and shuffle the 4 lines that were loaded. + transpose_concat_8x4(s7, s8, s9, s10, s789A); + + // Merge new data into block from previous iteration. 
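
A scalar sketch of what the three merge lookups below are assumed to achieve: every 64-bit group of these vectors holds four vertically consecutive samples, so sliding the filter window down by one, two or three rows amounts to dropping that many samples from the old group and appending the same number from the freshly transposed block. This models the intent only and is not the actual kDotProdMergeBlockTbl definition:

#include <stdint.h>

void merge_groups_model(const int16_t prev[8], const int16_t next[8], int shift,
                        int16_t out[8]) {
  // shift = 1 rebuilds s4567 from s3456/s789A, 2 rebuilds s5678, 3 rebuilds s6789.
  for (int group = 0; group < 2; ++group) {  // two 4-sample groups per vector
    for (int i = 0; i < 4; ++i) {
      const int src = i + shift;
      out[4 * group + i] =
          src < 4 ? prev[4 * group + src] : next[4 * group + (src - 4)];
    }
  }
}
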
+ aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[0], s4567); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[1], s5678); + aom_tbl2x4_s16(s3456, s789A, merge_block_tbl.val[2], s6789); + + uint16x8_t d0 = + highbd_convolve8_8_2d_v(s0123, s4567, y_filter, shift, offset, max); + uint16x8_t d1 = + highbd_convolve8_8_2d_v(s1234, s5678, y_filter, shift, offset, max); + uint16x8_t d2 = + highbd_convolve8_8_2d_v(s2345, s6789, y_filter, shift, offset, max); + uint16x8_t d3 = + highbd_convolve8_8_2d_v(s3456, s789A, y_filter, shift, offset, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Prepare block for next iteration - re-using as much as possible. + // Shuffle everything up four rows. + s0123[0] = s4567[0]; + s0123[1] = s4567[1]; + s0123[2] = s4567[2]; + s0123[3] = s4567[3]; + s1234[0] = s5678[0]; + s1234[1] = s5678[1]; + s1234[2] = s5678[2]; + s1234[3] = s5678[3]; + s2345[0] = s6789[0]; + s2345[1] = s6789[1]; + s2345[2] = s6789[2]; + s2345[3] = s6789[3]; + s3456[0] = s789A[0]; + s3456[1] = s789A[1]; + s3456[2] = s789A[2]; + s3456[3] = s789A[3]; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +static INLINE uint16x4_t highbd_convolve4_4_2d_v(int16x8_t samples[2], + int16x8_t filter, + int32x4_t shift, + int64x2_t offset, + uint16x4_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + sum0123 = vshlq_s32(sum0123, shift); + + uint16x4_t res = vqmovun_s32(sum0123); + return vmin_u16(res, max); +} + +static INLINE uint16x8_t highbd_convolve4_8_2d_v(int16x8_t samples[4], + int16x8_t filter, + int32x4_t shift, + int64x2_t offset, + uint16x8_t max) { + int64x2_t sum01 = aom_svdot_lane_s16(offset, samples[0], filter, 0); + int64x2_t sum23 = aom_svdot_lane_s16(offset, samples[1], filter, 0); + int64x2_t sum45 = aom_svdot_lane_s16(offset, samples[2], filter, 0); + int64x2_t sum67 = aom_svdot_lane_s16(offset, samples[3], filter, 0); + + int32x4_t sum0123 = vcombine_s32(vmovn_s64(sum01), vmovn_s64(sum23)); + int32x4_t sum4567 = vcombine_s32(vmovn_s64(sum45), vmovn_s64(sum67)); + + sum0123 = vshlq_s32(sum0123, shift); + sum4567 = vshlq_s32(sum4567, shift); + + uint16x8_t res = vcombine_u16(vqmovun_s32(sum0123), vqmovun_s32(sum4567)); + return vminq_u16(res, max); +} + +void highbd_convolve_2d_sr_vert_4tap_sve2(const uint16_t *src, + ptrdiff_t src_stride, uint16_t *dst, + ptrdiff_t dst_stride, int width, + int height, const int16_t *filter_y, + ConvolveParams *conv_params, int bd, + const int y_offset) { + assert(w >= 4 && h >= 4); + const int64x2_t offset = vdupq_n_s64(y_offset); + const int32x4_t shift = vdupq_n_s32(-conv_params->round_1); + + const int16x8_t y_filter = + vcombine_s16(vld1_s16(filter_y + 2), vdup_n_s16(0)); + + if (width == 4) { + const uint16x4_t max = vdup_n_u16((1 << bd) - 1); + int16_t *s = (int16_t *)(src); + + int16x4_t s0, s1, s2; + load_s16_4x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x4_t s3, s4, s5, s6; + load_s16_4x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample permute + // required before computing the dot product. 
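
A scalar model of the layout transpose_concat_4x4 is assumed to produce here, inferred from how each 64-bit lane later feeds a four-tap aom_svdot_lane_s16 accumulation; this is an illustration rather than the library routine:

#include <stdint.h>

void transpose_concat_4x4_model(const int16_t s0[4], const int16_t s1[4],
                                const int16_t s2[4], const int16_t s3[4],
                                int16_t res[2][8]) {
  const int16_t *rows[4] = { s0, s1, s2, s3 };
  for (int col = 0; col < 4; ++col) {
    for (int row = 0; row < 4; ++row) {
      // Column 'col' of the 4x4 tile becomes four consecutive 16-bit elements
      // in 64-bit lane (col & 1) of output vector (col >> 1).
      res[col >> 1][4 * (col & 1) + row] = rows[row][col];
    }
  }
}
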
+ int16x8_t s0123[2], s1234[2], s2345[2], s3456[2]; + transpose_concat_4x4(s0, s1, s2, s3, s0123); + transpose_concat_4x4(s1, s2, s3, s4, s1234); + transpose_concat_4x4(s2, s3, s4, s5, s2345); + transpose_concat_4x4(s3, s4, s5, s6, s3456); + + uint16x4_t d0 = + highbd_convolve4_4_2d_v(s0123, y_filter, shift, offset, max); + uint16x4_t d1 = + highbd_convolve4_4_2d_v(s1234, y_filter, shift, offset, max); + uint16x4_t d2 = + highbd_convolve4_4_2d_v(s2345, y_filter, shift, offset, max); + uint16x4_t d3 = + highbd_convolve4_4_2d_v(s3456, y_filter, shift, offset, max); + + store_u16_4x4(dst, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + dst += 4 * dst_stride; + height -= 4; + } while (height != 0); + } else { + const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); + + do { + int h = height; + int16_t *s = (int16_t *)(src); + uint16_t *d = dst; + + int16x8_t s0, s1, s2; + load_s16_8x3(s, src_stride, &s0, &s1, &s2); + s += 3 * src_stride; + + do { + int16x8_t s3, s4, s5, s6; + load_s16_8x4(s, src_stride, &s3, &s4, &s5, &s6); + + // This operation combines a conventional transpose and the sample + // permute required before computing the dot product. + int16x8_t s0123[4], s1234[4], s2345[4], s3456[4]; + transpose_concat_8x4(s0, s1, s2, s3, s0123); + transpose_concat_8x4(s1, s2, s3, s4, s1234); + transpose_concat_8x4(s2, s3, s4, s5, s2345); + transpose_concat_8x4(s3, s4, s5, s6, s3456); + + uint16x8_t d0 = + highbd_convolve4_8_2d_v(s0123, y_filter, shift, offset, max); + uint16x8_t d1 = + highbd_convolve4_8_2d_v(s1234, y_filter, shift, offset, max); + uint16x8_t d2 = + highbd_convolve4_8_2d_v(s2345, y_filter, shift, offset, max); + uint16x8_t d3 = + highbd_convolve4_8_2d_v(s3456, y_filter, shift, offset, max); + + store_u16_8x4(d, dst_stride, d0, d1, d2, d3); + + // Shuffle everything up four rows. + s0 = s4; + s1 = s5; + s2 = s6; + + s += 4 * src_stride; + d += 4 * dst_stride; + h -= 4; + } while (h != 0); + src += 8; + dst += 8; + width -= 8; + } while (width != 0); + } +} + +void av1_highbd_convolve_2d_sr_sve2(const uint16_t *src, int src_stride, + uint16_t *dst, int dst_stride, int w, int h, + const InterpFilterParams *filter_params_x, + const InterpFilterParams *filter_params_y, + const int subpel_x_qn, + const int subpel_y_qn, + ConvolveParams *conv_params, int bd) { + if (w == 2 || h == 2) { + av1_highbd_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, subpel_x_qn, + subpel_y_qn, conv_params, bd); + return; + } + + DECLARE_ALIGNED(16, uint16_t, + im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * MAX_SB_SIZE]); + const int x_filter_taps = get_filter_tap(filter_params_x, subpel_x_qn); + const int y_filter_taps = get_filter_tap(filter_params_y, subpel_y_qn); + + if (x_filter_taps == 6 || y_filter_taps == 6) { + av1_highbd_convolve_2d_sr_neon(src, src_stride, dst, dst_stride, w, h, + filter_params_x, filter_params_y, + subpel_x_qn, subpel_y_qn, conv_params, bd); + return; + } + + const int clamped_x_taps = x_filter_taps < 4 ? 4 : x_filter_taps; + const int clamped_y_taps = y_filter_taps < 4 ? 
4 : y_filter_taps; + + const int im_stride = MAX_SB_SIZE; + const int vert_offset = clamped_y_taps / 2 - 1; + const int horiz_offset = clamped_x_taps / 2 - 1; + const int x_offset = (1 << (bd + FILTER_BITS - 1)); + const int y_offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0; + // The extra shim of (1 << (conv_params->round_1 - 1)) allows us to do a + // simple shift left instead of a rounding saturating shift left. + const int y_offset = + (1 << (conv_params->round_1 - 1)) - (1 << (y_offset_bits - 1)); + + const uint16_t *src_ptr = src - vert_offset * src_stride - horiz_offset; + + const int16_t *x_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_x, subpel_x_qn & SUBPEL_MASK); + const int16_t *y_filter_ptr = av1_get_interp_filter_subpel_kernel( + filter_params_y, subpel_y_qn & SUBPEL_MASK); + const int im_h = h + clamped_y_taps - 1; + + if (x_filter_taps > 8) { + highbd_convolve_2d_sr_horiz_12tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset); + + highbd_convolve_2d_sr_vert_12tap_sve2(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + return; + } + + if (x_filter_taps <= 4) { + highbd_convolve_2d_sr_horiz_4tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset); + } else { + highbd_convolve_2d_sr_horiz_8tap_sve2(src_ptr, src_stride, im_block, + im_stride, w, im_h, x_filter_ptr, + conv_params, x_offset); + } + + if (y_filter_taps <= 4) { + highbd_convolve_2d_sr_vert_4tap_sve2(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } else { + highbd_convolve_2d_sr_vert_8tap_sve2(im_block, im_stride, dst, dst_stride, + w, h, y_filter_ptr, conv_params, bd, + y_offset); + } +} diff -Nru aom-3.8.2/av1/common/arm/highbd_reconinter_neon.c aom-3.9.0/av1/common/arm/highbd_reconinter_neon.c --- aom-3.8.2/av1/common/arm/highbd_reconinter_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_reconinter_neon.c 2024-05-07 19:57:02.706000000 +0000 @@ -113,8 +113,7 @@ vget_low_u8(max_alpha)); } - store_u8_4x1(mask, m, 0); - store_u8_4x1(mask + w, m, 1); + store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -205,8 +204,7 @@ vget_low_u8(max_alpha)); } - store_u8_4x1(mask, m, 0); - store_u8_4x1(mask + w, m, 1); + store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; @@ -298,8 +296,7 @@ vget_low_u8(max_alpha)); } - store_u8_4x1(mask, m, 0); - store_u8_4x1(mask + w, m, 1); + store_u8x4_strided_x2(mask, w, m); src0 += 2 * src0_stride; src1 += 2 * src1_stride; diff -Nru aom-3.8.2/av1/common/arm/highbd_warp_plane_neon.c aom-3.9.0/av1/common/arm/highbd_warp_plane_neon.c --- aom-3.8.2/av1/common/arm/highbd_warp_plane_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_warp_plane_neon.c 2024-05-07 19:57:02.706000000 +0000 @@ -21,53 +21,10 @@ #include "av1/common/scale.h" #include "av1/common/warped_motion.h" #include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" -static INLINE int16x8_t load_filters_1(int ofs) { - const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); - - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; - return vld1q_s16(base + ofs0 * 8); -} - -static INLINE void load_filters_4(int16x8_t out[], int ofs, int stride) { - const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); - const int ofs1 = 
ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); - const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); - const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); - - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; - out[0] = vld1q_s16(base + ofs0 * 8); - out[1] = vld1q_s16(base + ofs1 * 8); - out[2] = vld1q_s16(base + ofs2 * 8); - out[3] = vld1q_s16(base + ofs3 * 8); -} - -static INLINE void load_filters_8(int16x8_t out[], int ofs, int stride) { - const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); - const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); - const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); - const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); - const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); - const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS); - const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); - const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); - - const int16_t *base = - (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; - out[0] = vld1q_s16(base + ofs0 * 8); - out[1] = vld1q_s16(base + ofs1 * 8); - out[2] = vld1q_s16(base + ofs2 * 8); - out[3] = vld1q_s16(base + ofs3 * 8); - out[4] = vld1q_s16(base + ofs4 * 8); - out[5] = vld1q_s16(base + ofs5 * 8); - out[6] = vld1q_s16(base + ofs6 * 8); - out[7] = vld1q_s16(base + ofs7 * 8); -} - -static INLINE int16x8_t warp_affine_horizontal_step_4x1_f4_neon( - int bd, int sx, int alpha, uint16x8x2_t in) { +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha) { int16x8_t f[4]; load_filters_4(f, sx, alpha); @@ -100,11 +57,8 @@ return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static INLINE int16x8_t warp_affine_horizontal_step_8x1_f8_neon( - int bd, int sx, int alpha, uint16x8x2_t in) { - const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; - const int offset_bits_horiz = bd + FILTER_BITS - 1; - +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha) { int16x8_t f[8]; load_filters_8(f, sx, alpha); @@ -145,6 +99,9 @@ int32x4_t m0123[] = { m0, m1, m2, m3 }; int32x4_t m4567[] = { m4, m5, m6, m7 }; + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); int32x4_t res1 = horizontal_add_4d_s32x4(m4567); res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); @@ -154,78 +111,94 @@ return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } -static INLINE void warp_affine_horizontal_neon(const uint16_t *ref, int width, - int height, int stride, - int p_width, int16_t alpha, - int16_t beta, int iy4, int sx4, - int ix4, int16x8_t tmp[], - int bd) { +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; - if (ix4 <= -7) { - for (int k = 0; k < 15; ++k) { - int iy = clamp(iy4 + k - 7, 0, height - 1); - int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + - ref[iy * stride] * (1 << (FILTER_BITS - round0)); - tmp[k] = vdupq_n_s16(dup_val); - } - return; - } else if (ix4 >= width + 6) { - for (int k = 0; k < 15; ++k) { - int iy = clamp(iy4 + k - 7, 0, height - 1); - int32_t dup_val = - (1 << (bd + FILTER_BITS - round0 - 1)) + - ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); - tmp[k] = vdupq_n_s16(dup_val); - } - return; - } - - for (int k = 0; k < 15; ++k) { - const int iy = clamp(iy4 + k - 7, 0, height - 1); - uint16x8x2_t in = vld1q_u16_x2(ref + iy * stride + ix4 - 7); - - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; - - const uint16_t k0[16] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 }; - const uint16x8_t indx0 = vld1q_u16(&k0[0]); - const uint16x8_t indx1 = vld1q_u16(&k0[8]); - - if (out_of_boundary_left >= 0) { - uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); - uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); - uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); - uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); - in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]); - in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]); - } - if (out_of_boundary_right >= 0) { - uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); - uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); - uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); - uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); - in.val[0] = vbslq_u16(mask0, vec_dup, in.val[0]); - in.val[1] = vbslq_u16(mask1, vec_dup, in.val[1]); - } - - const int sx = sx4 + beta * (k - 3); - if (p_width == 4) { - tmp[k] = 
warp_affine_horizontal_step_4x1_f4_neon(bd, sx, alpha, in); - } else { - tmp[k] = warp_affine_horizontal_step_8x1_f8_neon(bd, sx, alpha, in); - } - } + int32x4_t res = horizontal_add_4d_s32x4(m0123); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); } -static INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, int bd) { - const int limit = (1 << bd) - 1; - return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int32x4_t m0 = vmull_s16(vget_low_s16(f), vget_low_s16(rv0)); + m0 = vmlal_s16(m0, vget_high_s16(f), vget_high_s16(rv0)); + int32x4_t m1 = vmull_s16(vget_low_s16(f), vget_low_s16(rv1)); + m1 = vmlal_s16(m1, vget_high_s16(f), vget_high_s16(rv1)); + int32x4_t m2 = vmull_s16(vget_low_s16(f), vget_low_s16(rv2)); + m2 = vmlal_s16(m2, vget_high_s16(f), vget_high_s16(rv2)); + int32x4_t m3 = vmull_s16(vget_low_s16(f), vget_low_s16(rv3)); + m3 = vmlal_s16(m3, vget_high_s16(f), vget_high_s16(rv3)); + int32x4_t m4 = vmull_s16(vget_low_s16(f), vget_low_s16(rv4)); + m4 = vmlal_s16(m4, vget_high_s16(f), vget_high_s16(rv4)); + int32x4_t m5 = vmull_s16(vget_low_s16(f), vget_low_s16(rv5)); + m5 = vmlal_s16(m5, vget_high_s16(f), vget_high_s16(rv5)); + int32x4_t m6 = vmull_s16(vget_low_s16(f), vget_low_s16(rv6)); + m6 = vmlal_s16(m6, vget_high_s16(f), vget_high_s16(rv6)); + int32x4_t m7 = vmull_s16(vget_low_s16(f), vget_low_s16(rv7)); + m7 = vmlal_s16(m7, vget_high_s16(f), vget_high_s16(rv7)); + + int32x4_t m0123[] = { m0, m1, m2, m3 }; + int32x4_t m4567[] = { m4, m5, m6, m7 }; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = horizontal_add_4d_s32x4(m0123); + int32x4_t res1 = horizontal_add_4d_s32x4(m4567); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); } -static INLINE int32x4_t -warp_affine_vertical_filter_4x1_f1_neon(const int16x8_t *tmp, int sy) { +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, + int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); @@ -241,8 +214,8 @@ return m0123; } -static INLINE int32x4x2_t -warp_affine_vertical_filter_8x1_f1_neon(const int16x8_t *tmp, int sy) { +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, + int sy) { const int16x8_t f = load_filters_1(sy); const int16x4_t f0123 = vget_low_s16(f); const int16x4_t f4567 = vget_high_s16(f); @@ -267,8 +240,8 @@ return (int32x4x2_t){ { m0123, m4567 } }; } -static INLINE int32x4_t warp_affine_vertical_filter_4x1_f4_neon( - const int16x8_t *tmp, int sy, int gamma) { +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, + int sy, int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), @@ -291,8 +264,8 @@ return horizontal_add_4d_s32x4(m0123); } -static INLINE int32x4x2_t warp_affine_vertical_filter_8x1_f8_neon( - const int16x8_t *tmp, int sy, int gamma) { +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, + int sy, int gamma) { int16x8_t s0 = tmp[0]; int16x8_t s1 = tmp[1]; int16x8_t s2 = tmp[2]; @@ -332,165 +305,6 @@ return ret; } -static INLINE void warp_affine_vertical_step_4x1_f4_neon( - uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, - bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, - int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { - int32x4_t sum0 = - gamma == 0 ? warp_affine_vertical_filter_4x1_f1_neon(tmp, sy) - : warp_affine_vertical_filter_4x1_f4_neon(tmp, sy, gamma); - - const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; - const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; - - sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); - - uint16_t *dst16 = &pred[i * p_stride + j]; - - if (!is_compound) { - const int reduce_bits_vert = 2 * FILTER_BITS - round0; - sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); - - const int res_sub_const = (1 << (bd - 1)) + (1 << bd); - sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); - uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); - vst1_u16(dst16, res0); - return; - } - - sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); - - uint16_t *p = &dst[i * dst_stride + j]; - - if (!do_average) { - vst1_u16(p, vqmovun_s32(sum0)); - return; - } - - uint16x4_t p0 = vld1_u16(p); - int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); - if (use_dist_wtd_comp_avg) { - p_vec0 = vmulq_n_s32(p_vec0, fwd); - p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); - p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); - } else { - p_vec0 = vhaddq_s32(p_vec0, sum0); - } - - const int offset_bits = bd + 2 * FILTER_BITS - round0; - const int round1 = COMPOUND_ROUND1_BITS; - const int res_sub_const = - (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); - const int round_bits = 2 * FILTER_BITS - round0 - round1; - - p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); - p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); - uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); - vst1_u16(dst16, res0); -} - -static INLINE void warp_affine_vertical_step_8x1_f8_neon( - uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, - bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, - int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { - int32x4x2_t sums = - gamma == 0 ? warp_affine_vertical_filter_8x1_f1_neon(tmp, sy) - : warp_affine_vertical_filter_8x1_f8_neon(tmp, sy, gamma); - int32x4_t sum0 = sums.val[0]; - int32x4_t sum1 = sums.val[1]; - - const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; - const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; - - sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); - sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); - - uint16_t *dst16 = &pred[i * p_stride + j]; - - if (!is_compound) { - const int reduce_bits_vert = 2 * FILTER_BITS - round0; - sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); - sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); - - const int res_sub_const = (1 << (bd - 1)) + (1 << bd); - sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); - sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); - uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); - uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); - vst1_u16(dst16, res0); - vst1_u16(dst16 + 4, res1); - return; - } - - sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); - sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); - - uint16_t *p = &dst[i * dst_stride + j]; - - if (!do_average) { - vst1_u16(p, vqmovun_s32(sum0)); - vst1_u16(p + 4, vqmovun_s32(sum1)); - return; - } - - uint16x8_t p0 = vld1q_u16(p); - int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); - int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); - if (use_dist_wtd_comp_avg) { - p_vec0 = vmulq_n_s32(p_vec0, fwd); - p_vec1 = vmulq_n_s32(p_vec1, fwd); - p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); - p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd); - p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); - p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); - } else { - p_vec0 = vhaddq_s32(p_vec0, sum0); - p_vec1 = vhaddq_s32(p_vec1, sum1); - } - - const int offset_bits = bd + 2 * FILTER_BITS - round0; - const int round1 = COMPOUND_ROUND1_BITS; - const int res_sub_const = - (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); - const int round_bits = 2 * FILTER_BITS - round0 - round1; - - p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); - p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const)); - - p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); - p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); - uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); - uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); - vst1_u16(dst16, res0); - vst1_u16(dst16 + 4, res1); -} - -static INLINE void warp_affine_vertical_neon( - uint16_t *pred, int p_width, int p_height, int p_stride, int bd, - uint16_t *dst, int dst_stride, bool is_compound, bool do_average, - bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, - const int16x8_t *tmp, int i, int sy4, int j) { - int limit_height = p_height > 4 ? 
8 : 4; - - if (p_width > 4) { - // p_width == 8 - for (int k = 0; k < limit_height; ++k) { - int sy = sy4 + delta * k; - warp_affine_vertical_step_8x1_f8_neon( - pred, p_stride, bd, dst, dst_stride, is_compound, do_average, - use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); - } - } else { - // p_width == 4 - for (int k = 0; k < limit_height; ++k) { - int sy = sy4 + delta * k; - warp_affine_vertical_step_4x1_f4_neon( - pred, p_stride, bd, dst, dst_stride, is_compound, do_average, - use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); - } - } -} - void av1_highbd_warp_affine_neon(const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, @@ -498,63 +312,8 @@ int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { - uint16_t *const dst = conv_params->dst; - const int dst_stride = conv_params->dst_stride; - const bool is_compound = conv_params->is_compound; - const bool do_average = conv_params->do_average; - const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - const int fwd = conv_params->fwd_offset; - const int bwd = conv_params->bck_offset; - - assert(IMPLIES(is_compound, dst != NULL)); - - for (int i = 0; i < p_height; i += 8) { - for (int j = 0; j < p_width; j += 8) { - // Calculate the center of this 8x8 block, - // project to luma coordinates (if in a subsampled chroma plane), - // apply the affine transformation, - // then convert back to the original coordinates (if necessary) - const int32_t src_x = (j + 4 + p_col) << subsampling_x; - const int32_t src_y = (i + 4 + p_row) << subsampling_y; - const int64_t dst_x = - (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; - const int64_t dst_y = - (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; - const int64_t x4 = dst_x >> subsampling_x; - const int64_t y4 = dst_y >> subsampling_y; - - const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); - int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); - int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); - - sx4 += alpha * (-4) + beta * (-4); - sy4 += gamma * (-4) + delta * (-4); - - sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); - - // Each horizontal filter result is formed by the sum of up to eight - // multiplications by filter values and then a shift. Although both the - // inputs and filters are loaded as int16, the input data is at most bd - // bits and the filters are at most 8 bits each. Additionally since we - // know all possible filter values we know that the sum of absolute - // filter values will fit in at most 9 bits. With this in mind we can - // conclude that the sum of each filter application will fit in bd + 9 - // bits. The shift following the summation is ROUND0_BITS (which is 3), - // +2 for 12-bit, which gives us a final storage of: - // bd == 8: ( 8 + 9) - 3 => 14 bits - // bd == 10: (10 + 9) - 3 => 16 bits - // bd == 12: (12 + 9) - 5 => 16 bits - // So it is safe to use int16x8_t as the intermediate storage type here. 
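
The projection arithmetic in the block above maps each 8x8 block centre through the affine matrix and then splits the result into an integer sample position and a sub-pel remainder. A small scalar illustration of that split, assuming WARPEDMODEL_PREC_BITS == 16 and using made-up matrix values:

#include <stdint.h>
#include <stdio.h>

#define WARPEDMODEL_PREC_BITS 16  // assumed value of the libaom constant

int main(void) {
  // Hypothetical affine row: x' = mat2 * x + mat3 * y + mat0, in 1/2^16 pel.
  const int32_t mat0 = 5 << WARPEDMODEL_PREC_BITS;
  const int32_t mat2 = (1 << WARPEDMODEL_PREC_BITS) + 300;
  const int32_t mat3 = -120;
  const int32_t src_x = 36, src_y = 12;  // block centre in source coordinates
  const int64_t x4 = (int64_t)mat2 * src_x + (int64_t)mat3 * src_y + mat0;
  const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
  const int32_t sx4 = (int32_t)(x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1));
  printf("integer position %d, sub-pel phase %d/65536\n", ix4, sx4);
  return 0;
}
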
- int16x8_t tmp[15]; - - warp_affine_horizontal_neon(ref, width, height, stride, p_width, alpha, - beta, iy4, sx4, ix4, tmp, bd); - warp_affine_vertical_neon(pred, p_width, p_height, p_stride, bd, dst, - dst_stride, is_compound, do_average, - use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, - tmp, i, sy4, j); - } - } + highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); } diff -Nru aom-3.8.2/av1/common/arm/highbd_warp_plane_neon.h aom-3.9.0/av1/common/arm/highbd_warp_plane_neon.h --- aom-3.8.2/av1/common/arm/highbd_warp_plane_neon.h 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_warp_plane_neon.h 2024-05-07 19:57:02.707000000 +0000 @@ -0,0 +1,430 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ +#ifndef AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ +#define AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ + +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha); + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha); + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx); + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, int sx); + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, + int sy); + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, + int sy); + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, + int sy, int gamma); + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, + int sy, int gamma); + +static AOM_FORCE_INLINE int16x8_t load_filters_1(int ofs) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + return vld1q_s16(base + ofs0 * 8); +} + +static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int ofs, + int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); +} + +static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int ofs, + 
int stride) { + const int ofs0 = ROUND_POWER_OF_TWO(ofs + stride * 0, WARPEDDIFF_PREC_BITS); + const int ofs1 = ROUND_POWER_OF_TWO(ofs + stride * 1, WARPEDDIFF_PREC_BITS); + const int ofs2 = ROUND_POWER_OF_TWO(ofs + stride * 2, WARPEDDIFF_PREC_BITS); + const int ofs3 = ROUND_POWER_OF_TWO(ofs + stride * 3, WARPEDDIFF_PREC_BITS); + const int ofs4 = ROUND_POWER_OF_TWO(ofs + stride * 4, WARPEDDIFF_PREC_BITS); + const int ofs5 = ROUND_POWER_OF_TWO(ofs + stride * 5, WARPEDDIFF_PREC_BITS); + const int ofs6 = ROUND_POWER_OF_TWO(ofs + stride * 6, WARPEDDIFF_PREC_BITS); + const int ofs7 = ROUND_POWER_OF_TWO(ofs + stride * 7, WARPEDDIFF_PREC_BITS); + + const int16_t *base = + (int16_t *)av1_warped_filter + WARPEDPIXEL_PREC_SHIFTS * 8; + out[0] = vld1q_s16(base + ofs0 * 8); + out[1] = vld1q_s16(base + ofs1 * 8); + out[2] = vld1q_s16(base + ofs2 * 8); + out[3] = vld1q_s16(base + ofs3 * 8); + out[4] = vld1q_s16(base + ofs4 * 8); + out[5] = vld1q_s16(base + ofs5 * 8); + out[6] = vld1q_s16(base + ofs6 * 8); + out[7] = vld1q_s16(base + ofs7 * 8); +} + +static AOM_FORCE_INLINE uint16x4_t clip_pixel_highbd_vec(int32x4_t val, + int bd) { + const int limit = (1 << bd) - 1; + return vqmovun_s32(vminq_s32(val, vdupq_n_s32(limit))); +} + +static AOM_FORCE_INLINE void warp_affine_horizontal(const uint16_t *ref, + int width, int height, + int stride, int p_width, + int16_t alpha, int16_t beta, + int iy4, int sx4, int ix4, + int16x8_t tmp[], int bd) { + const int round0 = (bd == 12) ? ROUND0_BITS + 2 : ROUND0_BITS; + + if (ix4 <= -7) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } else if (ix4 >= width + 6) { + for (int k = 0; k < 15; ++k) { + int iy = clamp(iy4 + k - 7, 0, height - 1); + int32_t dup_val = + (1 << (bd + FILTER_BITS - round0 - 1)) + + ref[iy * stride + (width - 1)] * (1 << (FILTER_BITS - round0)); + tmp[k] = vdupq_n_s16(dup_val); + } + return; + } + + static const uint16_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint16x8_t indx0 = vld1q_u16(kIotaArr); + const uint16x8_t indx1 = vld1q_u16(kIotaArr + 8); + + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT(fn, ...) 
\ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + \ + if (out_of_boundary_left >= 0) { \ + uint16x8_t cmp_vec = vdupq_n_u16(out_of_boundary_left); \ + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride]); \ + uint16x8_t mask0 = vcleq_u16(indx0, cmp_vec); \ + uint16x8_t mask1 = vcleq_u16(indx1, cmp_vec); \ + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ + } \ + if (out_of_boundary_right >= 0) { \ + uint16x8_t cmp_vec = vdupq_n_u16(15 - out_of_boundary_right); \ + uint16x8_t vec_dup = vdupq_n_u16(ref[iy * stride + width - 1]); \ + uint16x8_t mask0 = vcgeq_u16(indx0, cmp_vec); \ + uint16x8_t mask1 = vcgeq_u16(indx1, cmp_vec); \ + src_1.val[0] = vbslq_u16(mask0, vec_dup, src_1.val[0]); \ + src_1.val[1] = vbslq_u16(mask1, vec_dup, src_1.val[1]); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < 15; ++k) { \ + const int iy = clamp(iy4 + k - 7, 0, height - 1); \ + uint16x8x2_t src_1 = vld1q_u16_x2(ref + iy * stride + ix4 - 7); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) + + if (p_width == 4) { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_4x1_f4, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } else { + if (beta == 0) { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, sx4); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, sx4, alpha); + } + } else { + if (alpha == 0) { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f1, bd, + (sx4 + beta * (k - 3))); + } else { + APPLY_HORIZONTAL_SHIFT(highbd_horizontal_filter_8x1_f8, bd, + (sx4 + beta * (k - 3)), alpha); + } + } + } +} + +static AOM_FORCE_INLINE void highbd_vertical_filter_4x1_f4( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4_t sum0 = gamma == 0 ? vertical_filter_4x1_f1(tmp, sy) + : vertical_filter_4x1_f4(tmp, sy, gamma); + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + vst1_u16(dst16, res0); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + return; + } + + uint16x4_t p0 = vld1_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(p0)); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + vst1_u16(dst16, res0); +} + +static AOM_FORCE_INLINE void highbd_vertical_filter_8x1_f8( + uint16_t *pred, int p_stride, int bd, uint16_t *dst, int dst_stride, + bool is_compound, bool do_average, bool use_dist_wtd_comp_avg, int fwd, + int bwd, int16_t gamma, const int16x8_t *tmp, int i, int sy, int j) { + int32x4x2_t sums = gamma == 0 ? vertical_filter_8x1_f1(tmp, sy) + : vertical_filter_8x1_f8(tmp, sy, gamma); + int32x4_t sum0 = sums.val[0]; + int32x4_t sum1 = sums.val[1]; + + const int round0 = (bd == 12) ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_vert = bd + 2 * FILTER_BITS - round0; + + sum0 = vaddq_s32(sum0, vdupq_n_s32(1 << offset_bits_vert)); + sum1 = vaddq_s32(sum1, vdupq_n_s32(1 << offset_bits_vert)); + + uint16_t *dst16 = &pred[i * p_stride + j]; + + if (!is_compound) { + const int reduce_bits_vert = 2 * FILTER_BITS - round0; + sum0 = vrshlq_s32(sum0, vdupq_n_s32(-reduce_bits_vert)); + sum1 = vrshlq_s32(sum1, vdupq_n_s32(-reduce_bits_vert)); + + const int res_sub_const = (1 << (bd - 1)) + (1 << bd); + sum0 = vsubq_s32(sum0, vdupq_n_s32(res_sub_const)); + sum1 = vsubq_s32(sum1, vdupq_n_s32(res_sub_const)); + uint16x4_t res0 = clip_pixel_highbd_vec(sum0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(sum1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); + return; + } + + sum0 = vrshrq_n_s32(sum0, COMPOUND_ROUND1_BITS); + sum1 = vrshrq_n_s32(sum1, COMPOUND_ROUND1_BITS); + + uint16_t *p = &dst[i * dst_stride + j]; + + if (!do_average) { + vst1_u16(p, vqmovun_s32(sum0)); + vst1_u16(p + 4, vqmovun_s32(sum1)); + return; + } + + uint16x8_t p0 = vld1q_u16(p); + int32x4_t p_vec0 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(p0))); + int32x4_t p_vec1 = vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(p0))); + if (use_dist_wtd_comp_avg) { + p_vec0 = vmulq_n_s32(p_vec0, fwd); + p_vec1 = vmulq_n_s32(p_vec1, fwd); + p_vec0 = vmlaq_n_s32(p_vec0, sum0, bwd); + p_vec1 = vmlaq_n_s32(p_vec1, sum1, bwd); + p_vec0 = vshrq_n_s32(p_vec0, DIST_PRECISION_BITS); + p_vec1 = vshrq_n_s32(p_vec1, DIST_PRECISION_BITS); + } else { + p_vec0 = vhaddq_s32(p_vec0, sum0); + p_vec1 = vhaddq_s32(p_vec1, sum1); + } + + const int offset_bits = bd + 2 * FILTER_BITS - round0; + const int round1 = COMPOUND_ROUND1_BITS; + const int res_sub_const = + (1 << (offset_bits - round1)) + (1 << (offset_bits - round1 - 1)); + const int round_bits = 2 * FILTER_BITS - round0 - round1; + + p_vec0 = vsubq_s32(p_vec0, vdupq_n_s32(res_sub_const)); + p_vec1 = vsubq_s32(p_vec1, vdupq_n_s32(res_sub_const)); + + p_vec0 = vrshlq_s32(p_vec0, vdupq_n_s32(-round_bits)); + p_vec1 = vrshlq_s32(p_vec1, vdupq_n_s32(-round_bits)); + uint16x4_t res0 = clip_pixel_highbd_vec(p_vec0, bd); + uint16x4_t res1 = clip_pixel_highbd_vec(p_vec1, bd); + vst1_u16(dst16, res0); + vst1_u16(dst16 + 4, res1); +} + +static AOM_FORCE_INLINE void warp_affine_vertical( + uint16_t *pred, int p_width, int p_height, int p_stride, int bd, + uint16_t *dst, int dst_stride, bool is_compound, bool do_average, + bool use_dist_wtd_comp_avg, int fwd, int bwd, int16_t gamma, int16_t delta, + const int16x8_t *tmp, int i, int sy4, int j) { + int limit_height = p_height > 4 ? 
8 : 4; + + if (p_width > 4) { + // p_width == 8 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_8x1_f8( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } else { + // p_width == 4 + for (int k = 0; k < limit_height; ++k) { + int sy = sy4 + delta * k; + highbd_vertical_filter_4x1_f4( + pred, p_stride, bd, dst, dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, tmp + k, i + k, sy, j); + } + } +} + +static AOM_FORCE_INLINE void highbd_warp_affine_common( + const int32_t *mat, const uint16_t *ref, int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, int p_width, int p_height, + int p_stride, int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, + int16_t delta) { + uint16_t *const dst = conv_params->dst; + const int dst_stride = conv_params->dst_stride; + const bool is_compound = conv_params->is_compound; + const bool do_average = conv_params->do_average; + const bool use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; + const int fwd = conv_params->fwd_offset; + const int bwd = conv_params->bck_offset; + + assert(IMPLIES(is_compound, dst != NULL)); + + for (int i = 0; i < p_height; i += 8) { + for (int j = 0; j < p_width; j += 8) { + // Calculate the center of this 8x8 block, + // project to luma coordinates (if in a subsampled chroma plane), + // apply the affine transformation, + // then convert back to the original coordinates (if necessary) + const int32_t src_x = (j + 4 + p_col) << subsampling_x; + const int32_t src_y = (i + 4 + p_row) << subsampling_y; + const int64_t dst_x = + (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0]; + const int64_t dst_y = + (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1]; + const int64_t x4 = dst_x >> subsampling_x; + const int64_t y4 = dst_y >> subsampling_y; + + const int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS); + int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + const int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS); + int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1); + + sx4 += alpha * (-4) + beta * (-4); + sy4 += gamma * (-4) + delta * (-4); + + sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1); + + // Each horizontal filter result is formed by the sum of up to eight + // multiplications by filter values and then a shift. Although both the + // inputs and filters are loaded as int16, the input data is at most bd + // bits and the filters are at most 8 bits each. Additionally since we + // know all possible filter values we know that the sum of absolute + // filter values will fit in at most 9 bits. With this in mind we can + // conclude that the sum of each filter application will fit in bd + 9 + // bits. The shift following the summation is ROUND0_BITS (which is 3), + // +2 for 12-bit, which gives us a final storage of: + // bd == 8: ( 8 + 9) - 3 => 14 bits + // bd == 10: (10 + 9) - 3 => 16 bits + // bd == 12: (12 + 9) - 5 => 16 bits + // So it is safe to use int16x8_t as the intermediate storage type here. 
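
A worked restatement of the bound the comment above derives, using only the figures it quotes (ROUND0_BITS == 3, absolute filter values summing to under 2^9); illustrative arithmetic rather than library code:

#include <stdio.h>

int main(void) {
  const int kRound0Bits = 3;     // ROUND0_BITS, per the comment above
  const int kFilterSumBits = 9;  // sum of absolute tap values fits in 9 bits
  const int bds[] = { 8, 10, 12 };
  for (int i = 0; i < 3; ++i) {
    const int bd = bds[i];
    const int round0 = bd == 12 ? kRound0Bits + 2 : kRound0Bits;
    printf("bd=%2d: (%d + %d) - %d = %d bits, within the 16-bit lanes of int16x8_t\n",
           bd, bd, kFilterSumBits, round0, bd + kFilterSumBits - round0);
  }
  return 0;
}
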
+ int16x8_t tmp[15]; + + warp_affine_horizontal(ref, width, height, stride, p_width, alpha, beta, + iy4, sx4, ix4, tmp, bd); + warp_affine_vertical(pred, p_width, p_height, p_stride, bd, dst, + dst_stride, is_compound, do_average, + use_dist_wtd_comp_avg, fwd, bwd, gamma, delta, tmp, + i, sy4, j); + } + } +} + +#endif // AOM_AV1_COMMON_ARM_HIGHBD_WARP_PLANE_NEON_H_ diff -Nru aom-3.8.2/av1/common/arm/highbd_warp_plane_sve.c aom-3.9.0/av1/common/arm/highbd_warp_plane_sve.c --- aom-3.8.2/av1/common/arm/highbd_warp_plane_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/common/arm/highbd_warp_plane_sve.c 2024-05-07 19:57:02.708000000 +0000 @@ -0,0 +1,295 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/transpose_neon.h" +#include "aom_ports/mem.h" +#include "av1/common/scale.h" +#include "av1/common/warped_motion.h" +#include "config/av1_rtcd.h" +#include "highbd_warp_plane_neon.h" + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f4(uint16x8x2_t in, int bd, int sx, int alpha) { + int16x8_t f[4]; + load_filters_4(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_8x1_f8(uint16x8x2_t in, int bd, int sx, int alpha) { + int16x8_t f[8]; + load_filters_8(f, sx, alpha); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f[5]); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_4x1_f1(uint16x8x2_t in, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + + const int round0 = bd == 12 ? 
ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + res = vaddq_s32(res, vdupq_n_s32(1 << offset_bits_horiz)); + res = vrshlq_s32(res, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res), vdup_n_s16(0)); +} + +static AOM_FORCE_INLINE int16x8_t +highbd_horizontal_filter_8x1_f1(uint16x8x2_t in, int bd, int sx) { + int16x8_t f = load_filters_1(sx); + + int16x8_t rv0 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 0); + int16x8_t rv1 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 1); + int16x8_t rv2 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 2); + int16x8_t rv3 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 3); + int16x8_t rv4 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 4); + int16x8_t rv5 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 5); + int16x8_t rv6 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 6); + int16x8_t rv7 = vextq_s16(vreinterpretq_s16_u16(in.val[0]), + vreinterpretq_s16_u16(in.val[1]), 7); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), rv0, f); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), rv1, f); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), rv2, f); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), rv3, f); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), rv4, f); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), rv5, f); + int64x2_t m6 = aom_sdotq_s16(vdupq_n_s64(0), rv6, f); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), rv7, f); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + const int round0 = bd == 12 ? ROUND0_BITS + 2 : ROUND0_BITS; + const int offset_bits_horiz = bd + FILTER_BITS - 1; + + int32x4_t res0 = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + int32x4_t res1 = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + res0 = vaddq_s32(res0, vdupq_n_s32(1 << offset_bits_horiz)); + res1 = vaddq_s32(res1, vdupq_n_s32(1 << offset_bits_horiz)); + res0 = vrshlq_s32(res0, vdupq_n_s32(-round0)); + res1 = vrshlq_s32(res1, vdupq_n_s32(-round0)); + return vcombine_s16(vmovn_s32(res0), vmovn_s32(res1)); +} + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f1(const int16x8_t *tmp, + int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + // No benefit to using SDOT here, the cost of rearrangement is too high. 
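+ // Instead, accumulate with widening multiply-accumulates by lane, one step + // per tap of the 8-tap filter.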
+ int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + return m0123; +} + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f1(const int16x8_t *tmp, + int sy) { + const int16x8_t f = load_filters_1(sy); + const int16x4_t f0123 = vget_low_s16(f); + const int16x4_t f4567 = vget_high_s16(f); + + // No benefit to using SDOT here, the cost of rearrangement is too high. + int32x4_t m0123 = vmull_lane_s16(vget_low_s16(tmp[0]), f0123, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[1]), f0123, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[2]), f0123, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[3]), f0123, 3); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[4]), f4567, 0); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[5]), f4567, 1); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[6]), f4567, 2); + m0123 = vmlal_lane_s16(m0123, vget_low_s16(tmp[7]), f4567, 3); + + int32x4_t m4567 = vmull_lane_s16(vget_high_s16(tmp[0]), f0123, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[1]), f0123, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[2]), f0123, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[3]), f0123, 3); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[4]), f4567, 0); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[5]), f4567, 1); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[6]), f4567, 2); + m4567 = vmlal_lane_s16(m4567, vget_high_s16(tmp[7]), f4567, 3); + return (int32x4x2_t){ { m0123, m4567 } }; +} + +static AOM_FORCE_INLINE int32x4_t vertical_filter_4x1_f4(const int16x8_t *tmp, + int sy, int gamma) { + int16x8_t s0, s1, s2, s3; + transpose_elems_s16_4x8( + vget_low_s16(tmp[0]), vget_low_s16(tmp[1]), vget_low_s16(tmp[2]), + vget_low_s16(tmp[3]), vget_low_s16(tmp[4]), vget_low_s16(tmp[5]), + vget_low_s16(tmp[6]), vget_low_s16(tmp[7]), &s0, &s1, &s2, &s3); + + int16x8_t f[4]; + load_filters_4(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + return vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); +} + +static AOM_FORCE_INLINE int32x4x2_t vertical_filter_8x1_f8(const int16x8_t *tmp, + int sy, int gamma) { + int16x8_t s0 = tmp[0]; + int16x8_t s1 = tmp[1]; + int16x8_t s2 = tmp[2]; + int16x8_t s3 = tmp[3]; + int16x8_t s4 = tmp[4]; + int16x8_t s5 = tmp[5]; + int16x8_t s6 = tmp[6]; + int16x8_t s7 = tmp[7]; + transpose_elems_inplace_s16_8x8(&s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7); + + int16x8_t f[8]; + load_filters_8(f, sy, gamma); + + int64x2_t m0 = aom_sdotq_s16(vdupq_n_s64(0), s0, f[0]); + int64x2_t m1 = aom_sdotq_s16(vdupq_n_s64(0), s1, f[1]); + int64x2_t m2 = aom_sdotq_s16(vdupq_n_s64(0), s2, f[2]); + int64x2_t m3 = aom_sdotq_s16(vdupq_n_s64(0), s3, f[3]); + int64x2_t m4 = aom_sdotq_s16(vdupq_n_s64(0), s4, f[4]); + int64x2_t m5 = aom_sdotq_s16(vdupq_n_s64(0), s5, f[5]); + int64x2_t m6 = 
aom_sdotq_s16(vdupq_n_s64(0), s6, f[6]); + int64x2_t m7 = aom_sdotq_s16(vdupq_n_s64(0), s7, f[7]); + + int64x2_t m01 = vpaddq_s64(m0, m1); + int64x2_t m23 = vpaddq_s64(m2, m3); + int64x2_t m45 = vpaddq_s64(m4, m5); + int64x2_t m67 = vpaddq_s64(m6, m7); + + int32x4x2_t ret; + ret.val[0] = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); + ret.val[1] = vcombine_s32(vmovn_s64(m45), vmovn_s64(m67)); + return ret; +} + +void av1_highbd_warp_affine_sve(const int32_t *mat, const uint16_t *ref, + int width, int height, int stride, + uint16_t *pred, int p_col, int p_row, + int p_width, int p_height, int p_stride, + int subsampling_x, int subsampling_y, int bd, + ConvolveParams *conv_params, int16_t alpha, + int16_t beta, int16_t gamma, int16_t delta) { + highbd_warp_affine_common(mat, ref, width, height, stride, pred, p_col, p_row, + p_width, p_height, p_stride, subsampling_x, + subsampling_y, bd, conv_params, alpha, beta, gamma, + delta); +} diff -Nru aom-3.8.2/av1/common/arm/reconintra_neon.c aom-3.9.0/av1/common/arm/reconintra_neon.c --- aom-3.8.2/av1/common/arm/reconintra_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/reconintra_neon.c 2024-05-07 19:57:02.709000000 +0000 @@ -15,144 +15,197 @@ #include "config/aom_config.h" #include "aom/aom_integer.h" +#include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" #define MAX_UPSAMPLE_SZ 16 -DECLARE_ALIGNED(16, const int8_t, - av1_filter_intra_taps_neon[FILTER_INTRA_MODES][8][8]) = { +// These kernels are a transposed version of those defined in reconintra.c, +// with the absolute value of the negatives taken in the top row. +DECLARE_ALIGNED(16, const uint8_t, + av1_filter_intra_taps_neon[FILTER_INTRA_MODES][7][8]) = { + // clang-format off { - { -6, 0, 0, 0, -5, 10, 0, 0 }, - { 10, 0, 12, 0, 2, 0, 9, 0 }, - { -3, 1, 0, 0, -3, 1, 10, 0 }, - { 1, 10, 7, 0, 1, 2, 5, 0 }, - { -4, 0, 0, 12, -3, 6, 0, 9 }, - { 6, 0, 2, 0, 2, 0, 2, 0 }, - { -3, 2, 0, 7, -3, 2, 6, 5 }, - { 2, 6, 2, 0, 1, 2, 3, 0 }, + { 6, 5, 3, 3, 4, 3, 3, 3 }, + { 10, 2, 1, 1, 6, 2, 2, 1 }, + { 0, 10, 1, 1, 0, 6, 2, 2 }, + { 0, 0, 10, 2, 0, 0, 6, 2 }, + { 0, 0, 0, 10, 0, 0, 0, 6 }, + { 12, 9, 7, 5, 2, 2, 2, 3 }, + { 0, 0, 0, 0, 12, 9, 7, 5 } }, { - { -10, 0, 0, 0, -6, 16, 0, 0 }, - { 16, 0, 10, 0, 0, 0, 6, 0 }, - { -4, 0, 0, 0, -2, 0, 16, 0 }, - { 0, 16, 4, 0, 0, 0, 2, 0 }, - { -10, 0, 0, 10, -6, 16, 0, 6 }, - { 16, 0, 0, 0, 0, 0, 0, 0 }, - { -4, 0, 0, 4, -2, 0, 16, 2 }, - { 0, 16, 0, 0, 0, 0, 0, 0 }, + { 10, 6, 4, 2, 10, 6, 4, 2 }, + { 16, 0, 0, 0, 16, 0, 0, 0 }, + { 0, 16, 0, 0, 0, 16, 0, 0 }, + { 0, 0, 16, 0, 0, 0, 16, 0 }, + { 0, 0, 0, 16, 0, 0, 0, 16 }, + { 10, 6, 4, 2, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 10, 6, 4, 2 } }, { - { -8, 0, 0, 0, -8, 8, 0, 0 }, - { 8, 0, 16, 0, 0, 0, 16, 0 }, - { -8, 0, 0, 0, -8, 0, 8, 0 }, - { 0, 8, 16, 0, 0, 0, 16, 0 }, - { -4, 0, 0, 16, -4, 4, 0, 16 }, - { 4, 0, 0, 0, 0, 0, 0, 0 }, - { -4, 0, 0, 16, -4, 0, 4, 16 }, - { 0, 4, 0, 0, 0, 0, 0, 0 }, + { 8, 8, 8, 8, 4, 4, 4, 4 }, + { 8, 0, 0, 0, 4, 0, 0, 0 }, + { 0, 8, 0, 0, 0, 4, 0, 0 }, + { 0, 0, 8, 0, 0, 0, 4, 0 }, + { 0, 0, 0, 8, 0, 0, 0, 4 }, + { 16, 16, 16, 16, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 16, 16, 16, 16 } }, { - { -2, 0, 0, 0, -1, 8, 0, 0 }, - { 8, 0, 10, 0, 3, 0, 6, 0 }, - { -1, 3, 0, 0, 0, 2, 8, 0 }, - { 2, 8, 4, 0, 1, 3, 2, 0 }, - { -1, 0, 0, 10, -1, 4, 0, 6 }, - { 4, 0, 3, 0, 3, 0, 4, 0 }, - { -1, 3, 0, 4, -1, 2, 4, 3 }, - { 2, 4, 4, 0, 2, 3, 3, 0 }, + { 2, 1, 1, 0, 1, 1, 1, 1 }, + { 8, 3, 2, 1, 4, 3, 2, 2 }, + { 0, 8, 3, 2, 0, 4, 3, 2 }, + { 0, 0, 8, 3, 0, 0, 4, 3 }, + { 
0, 0, 0, 8, 0, 0, 0, 4 }, + { 10, 6, 4, 2, 3, 4, 4, 3 }, + { 0, 0, 0, 0, 10, 6, 4, 3 } }, { - { -12, 0, 0, 0, -10, 14, 0, 0 }, - { 14, 0, 14, 0, 0, 0, 12, 0 }, - { -9, 0, 0, 0, -8, 0, 14, 0 }, - { 0, 14, 11, 0, 0, 0, 10, 0 }, - { -10, 0, 0, 14, -9, 12, 0, 12 }, - { 12, 0, 0, 0, 1, 0, 0, 0 }, - { -8, 0, 0, 11, -7, 0, 12, 9 }, - { 0, 12, 1, 0, 0, 1, 1, 0 }, - }, + { 12, 10, 9, 8, 10, 9, 8, 7 }, + { 14, 0, 0, 0, 12, 1, 0, 0 }, + { 0, 14, 0, 0, 0, 12, 0, 0 }, + { 0, 0, 14, 0, 0, 0, 12, 1 }, + { 0, 0, 0, 14, 0, 0, 0, 12 }, + { 14, 12, 11, 10, 0, 0, 1, 1 }, + { 0, 0, 0, 0, 14, 12, 11, 9 } + } + // clang-format on }; #define FILTER_INTRA_SCALE_BITS 4 -#define SHIFT_INTRA_SCALE_BITS 15 - FILTER_INTRA_SCALE_BITS - -#define MASK_LOW \ - 0x604020006040200 // (0 | (2 << 8) | (4 << 16) | (6 << 24)) x 2 -#define MASK_HIGH \ - 0x705030107050301 // (1 | (3 << 8) | (5 << 16) | (7 << 24)) x 2 void av1_filter_intra_predictor_neon(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode) { - int r, c; - uint8_t buffer[33][33]; - const int bw = tx_size_wide[tx_size]; - const int bh = tx_size_high[tx_size]; + const int width = tx_size_wide[tx_size]; + const int height = tx_size_high[tx_size]; + assert(width <= 32 && height <= 32); + + const uint8x8_t f0 = vld1_u8(av1_filter_intra_taps_neon[mode][0]); + const uint8x8_t f1 = vld1_u8(av1_filter_intra_taps_neon[mode][1]); + const uint8x8_t f2 = vld1_u8(av1_filter_intra_taps_neon[mode][2]); + const uint8x8_t f3 = vld1_u8(av1_filter_intra_taps_neon[mode][3]); + const uint8x8_t f4 = vld1_u8(av1_filter_intra_taps_neon[mode][4]); + const uint8x8_t f5 = vld1_u8(av1_filter_intra_taps_neon[mode][5]); + const uint8x8_t f6 = vld1_u8(av1_filter_intra_taps_neon[mode][6]); - const int8x16_t f1f0 = vld1q_s8(av1_filter_intra_taps_neon[mode][0]); - const int8x16_t f3f2 = vld1q_s8(av1_filter_intra_taps_neon[mode][2]); - const int8x16_t f5f4 = vld1q_s8(av1_filter_intra_taps_neon[mode][4]); - const int8x16_t f7f6 = vld1q_s8(av1_filter_intra_taps_neon[mode][6]); - const int16x8_t f1f0_lo = vmovl_s8(vget_low_s8(f1f0)); - const int16x8_t f1f0_hi = vmovl_s8(vget_high_s8(f1f0)); - const int16x8_t f3f2_lo = vmovl_s8(vget_low_s8(f3f2)); - const int16x8_t f3f2_hi = vmovl_s8(vget_high_s8(f3f2)); - const int16x8_t f5f4_lo = vmovl_s8(vget_low_s8(f5f4)); - const int16x8_t f5f4_hi = vmovl_s8(vget_high_s8(f5f4)); - const int16x8_t f7f6_lo = vmovl_s8(vget_low_s8(f7f6)); - const int16x8_t f7f6_hi = vmovl_s8(vget_high_s8(f7f6)); - const uint8x8_t vmask_low = vcreate_u8(MASK_LOW); - const uint8x8_t vmask_high = vcreate_u8(MASK_HIGH); - - assert(bw <= 32 && bh <= 32); - - for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r]; - memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t)); - - for (r = 1; r < bh + 1; r += 2) { - for (c = 1; c < bw + 1; c += 4) { - DECLARE_ALIGNED(16, uint8_t, p[8]); - memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t)); - p[5] = buffer[r][c - 1]; - p[6] = buffer[r + 1][c - 1]; - p[7] = 0; - - const uint8x8_t p_b = vld1_u8(p); - - const uint16x8_t p_b_lo = vmovl_u8(vtbl1_u8(p_b, vmask_low)); - const uint16x8_t p_b_hi = vmovl_u8(vtbl1_u8(p_b, vmask_high)); - - int16x8_t out_01 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f1f0_lo); - out_01 = vmlaq_s16(out_01, vreinterpretq_s16_u16(p_b_hi), f1f0_hi); - int16x8_t out_23 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f3f2_lo); - out_23 = vmlaq_s16(out_23, vreinterpretq_s16_u16(p_b_hi), f3f2_hi); - int16x8_t out_45 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f5f4_lo); - out_45 = 
vmlaq_s16(out_45, vreinterpretq_s16_u16(p_b_hi), f5f4_hi); - int16x8_t out_67 = vmulq_s16(vreinterpretq_s16_u16(p_b_lo), f7f6_lo); - out_67 = vmlaq_s16(out_67, vreinterpretq_s16_u16(p_b_hi), f7f6_hi); -#if AOM_ARCH_AARCH64 - const int16x8_t out_0123 = vpaddq_s16(out_01, out_23); - const int16x8_t out_4567 = vpaddq_s16(out_45, out_67); - const int16x8_t out_01234567 = vpaddq_s16(out_0123, out_4567); -#else - const int16x8_t out_0123 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_01)), - vqmovn_s32(vpaddlq_s16(out_23))); - const int16x8_t out_4567 = vcombine_s16(vqmovn_s32(vpaddlq_s16(out_45)), - vqmovn_s32(vpaddlq_s16(out_67))); - const int16x8_t out_01234567 = vcombine_s16( - vqmovn_s32(vpaddlq_s16(out_0123)), vqmovn_s32(vpaddlq_s16(out_4567))); -#endif // AOM_ARCH_AARCH64 - const uint32x2_t out_r = - vreinterpret_u32_u8(vqmovun_s16(vrshrq_n_s16(out_01234567, 4))); - // Storing - vst1_lane_u32((uint32_t *)&buffer[r][c], out_r, 0); - vst1_lane_u32((uint32_t *)&buffer[r + 1][c], out_r, 1); - } - } + uint8_t buffer[33][33]; + // Populate the top row in the scratch buffer with data from above. + memcpy(buffer[0], &above[-1], (width + 1) * sizeof(uint8_t)); + // Populate the first column in the scratch buffer with data from the left. + int r = 0; + do { + buffer[r + 1][0] = left[r]; + } while (++r < height); + + // Computing 4 cols per iteration (instead of 8) for 8x blocks is faster. + if (width <= 8) { + r = 1; + do { + int c = 1; + uint8x8_t s0 = vld1_dup_u8(&buffer[r - 1][c - 1]); + uint8x8_t s5 = vld1_dup_u8(&buffer[r + 0][c - 1]); + uint8x8_t s6 = vld1_dup_u8(&buffer[r + 1][c - 1]); + + do { + uint8x8_t s1234 = load_u8_4x1(&buffer[r - 1][c - 1] + 1); + uint8x8_t s1 = vdup_lane_u8(s1234, 0); + uint8x8_t s2 = vdup_lane_u8(s1234, 1); + uint8x8_t s3 = vdup_lane_u8(s1234, 2); + uint8x8_t s4 = vdup_lane_u8(s1234, 3); + + uint16x8_t sum = vmull_u8(s1, f1); + // First row of each filter has all negative values so subtract. + sum = vmlsl_u8(sum, s0, f0); + sum = vmlal_u8(sum, s2, f2); + sum = vmlal_u8(sum, s3, f3); + sum = vmlal_u8(sum, s4, f4); + sum = vmlal_u8(sum, s5, f5); + sum = vmlal_u8(sum, s6, f6); + + uint8x8_t res = + vqrshrun_n_s16(vreinterpretq_s16_u16(sum), FILTER_INTRA_SCALE_BITS); + + // Store buffer[r + 0][c] and buffer[r + 1][c]. + store_u8x4_strided_x2(&buffer[r][c], 33, res); + + store_u8x4_strided_x2(dst + (r - 1) * stride + c - 1, stride, res); + + s0 = s4; + s5 = vdup_lane_u8(res, 3); + s6 = vdup_lane_u8(res, 7); + c += 4; + } while (c < width + 1); + + r += 2; + } while (r < height + 1); + } else { + r = 1; + do { + int c = 1; + uint8x8_t s0_lo = vld1_dup_u8(&buffer[r - 1][c - 1]); + uint8x8_t s5_lo = vld1_dup_u8(&buffer[r + 0][c - 1]); + uint8x8_t s6_lo = vld1_dup_u8(&buffer[r + 1][c - 1]); + + do { + uint8x8_t s1234 = vld1_u8(&buffer[r - 1][c - 1] + 1); + uint8x8_t s1_lo = vdup_lane_u8(s1234, 0); + uint8x8_t s2_lo = vdup_lane_u8(s1234, 1); + uint8x8_t s3_lo = vdup_lane_u8(s1234, 2); + uint8x8_t s4_lo = vdup_lane_u8(s1234, 3); + + uint16x8_t sum_lo = vmull_u8(s1_lo, f1); + // First row of each filter has all negative values so subtract. 
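+ // vmlsl_u8 subtracts s0_lo * f0 from the accumulator; f0 stores the + // magnitudes of those negative taps.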
+ sum_lo = vmlsl_u8(sum_lo, s0_lo, f0); + sum_lo = vmlal_u8(sum_lo, s2_lo, f2); + sum_lo = vmlal_u8(sum_lo, s3_lo, f3); + sum_lo = vmlal_u8(sum_lo, s4_lo, f4); + sum_lo = vmlal_u8(sum_lo, s5_lo, f5); + sum_lo = vmlal_u8(sum_lo, s6_lo, f6); + + uint8x8_t res_lo = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_lo), + FILTER_INTRA_SCALE_BITS); + + uint8x8_t s0_hi = s4_lo; + uint8x8_t s1_hi = vdup_lane_u8(s1234, 4); + uint8x8_t s2_hi = vdup_lane_u8(s1234, 5); + uint8x8_t s3_hi = vdup_lane_u8(s1234, 6); + uint8x8_t s4_hi = vdup_lane_u8(s1234, 7); + uint8x8_t s5_hi = vdup_lane_u8(res_lo, 3); + uint8x8_t s6_hi = vdup_lane_u8(res_lo, 7); + + uint16x8_t sum_hi = vmull_u8(s1_hi, f1); + // First row of each filter has all negative values so subtract. + sum_hi = vmlsl_u8(sum_hi, s0_hi, f0); + sum_hi = vmlal_u8(sum_hi, s2_hi, f2); + sum_hi = vmlal_u8(sum_hi, s3_hi, f3); + sum_hi = vmlal_u8(sum_hi, s4_hi, f4); + sum_hi = vmlal_u8(sum_hi, s5_hi, f5); + sum_hi = vmlal_u8(sum_hi, s6_hi, f6); + + uint8x8_t res_hi = vqrshrun_n_s16(vreinterpretq_s16_u16(sum_hi), + FILTER_INTRA_SCALE_BITS); + + uint32x2x2_t res = + vzip_u32(vreinterpret_u32_u8(res_lo), vreinterpret_u32_u8(res_hi)); + + vst1_u8(&buffer[r + 0][c], vreinterpret_u8_u32(res.val[0])); + vst1_u8(&buffer[r + 1][c], vreinterpret_u8_u32(res.val[1])); + + vst1_u8(dst + (r - 1) * stride + c - 1, + vreinterpret_u8_u32(res.val[0])); + vst1_u8(dst + (r + 0) * stride + c - 1, + vreinterpret_u8_u32(res.val[1])); + + s0_lo = s4_hi; + s5_lo = vdup_lane_u8(res_hi, 3); + s6_lo = vdup_lane_u8(res_hi, 7); + c += 8; + } while (c < width + 1); - for (r = 0; r < bh; ++r) { - memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t)); - dst += stride; + r += 2; + } while (r < height + 1); } } diff -Nru aom-3.8.2/av1/common/arm/resize_neon.c aom-3.9.0/av1/common/arm/resize_neon.c --- aom-3.8.2/av1/common/arm/resize_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/resize_neon.c 2024-05-07 19:57:02.710000000 +0000 @@ -929,7 +929,7 @@ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(&temp[4 * z], d, 0); + store_u8_4x1(&temp[4 * z], d); } else { int i; for (i = 0; i < 4; ++i) { @@ -942,10 +942,10 @@ // transpose the 4x4 filters values back to dst { const uint8x8x4_t d4 = vld4_u8(temp); - store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0], 0); - store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1], 0); - store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2], 0); - store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3], 0); + store_u8_4x1(&dst[x + 0 * dst_stride], d4.val[0]); + store_u8_4x1(&dst[x + 1 * dst_stride], d4.val[1]); + store_u8_4x1(&dst[x + 2 * dst_stride], d4.val[2]); + store_u8_4x1(&dst[x + 3 * dst_stride], d4.val[3]); } x += 4; } while (x < w); @@ -1040,7 +1040,7 @@ tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7], filters); d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7); - store_u8_4x1(dst, d, 0); + store_u8_4x1(dst, d); } else { memcpy(dst, &src_y[3 * src_stride], w); } diff -Nru aom-3.8.2/av1/common/arm/warp_plane_neon.c aom-3.9.0/av1/common/arm/warp_plane_neon.c --- aom-3.8.2/av1/common/arm/warp_plane_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/warp_plane_neon.c 2024-05-07 19:57:02.714000000 +0000 @@ -11,8 +11,8 @@ #include "warp_plane_neon.h" -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { 
const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -39,8 +39,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -75,7 +75,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -101,7 +102,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -135,8 +137,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -161,8 +163,9 @@ *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -186,9 +189,10 @@ *res = horizontal_add_4d_s32x4(m0123_pairs); } -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -223,10 +227,10 @@ *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; diff -Nru aom-3.8.2/av1/common/arm/warp_plane_neon.h aom-3.9.0/av1/common/arm/warp_plane_neon.h --- aom-3.8.2/av1/common/arm/warp_plane_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/warp_plane_neon.h 2024-05-07 19:57:02.715000000 +0000 @@ -24,32 +24,37 @@ #include "av1/common/warped_motion.h" #include "av1/common/scale.h" -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha); -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha); -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx); -static INLINE int16x8_t 
horizontal_filter_8x1_f1(const uint8x16_t in, int sx); +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx); -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy); +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy); -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma); +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma); -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy); +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy); -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma); +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma); -static INLINE void load_filters_4(int16x8_t out[], int offset, int stride) { +static AOM_FORCE_INLINE void load_filters_4(int16x8_t out[], int offset, + int stride) { out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> WARPEDDIFF_PREC_BITS))); out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> @@ -60,7 +65,8 @@ WARPEDDIFF_PREC_BITS))); } -static INLINE void load_filters_8(int16x8_t out[], int offset, int stride) { +static AOM_FORCE_INLINE void load_filters_8(int16x8_t out[], int offset, + int stride) { out[0] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 0 * stride) >> WARPEDDIFF_PREC_BITS))); out[1] = vld1q_s16((int16_t *)(av1_warped_filter + ((offset + 1 * stride) >> @@ -79,14 +85,14 @@ WARPEDDIFF_PREC_BITS))); } -static INLINE int clamp_iy(int iy, int height) { +static AOM_FORCE_INLINE int clamp_iy(int iy, int height) { return clamp(iy, 0, height - 1); } -static INLINE void warp_affine_horizontal( +static AOM_FORCE_INLINE void warp_affine_horizontal( const uint8_t *ref, int width, int height, int stride, int p_width, int p_height, int16_t alpha, int16_t beta, const int64_t x4, - const int64_t y4, const int i, int16x8_t tmp[], const uint8x16_t indx_vec) { + const int64_t y4, const int i, int16x8_t tmp[]) { const int bd = 8; const int reduce_bits_horiz = ROUND0_BITS; const int height_limit = AOMMIN(8, p_height - i) + 7; @@ -119,92 +125,83 @@ return; } - uint8x16_t in[15]; - if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) { - const int out_of_boundary_left = -(ix4 - 6); - const int out_of_boundary_right = (ix4 + 8) - width; + static const uint8_t kIotaArr[] = { 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15 }; + const uint8x16_t indx = vld1q_u8(kIotaArr); - for (int k = 0; k < height_limit; ++k) { - const int iy = clamp_iy(iy4 + k - 7, height); - const uint8_t *src = ref + iy * stride + ix4 - 7; - uint8x16_t src_1 = vld1q_u8(src); - - if (out_of_boundary_left >= 0) { - int limit = out_of_boundary_left + 1; - uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); - uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); - uint8x16_t mask_val = vcleq_u8(indx_vec, cmp_vec); - src_1 = vbslq_u8(mask_val, vec_dup, src_1); - } - if (out_of_boundary_right >= 0) { - int limit = 15 - (out_of_boundary_right + 1); - uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); - uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); - uint8x16_t mask_val = vcgeq_u8(indx_vec, cmp_vec); - 
src_1 = vbslq_u8(mask_val, vec_dup, src_1); - } - in[k] = src_1; - } - } else { - for (int k = 0; k < height_limit; ++k) { - const int iy = clamp_iy(iy4 + k - 7, height); - const uint8_t *src = ref + iy * stride + ix4 - 7; - in[k] = vld1q_u8(src); - } - } + const int out_of_boundary_left = -(ix4 - 6); + const int out_of_boundary_right = (ix4 + 8) - width; + +#define APPLY_HORIZONTAL_SHIFT(fn, ...) \ + do { \ + if (out_of_boundary_left >= 0 || out_of_boundary_right >= 0) { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + \ + if (out_of_boundary_left >= 0) { \ + int limit = out_of_boundary_left + 1; \ + uint8x16_t cmp_vec = vdupq_n_u8(out_of_boundary_left); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcleq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + if (out_of_boundary_right >= 0) { \ + int limit = 15 - (out_of_boundary_right + 1); \ + uint8x16_t cmp_vec = vdupq_n_u8(15 - out_of_boundary_right); \ + uint8x16_t vec_dup = vdupq_n_u8(*(src + limit)); \ + uint8x16_t mask_val = vcgeq_u8(indx, cmp_vec); \ + src_1 = vbslq_u8(mask_val, vec_dup, src_1); \ + } \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } else { \ + for (int k = 0; k < height_limit; ++k) { \ + const int iy = clamp_iy(iy4 + k - 7, height); \ + const uint8_t *src = ref + iy * stride + ix4 - 7; \ + uint8x16_t src_1 = vld1q_u8(src); \ + tmp[k] = (fn)(src_1, __VA_ARGS__); \ + } \ + } \ + } while (0) if (p_width == 4) { if (beta == 0) { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_4x1_f1(in[k], sx4); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, sx4); } else { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_4x1_f4(in[k], sx4, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, sx4, alpha); } } else { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_4x1_f1(in[k], sx); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f1, + (sx4 + beta * (k - 3))); } else { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_4x1_f4(in[k], sx, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_4x1_f4, (sx4 + beta * (k - 3)), + alpha); } } } else { if (beta == 0) { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_8x1_f1(in[k], sx4); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, sx4); } else { - for (int k = 0; k < height_limit; ++k) { - tmp[k] = horizontal_filter_8x1_f8(in[k], sx4, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, sx4, alpha); } } else { if (alpha == 0) { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_8x1_f1(in[k], sx); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f1, + (sx4 + beta * (k - 3))); } else { - for (int k = 0; k < height_limit; ++k) { - const int sx = sx4 + beta * (k - 3); - tmp[k] = horizontal_filter_8x1_f8(in[k], sx, alpha); - } + APPLY_HORIZONTAL_SHIFT(horizontal_filter_8x1_f8, (sx4 + beta * (k - 3)), + alpha); } } } } -static INLINE void warp_affine_vertical( +static AOM_FORCE_INLINE void warp_affine_vertical( uint8_t *pred, int p_width, int p_height, int p_stride, int is_compound, uint16_t *dst, int dst_stride, int do_average, int use_dist_wtd_comp_avg, 
int16_t gamma, int16_t delta, const int64_t y4, const int i, const int j, @@ -332,7 +329,7 @@ } } -static INLINE void av1_warp_affine_common( +static AOM_FORCE_INLINE void av1_warp_affine_common( const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, @@ -346,10 +343,6 @@ const int do_average = conv_params->do_average; const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - static const uint8_t k0To15[16] = { 0, 1, 2, 3, 4, 5, 6, 7, - 8, 9, 10, 11, 12, 13, 14, 15 }; - const uint8x16_t indx_vec = vld1q_u8(k0To15); - assert(IMPLIES(is_compound, dst != NULL)); assert(IMPLIES(do_average, is_compound)); @@ -367,7 +360,7 @@ int16x8_t tmp[15]; warp_affine_horizontal(ref, width, height, stride, p_width, p_height, - alpha, beta, x4, y4, i, tmp, indx_vec); + alpha, beta, x4, y4, i, tmp); warp_affine_vertical(pred, p_width, p_height, p_stride, is_compound, dst, dst_stride, do_average, use_dist_wtd_comp_avg, gamma, delta, y4, i, j, tmp, w0, w1); diff -Nru aom-3.8.2/av1/common/arm/warp_plane_neon_i8mm.c aom-3.9.0/av1/common/arm/warp_plane_neon_i8mm.c --- aom-3.8.2/av1/common/arm/warp_plane_neon_i8mm.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/warp_plane_neon_i8mm.c 2024-05-07 19:57:02.715000000 +0000 @@ -17,8 +17,8 @@ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -45,8 +45,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -83,7 +83,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -112,7 +113,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -149,8 +151,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -175,8 +177,9 @@ *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -200,9 +203,10 @@ *res = horizontal_add_4d_s32x4(m0123_pairs); } 
-static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -237,10 +241,10 @@ *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; diff -Nru aom-3.8.2/av1/common/arm/warp_plane_sve.c aom-3.9.0/av1/common/arm/warp_plane_sve.c --- aom-3.8.2/av1/common/arm/warp_plane_sve.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/arm/warp_plane_sve.c 2024-05-07 19:57:02.716000000 +0000 @@ -9,9 +9,10 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ -#include "warp_plane_neon.h" +#include -#include +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "warp_plane_neon.h" DECLARE_ALIGNED(16, static const uint8_t, usdot_permute_idx[48]) = { 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6, @@ -19,22 +20,8 @@ 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14 }; -static INLINE int64x2_t aom_sdotq_s16(int64x2_t acc, int16x8_t x, int16x8_t y) { - // The 16-bit dot product instructions only exist in SVE and not Neon. - // We can get away without rewriting the existing Neon code by making use of - // the Neon-SVE bridge intrinsics to reinterpret a Neon vector as a SVE - // vector with the high part of the vector being "don't care", and then - // operating on that instead. - // This is clearly suboptimal in machines with a SVE vector length above - // 128-bits as the remainder of the vector is wasted, however this appears to - // still be beneficial compared to not using the instruction. 
- return svget_neonq_s64(svdot_s64(svset_neonq_s64(svundef_s64(), acc), - svset_neonq_s16(svundef_s16(), x), - svset_neonq_s16(svundef_s16(), y))); -} - -static INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f4(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -61,8 +48,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, int sx, - int alpha) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f8(const uint8x16_t in, + int sx, int alpha) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); // Loading the 8 filter taps @@ -99,7 +86,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_4x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -128,7 +116,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, int sx) { +static AOM_FORCE_INLINE int16x8_t horizontal_filter_8x1_f1(const uint8x16_t in, + int sx) { const int32x4_t add_const = vdupq_n_s32(1 << (8 + FILTER_BITS - 1)); int16x8_t f_s16 = @@ -165,8 +154,8 @@ return vreinterpretq_s16_u16(res); } -static INLINE void vertical_filter_4x1_f1(const int16x8_t *src, int32x4_t *res, - int sy) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f1(const int16x8_t *src, + int32x4_t *res, int sy) { int16x4_t s0 = vget_low_s16(src[0]); int16x4_t s1 = vget_low_s16(src[1]); int16x4_t s2 = vget_low_s16(src[2]); @@ -191,8 +180,9 @@ *res = m0123; } -static INLINE void vertical_filter_4x1_f4(const int16x8_t *src, int32x4_t *res, - int sy, int gamma) { +static AOM_FORCE_INLINE void vertical_filter_4x1_f4(const int16x8_t *src, + int32x4_t *res, int sy, + int gamma) { int16x8_t s0, s1, s2, s3; transpose_elems_s16_4x8( vget_low_s16(src[0]), vget_low_s16(src[1]), vget_low_s16(src[2]), @@ -213,9 +203,10 @@ *res = vcombine_s32(vmovn_s64(m01), vmovn_s64(m23)); } -static INLINE void vertical_filter_8x1_f1(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f1(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, + int sy) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; @@ -250,10 +241,10 @@ *res_high = m4567; } -static INLINE void vertical_filter_8x1_f8(const int16x8_t *src, - int32x4_t *res_low, - int32x4_t *res_high, int sy, - int gamma) { +static AOM_FORCE_INLINE void vertical_filter_8x1_f8(const int16x8_t *src, + int32x4_t *res_low, + int32x4_t *res_high, int sy, + int gamma) { int16x8_t s0 = src[0]; int16x8_t s1 = src[1]; int16x8_t s2 = src[2]; diff -Nru aom-3.8.2/av1/common/av1_common_int.h aom-3.9.0/av1/common/av1_common_int.h --- aom-3.8.2/av1/common/av1_common_int.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/av1_common_int.h 2024-05-07 19:57:02.717000000 +0000 @@ -17,7 +17,7 @@ #include "aom/internal/aom_codec_internal.h" #include "aom_dsp/flow_estimation/corner_detect.h" -#include "aom_util/aom_thread.h" +#include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_loopfilter.h" #include "av1/common/entropy.h" diff -Nru aom-3.8.2/av1/common/av1_rtcd.c 
aom-3.9.0/av1/common/av1_rtcd.c --- aom-3.8.2/av1/common/av1_rtcd.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/av1_rtcd.c 2024-05-07 19:57:02.726000000 +0000 @@ -15,4 +15,4 @@ #include "aom_ports/aom_once.h" -void av1_rtcd() { aom_once(setup_rtcd_internal); } +void av1_rtcd(void) { aom_once(setup_rtcd_internal); } diff -Nru aom-3.8.2/av1/common/av1_rtcd_defs.pl aom-3.9.0/av1/common/av1_rtcd_defs.pl --- aom-3.8.2/av1/common/av1_rtcd_defs.pl 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/av1_rtcd_defs.pl 2024-05-07 19:57:02.727000000 +0000 @@ -77,6 +77,16 @@ } forward_decls qw/av1_common_forward_decls/; +# Fallbacks for Valgrind support +# For normal use, we require SSE4.1. However, 32-bit Valgrind does not support +# SSE4.1, so we include fallbacks for some critical functions to improve +# performance +$sse2_x86 = $ssse3_x86 = ''; +if ($opts{arch} eq "x86") { + $sse2_x86 = 'sse2'; + $ssse3_x86 = 'ssse3'; +} + # functions that are 64 bit only. $mmx_x86_64 = $sse2_x86_64 = $ssse3_x86_64 = $avx_x86_64 = $avx2_x86_64 = ''; if ($opts{arch} eq "x86_64") { @@ -245,12 +255,11 @@ if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { # directional intra predictor functions add_proto qw/void av1_highbd_dr_prediction_z1/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int dx, int dy, int bd"; - specialize qw/av1_highbd_dr_prediction_z1 avx2/; + specialize qw/av1_highbd_dr_prediction_z1 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z2/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_above, int upsample_left, int dx, int dy, int bd"; - - specialize qw/av1_highbd_dr_prediction_z2 avx2/; + specialize qw/av1_highbd_dr_prediction_z2 avx2 neon/; add_proto qw/void av1_highbd_dr_prediction_z3/, "uint16_t *dst, ptrdiff_t stride, int bw, int bh, const uint16_t *above, const uint16_t *left, int upsample_left, int dx, int dy, int bd"; - specialize qw/av1_highbd_dr_prediction_z3 avx2/; + specialize qw/av1_highbd_dr_prediction_z3 avx2 neon/; } # build compound seg mask functions @@ -319,10 +328,10 @@ # the transform coefficients are held in 32-bit # values, so the assembler code for av1_block_error can no longer be used. 
add_proto qw/int64_t av1_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz"; - specialize qw/av1_block_error sse2 avx2 neon/; + specialize qw/av1_block_error sse2 avx2 neon sve/; add_proto qw/int64_t av1_block_error_lp/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size"; - specialize qw/av1_block_error_lp sse2 avx2 neon/; + specialize qw/av1_block_error_lp sse2 avx2 neon sve/; add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp sse2 avx2 neon/; @@ -346,7 +355,7 @@ #fwd txfm add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param"; - specialize qw/av1_lowbd_fwd_txfm sse2 sse4_1 avx2 neon/; + specialize qw/av1_lowbd_fwd_txfm sse4_1 avx2 neon/, $sse2_x86; add_proto qw/void av1_fwd_txfm2d_4x8/, "const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd"; specialize qw/av1_fwd_txfm2d_4x8 sse4_1 neon/; @@ -437,9 +446,9 @@ specialize qw/av1_txb_init_levels sse4_1 avx2 neon/; add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; - specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon/; + specialize qw/av1_wedge_sse_from_residuals sse2 avx2 neon sve/; add_proto qw/int8_t av1_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit"; - specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon/; + specialize qw/av1_wedge_sign_from_residuals sse2 avx2 neon sve/; add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N"; specialize qw/av1_wedge_compute_delta_squares sse2 avx2 neon/; @@ -459,7 +468,7 @@ add_proto qw/void av1_calc_proj_params_high_bd/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int64_t H[2][2], int64_t C[2], const sgr_params_type *params"; specialize qw/av1_calc_proj_params_high_bd sse4_1 avx2 neon/; add_proto qw/int64_t av1_highbd_pixel_proj_error/, "const uint8_t *src8, int width, int height, int src_stride, const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params"; - specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2/; + specialize qw/av1_highbd_pixel_proj_error sse4_1 avx2 neon/; add_proto qw/void av1_compute_stats_highbd/, "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, aom_bit_depth_t bit_depth"; specialize qw/av1_compute_stats_highbd sse4_1 avx2 neon/; } @@ -485,6 +494,7 @@ if (aom_config("CONFIG_EXCLUDE_SIMD_MISMATCH") ne "yes") { specialize qw/av1_cnn_convolve_no_maxpool_padding_valid avx2/; } + specialize qw/av1_cnn_convolve_no_maxpool_padding_valid neon/; add_proto qw/void av1_cnn_deconvolve/, "const float **input, int in_width, int in_height, int in_stride, const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride"; add_proto qw/void av1_cnn_batchnorm/, "float **image, int channels, int width, int height, int stride, const float *gamma, const float 
*beta, const float *mean, const float *std"; } @@ -521,27 +531,27 @@ # structs as arguments, which makes the v256 type of the intrinsics # hard to support, so optimizations for this target are disabled. if ($opts{config} !~ /libs-x86-win32-vs.*/) { - specialize qw/cdef_find_dir sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_find_dir_dual sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_find_dir sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_find_dir_dual sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_filter_8_0 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_8_1 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_8_2 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_8_3 sse2 ssse3 sse4_1 avx2 neon/; - - specialize qw/cdef_filter_16_0 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_16_1 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_16_2 sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_filter_16_3 sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_filter_8_0 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_1 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_2 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_8_3 sse4_1 avx2 neon/, "$ssse3_x86"; + + specialize qw/cdef_filter_16_0 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_1 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_2 sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_filter_16_3 sse4_1 avx2 neon/, "$ssse3_x86"; - specialize qw/cdef_copy_rect8_8bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; - specialize qw/cdef_copy_rect8_16bit_to_16bit sse2 ssse3 sse4_1 avx2 neon/; + specialize qw/cdef_copy_rect8_8bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; + specialize qw/cdef_copy_rect8_16bit_to_16bit sse4_1 avx2 neon/, "$ssse3_x86"; } # WARPED_MOTION / GLOBAL_MOTION functions if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { add_proto qw/void av1_highbd_warp_affine/, "const int32_t *mat, const uint16_t *ref, int width, int height, int stride, uint16_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int bd, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; - specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon/; + specialize qw/av1_highbd_warp_affine sse4_1 avx2 neon sve/; } add_proto qw/void av1_warp_affine/, "const int32_t *mat, const uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta"; @@ -591,20 +601,20 @@ specialize qw/av1_convolve_y_sr sse2 avx2 neon/; specialize qw/av1_convolve_y_sr_intrabc neon/; specialize qw/av1_convolve_2d_scale sse4_1/; - specialize qw/av1_dist_wtd_convolve_2d sse2 ssse3 avx2 neon neon_dotprod neon_i8mm/; + specialize qw/av1_dist_wtd_convolve_2d ssse3 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_2d_copy sse2 avx2 neon/; specialize qw/av1_dist_wtd_convolve_x sse2 avx2 neon neon_dotprod neon_i8mm/; specialize qw/av1_dist_wtd_convolve_y sse2 avx2 neon/; if(aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") { specialize qw/av1_highbd_dist_wtd_convolve_2d sse4_1 avx2 neon/; - specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon/; + specialize qw/av1_highbd_dist_wtd_convolve_x sse4_1 avx2 neon sve2/; specialize qw/av1_highbd_dist_wtd_convolve_y sse4_1 avx2 
neon/; specialize qw/av1_highbd_dist_wtd_convolve_2d_copy sse4_1 avx2 neon/; - specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon/; + specialize qw/av1_highbd_convolve_2d_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_2d_sr_intrabc neon/; - specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon/; + specialize qw/av1_highbd_convolve_x_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_x_sr_intrabc neon/; - specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon/; + specialize qw/av1_highbd_convolve_y_sr ssse3 avx2 neon sve2/; specialize qw/av1_highbd_convolve_y_sr_intrabc neon/; specialize qw/av1_highbd_convolve_2d_scale sse4_1 neon/; } diff -Nru aom-3.8.2/av1/common/blockd.h aom-3.9.0/av1/common/blockd.h --- aom-3.8.2/av1/common/blockd.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/blockd.h 2024-05-07 19:57:02.732000000 +0000 @@ -1142,7 +1142,7 @@ return largest_tx_size; } -static const uint8_t mode_to_angle_map[] = { +static const uint8_t mode_to_angle_map[INTRA_MODES] = { 0, 90, 180, 45, 135, 113, 157, 203, 67, 0, 0, 0, 0, }; diff -Nru aom-3.8.2/av1/common/cdef.c aom-3.9.0/av1/common/cdef.c --- aom-3.8.2/av1/common/cdef.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/cdef.c 2024-05-07 19:57:02.735000000 +0000 @@ -10,15 +10,19 @@ */ #include -#include +#include #include #include "config/aom_scale_rtcd.h" #include "aom/aom_integer.h" +#include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/cdef.h" #include "av1/common/cdef_block.h" +#include "av1/common/common.h" +#include "av1/common/common_data.h" +#include "av1/common/enums.h" #include "av1/common/reconinter.h" #include "av1/common/thread_common.h" @@ -92,7 +96,7 @@ const uint8_t *src, int src_voffset, int src_hoffset, int sstride, int vsize, int hsize) { - const uint8_t *base = &src[src_voffset * sstride + src_hoffset]; + const uint8_t *base = &src[src_voffset * (ptrdiff_t)sstride + src_hoffset]; cdef_copy_rect8_8bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); } @@ -101,7 +105,7 @@ int src_hoffset, int sstride, int vsize, int hsize) { const uint16_t *base = - &CONVERT_TO_SHORTPTR(src)[src_voffset * sstride + src_hoffset]; + &CONVERT_TO_SHORTPTR(src)[src_voffset * (ptrdiff_t)sstride + src_hoffset]; cdef_copy_rect8_16bit_to_16bit(dst, dstride, base, sstride, hsize, vsize); } @@ -247,7 +251,8 @@ static INLINE void cdef_filter_fb(CdefBlockInfo *const fb_info, int plane, uint8_t use_highbitdepth) { - int offset = fb_info->dst_stride * fb_info->roffset + fb_info->coffset; + ptrdiff_t offset = + (ptrdiff_t)fb_info->dst_stride * fb_info->roffset + fb_info->coffset; if (use_highbitdepth) { av1_cdef_filter_fb( NULL, CONVERT_TO_SHORTPTR(fb_info->dst + offset), fb_info->dst_stride, diff -Nru aom-3.8.2/av1/common/cdef_block_simd.h aom-3.9.0/av1/common/cdef_block_simd.h --- aom-3.8.2/av1/common/cdef_block_simd.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/cdef_block_simd.h 2024-05-07 19:57:02.738000000 +0000 @@ -158,9 +158,6 @@ res[0] = v128_ziphi_64(tr1_7, tr1_6); } -// There is a separate Neon implementation of this function, so disable this -// one. -#if !HAVE_NEON int SIMD_FUNC(cdef_find_dir)(const uint16_t *img, int stride, int32_t *var, int coeff_shift) { int i; @@ -199,7 +196,6 @@ *var >>= 10; return best_dir; } -#endif // Work around compiler out of memory issues with Win32 builds. This issue has // been observed with Visual Studio 2017, 2019, and 2022 (version 17.4). 
@@ -209,9 +205,6 @@ #define CDEF_INLINE SIMD_INLINE #endif -// There is a separate Neon implementation of these functions, so disable this -// one. -#if !HAVE_NEON // sign(a-b) * min(abs(a-b), max(0, threshold - (abs(a-b) >> adjdamp))) CDEF_INLINE v256 constrain16(v256 a, v256 b, unsigned int threshold, unsigned int adjdamp) { @@ -830,7 +823,6 @@ copy_block_4xh(/*is_lowbd=*/0, dest, dstride, in, block_height); } } -#endif // HAVE_NEON void SIMD_FUNC(cdef_copy_rect8_16bit_to_16bit)(uint16_t *dst, int dstride, const uint16_t *src, int sstride, diff -Nru aom-3.8.2/av1/common/entropymode.h aom-3.9.0/av1/common/entropymode.h --- aom-3.8.2/av1/common/entropymode.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/entropymode.h 2024-05-07 19:57:02.751000000 +0000 @@ -12,6 +12,7 @@ #ifndef AOM_AV1_COMMON_ENTROPYMODE_H_ #define AOM_AV1_COMMON_ENTROPYMODE_H_ +#include "aom_ports/bitops.h" #include "av1/common/entropy.h" #include "av1/common/entropymv.h" #include "av1/common/filter.h" @@ -192,13 +193,7 @@ // Returns (int)ceil(log2(n)). static INLINE int av1_ceil_log2(int n) { if (n < 2) return 0; - int i = 1; - unsigned int p = 2; - while (p < (unsigned int)n) { - i++; - p = p << 1; - } - return i; + return get_msb(n - 1) + 1; } // Returns the context for palette color index at row 'r' and column 'c', diff -Nru aom-3.8.2/av1/common/quant_common.c aom-3.9.0/av1/common/quant_common.c --- aom-3.8.2/av1/common/quant_common.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/quant_common.c 2024-05-07 19:57:02.771000000 +0000 @@ -9,10 +9,15 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include "config/aom_config.h" + +#include "aom/aom_frame_buffer.h" +#include "aom_scale/yv12config.h" #include "av1/common/av1_common_int.h" #include "av1/common/blockd.h" #include "av1/common/common.h" #include "av1/common/entropy.h" +#include "av1/common/filter.h" #include "av1/common/quant_common.h" #include "av1/common/seg_common.h" @@ -274,13 +279,16 @@ : quant_params->gqmatrix[NUM_QM_LEVELS - 1][0][qm_tx_size]; } +#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER #define QM_TOTAL_SIZE 3344 // We only use wt_matrix_ref[q] and iwt_matrix_ref[q] // for q = 0, ..., NUM_QM_LEVELS - 2. static const qm_val_t wt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE]; +#endif void av1_qm_init(CommonQuantParams *quant_params, int num_planes) { +#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER for (int q = 0; q < NUM_QM_LEVELS; ++q) { for (int c = 0; c < num_planes; ++c) { int current = 0; @@ -306,6 +314,10 @@ } } } +#else + (void)quant_params; + (void)num_planes; +#endif // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER } /* Provide 15 sets of quantization matrices for chroma and luma @@ -320,6 +332,8 @@ distances. Matrices for QM level 15 are omitted because they are not used. 
*/ + +#if CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER static const qm_val_t iwt_matrix_ref[NUM_QM_LEVELS - 1][2][QM_TOTAL_SIZE] = { { { /* Luma */ @@ -12873,4 +12887,6 @@ 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32, 34, 33, 33, 33, 32, 32, 32, 32 }, }, -}; \ No newline at end of file +}; + +#endif // CONFIG_QUANT_MATRIX || CONFIG_AV1_DECODER diff -Nru aom-3.8.2/av1/common/reconinter.h aom-3.9.0/av1/common/reconinter.h --- aom-3.8.2/av1/common/reconinter.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/reconinter.h 2024-05-07 19:57:02.832000000 +0000 @@ -449,7 +449,7 @@ #define MASK_MASTER_SIZE ((MAX_WEDGE_SIZE) << 1) #define MASK_MASTER_STRIDE (MASK_MASTER_SIZE) -void av1_init_wedge_masks(); +void av1_init_wedge_masks(void); static INLINE const uint8_t *av1_get_contiguous_soft_mask(int8_t wedge_index, int8_t wedge_sign, diff -Nru aom-3.8.2/av1/common/reconintra.c aom-3.9.0/av1/common/reconintra.c --- aom-3.8.2/av1/common/reconintra.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/reconintra.c 2024-05-07 19:57:02.834000000 +0000 @@ -9,6 +9,7 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include #include #include "config/aom_config.h" @@ -959,21 +960,18 @@ } static int get_intra_edge_filter_type(const MACROBLOCKD *xd, int plane) { - int ab_sm, le_sm; + const MB_MODE_INFO *above; + const MB_MODE_INFO *left; if (plane == 0) { - const MB_MODE_INFO *ab = xd->above_mbmi; - const MB_MODE_INFO *le = xd->left_mbmi; - ab_sm = ab ? is_smooth(ab, plane) : 0; - le_sm = le ? is_smooth(le, plane) : 0; + above = xd->above_mbmi; + left = xd->left_mbmi; } else { - const MB_MODE_INFO *ab = xd->chroma_above_mbmi; - const MB_MODE_INFO *le = xd->chroma_left_mbmi; - ab_sm = ab ? is_smooth(ab, plane) : 0; - le_sm = le ? is_smooth(le, plane) : 0; + above = xd->chroma_above_mbmi; + left = xd->chroma_left_mbmi; } - return (ab_sm || le_sm) ? 1 : 0; + return (above && is_smooth(above, plane)) || (left && is_smooth(left, plane)); } static int intra_edge_filter_strength(int bs0, int bs1, int delta, int type) { @@ -1071,7 +1069,7 @@ } } -static void build_intra_predictors( +static void build_directional_and_filter_intra_predictors( const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, @@ -1090,6 +1088,7 @@ int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + assert(use_filter_intra || is_dr_mode); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 // path in av1_dr_prediction_z1_avx2()) from left_data, above_data are seen to @@ -1190,49 +1189,119 @@ return; } - if (is_dr_mode) { - int upsample_above = 0; - int upsample_left = 0; - if (!disable_edge_filter) { - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 1 : 0; - if (need_above && need_left && (txwpx + txhpx >= 24)) { - filter_intra_edge_corner(above_row, left_col); - } - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength( - txwpx, txhpx, p_angle - 90, intra_edge_filter_type); - const int n_px = n_top_px + ab_le + (need_right ? 
txhpx : 0); - av1_filter_intra_edge(above_row - ab_le, n_px, strength); - } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, intra_edge_filter_type); - const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); - av1_filter_intra_edge(left_col - ab_le, n_px, strength); - } - } - upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, - intra_edge_filter_type); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_upsample_intra_edge(above_row, n_px); - } - upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, - intra_edge_filter_type); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_upsample_intra_edge(left_col, n_px); - } + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + assert(need_above_left); + const int ab_le = 1; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_filter_intra_edge(left_col - ab_le, n_px, strength); + } + } + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_upsample_intra_edge(above_row, n_px); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_upsample_intra_edge(left_col, n_px); + } + } + dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, + upsample_left, p_angle); +} + +// This function generates the pred data of a given block for non-directional +// intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, SMOOTH_V and PAETH). +static void build_non_directional_intra_predictors( + const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px) { + const uint8_t *above_ref = ref - ref_stride; + const uint8_t *left_ref = ref - 1; + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + const int need_left = extend_modes[mode] & NEED_LEFT; + const int need_above = extend_modes[mode] & NEED_ABOVE; + const int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + int i = 0; + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : 129; + } else { + val = (n_left_px > 0) ? 
left_ref[0] : 127; + } + for (i = 0; i < txhpx; ++i) { + memset(dst, val, txwpx); + dst += dst_stride; } - dr_predictor(dst, dst_stride, tx_size, above_row, left_col, upsample_above, - upsample_left, p_angle); return; } - // predict + DECLARE_ALIGNED(16, uint8_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint8_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint8_t *const above_row = above_data + 16; + uint8_t *const left_col = left_data + 16; + + if (need_left) { + memset(left_data, 129, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) memset(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + memset(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + memset(above_data, 127, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px); + i = n_top_px; + if (i < txwpx) memset(&above_row[i], above_row[i - 1], txwpx - i); + } else if (n_left_px > 0) { + memset(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = 128; + } + left_col[-1] = above_row[-1]; + } + if (mode == DC_PRED) { dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride, above_row, left_col); @@ -1300,7 +1369,7 @@ } } -static void highbd_build_intra_predictors( +static void highbd_build_directional_and_filter_intra_predictors( const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, PREDICTION_MODE mode, int p_angle, FILTER_INTRA_MODE filter_intra_mode, TX_SIZE tx_size, int disable_edge_filter, int n_top_px, int n_topright_px, @@ -1308,7 +1377,7 @@ int bit_depth) { int i; uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); - uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); uint16_t *const above_row = above_data + 16; @@ -1322,7 +1391,8 @@ const uint16_t *left_ref = ref - 1; const int is_dr_mode = av1_is_directional_mode(mode); const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; - int base = 128 << (bit_depth - 8); + assert(use_filter_intra || is_dr_mode); + const int base = 128 << (bit_depth - 8); // The left_data, above_data buffers must be zeroed to fix some intermittent // valgrind errors. Uninitialized reads in intra pred modules (e.g. width = 4 // path in av1_highbd_dr_prediction_z2_avx2()) from left_data, above_data are @@ -1424,49 +1494,125 @@ return; } - if (is_dr_mode) { - int upsample_above = 0; - int upsample_left = 0; - if (!disable_edge_filter) { - const int need_right = p_angle < 90; - const int need_bottom = p_angle > 180; - if (p_angle != 90 && p_angle != 180) { - const int ab_le = need_above_left ? 1 : 0; - if (need_above && need_left && (txwpx + txhpx >= 24)) { - highbd_filter_intra_edge_corner(above_row, left_col); - } - if (need_above && n_top_px > 0) { - const int strength = intra_edge_filter_strength( - txwpx, txhpx, p_angle - 90, intra_edge_filter_type); - const int n_px = n_top_px + ab_le + (need_right ? 
txhpx : 0); - av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); - } - if (need_left && n_left_px > 0) { - const int strength = intra_edge_filter_strength( - txhpx, txwpx, p_angle - 180, intra_edge_filter_type); - const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); - av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); - } - } - upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, - intra_edge_filter_type); - if (need_above && upsample_above) { - const int n_px = txwpx + (need_right ? txhpx : 0); - av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); - } - upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, - intra_edge_filter_type); - if (need_left && upsample_left) { - const int n_px = txhpx + (need_bottom ? txwpx : 0); - av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); - } + assert(is_dr_mode); + int upsample_above = 0; + int upsample_left = 0; + if (!disable_edge_filter) { + const int need_right = p_angle < 90; + const int need_bottom = p_angle > 180; + if (p_angle != 90 && p_angle != 180) { + assert(need_above_left); + const int ab_le = 1; + if (need_above && need_left && (txwpx + txhpx >= 24)) { + highbd_filter_intra_edge_corner(above_row, left_col); + } + if (need_above && n_top_px > 0) { + const int strength = intra_edge_filter_strength( + txwpx, txhpx, p_angle - 90, intra_edge_filter_type); + const int n_px = n_top_px + ab_le + (need_right ? txhpx : 0); + av1_highbd_filter_intra_edge(above_row - ab_le, n_px, strength); + } + if (need_left && n_left_px > 0) { + const int strength = intra_edge_filter_strength( + txhpx, txwpx, p_angle - 180, intra_edge_filter_type); + const int n_px = n_left_px + ab_le + (need_bottom ? txwpx : 0); + av1_highbd_filter_intra_edge(left_col - ab_le, n_px, strength); + } + } + upsample_above = av1_use_intra_edge_upsample(txwpx, txhpx, p_angle - 90, + intra_edge_filter_type); + if (need_above && upsample_above) { + const int n_px = txwpx + (need_right ? txhpx : 0); + av1_highbd_upsample_intra_edge(above_row, n_px, bit_depth); + } + upsample_left = av1_use_intra_edge_upsample(txhpx, txwpx, p_angle - 180, + intra_edge_filter_type); + if (need_left && upsample_left) { + const int n_px = txhpx + (need_bottom ? txwpx : 0); + av1_highbd_upsample_intra_edge(left_col, n_px, bit_depth); + } + } + highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, + upsample_above, upsample_left, p_angle, bit_depth); +} + +// For HBD encode/decode, this function generates the pred data of a given +// block for non-directional intra prediction modes (i.e., DC, SMOOTH, SMOOTH_H, +// SMOOTH_V and PAETH). 
+static void highbd_build_non_directional_intra_predictors( + const uint8_t *ref8, int ref_stride, uint8_t *dst8, int dst_stride, + PREDICTION_MODE mode, TX_SIZE tx_size, int n_top_px, int n_left_px, + int bit_depth) { + int i = 0; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + const uint16_t *const ref = CONVERT_TO_SHORTPTR(ref8); + const int txwpx = tx_size_wide[tx_size]; + const int txhpx = tx_size_high[tx_size]; + int need_left = extend_modes[mode] & NEED_LEFT; + int need_above = extend_modes[mode] & NEED_ABOVE; + int need_above_left = extend_modes[mode] & NEED_ABOVELEFT; + const uint16_t *above_ref = ref - ref_stride; + const uint16_t *left_ref = ref - 1; + const int base = 128 << (bit_depth - 8); + + assert(n_top_px >= 0); + assert(n_left_px >= 0); + assert(mode == DC_PRED || mode == SMOOTH_PRED || mode == SMOOTH_V_PRED || + mode == SMOOTH_H_PRED || mode == PAETH_PRED); + + if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) { + int val = 0; + if (need_left) { + val = (n_top_px > 0) ? above_ref[0] : base + 1; + } else { + val = (n_left_px > 0) ? left_ref[0] : base - 1; + } + for (i = 0; i < txhpx; ++i) { + aom_memset16(dst, val, txwpx); + dst += dst_stride; } - highbd_dr_predictor(dst, dst_stride, tx_size, above_row, left_col, - upsample_above, upsample_left, p_angle, bit_depth); return; } - // predict + DECLARE_ALIGNED(16, uint16_t, left_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + DECLARE_ALIGNED(16, uint16_t, above_data[NUM_INTRA_NEIGHBOUR_PIXELS]); + uint16_t *const above_row = above_data + 16; + uint16_t *const left_col = left_data + 16; + + if (need_left) { + aom_memset16(left_data, base + 1, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_left_px > 0) { + for (i = 0; i < n_left_px; i++) left_col[i] = left_ref[i * ref_stride]; + if (i < txhpx) aom_memset16(&left_col[i], left_col[i - 1], txhpx - i); + } else if (n_top_px > 0) { + aom_memset16(left_col, above_ref[0], txhpx); + } + } + + if (need_above) { + aom_memset16(above_data, base - 1, NUM_INTRA_NEIGHBOUR_PIXELS); + if (n_top_px > 0) { + memcpy(above_row, above_ref, n_top_px * sizeof(above_ref[0])); + i = n_top_px; + if (i < txwpx) aom_memset16(&above_row[i], above_row[i - 1], (txwpx - i)); + } else if (n_left_px > 0) { + aom_memset16(above_row, left_ref[0], txwpx); + } + } + + if (need_above_left) { + if (n_top_px > 0 && n_left_px > 0) { + above_row[-1] = above_ref[-1]; + } else if (n_top_px > 0) { + above_row[-1] = above_ref[0]; + } else if (n_left_px > 0) { + above_row[-1] = left_ref[0]; + } else { + above_row[-1] = base; + } + left_col[-1] = above_row[-1]; + } + if (mode == DC_PRED) { dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size]( dst, dst_stride, above_row, left_col, bit_depth); @@ -1540,6 +1686,9 @@ const int txhpx = tx_size_high[tx_size]; const int x = col_off << MI_SIZE_LOG2; const int y = row_off << MI_SIZE_LOG2; + const int is_hbd = is_cur_buf_hbd(xd); + + assert(mode < INTRA_MODES); if (use_palette) { int r, c; @@ -1547,7 +1696,7 @@ xd->color_index_map_offset[plane != 0]; const uint16_t *const palette = mbmi->palette_mode_info.palette_colors + plane * PALETTE_MAX_SIZE; - if (is_cur_buf_hbd(xd)) { + if (is_hbd) { uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst); for (r = 0; r < txhpx; ++r) { for (c = 0; c < txwpx; ++c) { @@ -1566,16 +1715,12 @@ } const struct macroblockd_plane *const pd = &xd->plane[plane]; - const int txw = tx_size_wide_unit[tx_size]; - const int txh = tx_size_high_unit[tx_size]; const int ss_x = pd->subsampling_x; const int ss_y = pd->subsampling_y; const int have_top = row_off || (ss_y ? 
xd->chroma_up_available : xd->up_available); const int have_left = col_off || (ss_x ? xd->chroma_left_available : xd->left_available); - const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); - const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); // Distance between the right edge of this prediction block to // the frame right edge @@ -1583,6 +1728,36 @@ // Distance between the bottom edge of this prediction block to // the frame bottom edge const int yd = (xd->mb_to_bottom_edge >> (3 + ss_y)) + hpx - y - txhpx; + const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; + const int is_dr_mode = av1_is_directional_mode(mode); + + // The computations in this function, as well as in build_intra_predictors(), + // are generalized for all intra modes. Some of these operations are not + // required since non-directional intra modes (i.e., DC, SMOOTH, SMOOTH_H, + // SMOOTH_V, and PAETH) specifically require left and top neighbors. Hence, a + // separate function build_non_directional_intra_predictors() is introduced + // for these modes to avoid redundant computations while generating pred data. + + const int n_top_px = have_top ? AOMMIN(txwpx, xr + txwpx) : 0; + const int n_left_px = have_left ? AOMMIN(txhpx, yd + txhpx) : 0; + if (!use_filter_intra && !is_dr_mode) { +#if CONFIG_AV1_HIGHBITDEPTH + if (is_hbd) { + highbd_build_non_directional_intra_predictors( + ref, ref_stride, dst, dst_stride, mode, tx_size, n_top_px, n_left_px, + xd->bd); + return; + } +#endif // CONFIG_AV1_HIGHBITDEPTH + build_non_directional_intra_predictors(ref, ref_stride, dst, dst_stride, + mode, tx_size, n_top_px, n_left_px); + return; + } + + const int txw = tx_size_wide_unit[tx_size]; + const int txh = tx_size_high_unit[tx_size]; + const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2); + const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2); const int right_available = mi_col + ((col_off + txw) << ss_x) < xd->tile.mi_col_end; const int bottom_available = @@ -1596,8 +1771,6 @@ bsize = scale_chroma_bsize(bsize, ss_x, ss_y); } - const int is_dr_mode = av1_is_directional_mode(mode); - const int use_filter_intra = filter_intra_mode != FILTER_INTRA_MODES; int p_angle = 0; int need_top_right = extend_modes[mode] & NEED_ABOVERIGHT; int need_bottom_left = extend_modes[mode] & NEED_BOTTOMLEFT; @@ -1629,25 +1802,23 @@ const int disable_edge_filter = !enable_intra_edge_filter; const int intra_edge_filter_type = get_intra_edge_filter_type(xd, plane); + const int n_topright_px = + have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right; + const int n_bottomleft_px = + have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left; #if CONFIG_AV1_HIGHBITDEPTH - if (is_cur_buf_hbd(xd)) { - highbd_build_intra_predictors( + if (is_hbd) { + highbd_build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, - tx_size, disable_edge_filter, have_top ? AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, - intra_edge_filter_type, xd->bd); + tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, + n_bottomleft_px, intra_edge_filter_type, xd->bd); return; } #endif - build_intra_predictors( + build_directional_and_filter_intra_predictors( ref, ref_stride, dst, dst_stride, mode, p_angle, filter_intra_mode, - tx_size, disable_edge_filter, have_top ? 
AOMMIN(txwpx, xr + txwpx) : 0, - have_top_right > 0 ? AOMMIN(txwpx, xr) : have_top_right, - have_left ? AOMMIN(txhpx, yd + txhpx) : 0, - have_bottom_left > 0 ? AOMMIN(txhpx, yd) : have_bottom_left, - intra_edge_filter_type); + tx_size, disable_edge_filter, n_top_px, n_topright_px, n_left_px, + n_bottomleft_px, intra_edge_filter_type); } void av1_predict_intra_block_facade(const AV1_COMMON *cm, MACROBLOCKD *xd, diff -Nru aom-3.8.2/av1/common/resize.c aom-3.9.0/av1/common/resize.c --- aom-3.8.2/av1/common/resize.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/resize.c 2024-05-07 19:57:02.837000000 +0000 @@ -316,91 +316,6 @@ } } -static void interpolate_core_double_prec(const double *const input, - int in_length, double *output, - int out_length, - const int16_t *interp_filters, - int interp_taps) { - const int32_t delta = - (((uint32_t)in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / - out_length; - const int32_t offset = - in_length > out_length - ? (((int32_t)(in_length - out_length) << (RS_SCALE_SUBPEL_BITS - 1)) + - out_length / 2) / - out_length - : -(((int32_t)(out_length - in_length) - << (RS_SCALE_SUBPEL_BITS - 1)) + - out_length / 2) / - out_length; - double *optr = output; - int x, x1, x2, k, int_pel, sub_pel; - double sum; - int32_t y; - - x = 0; - y = offset + RS_SCALE_EXTRA_OFF; - while ((y >> RS_SCALE_SUBPEL_BITS) < (interp_taps / 2 - 1)) { - x++; - y += delta; - } - x1 = x; - x = out_length - 1; - y = delta * x + offset + RS_SCALE_EXTRA_OFF; - while ((y >> RS_SCALE_SUBPEL_BITS) + (int32_t)(interp_taps / 2) >= - in_length) { - x--; - y -= delta; - } - x2 = x; - if (x1 > x2) { - for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < out_length; - ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) { - const int pk = int_pel - interp_taps / 2 + 1 + k; - sum += filter[k] * input[AOMMAX(AOMMIN(pk, in_length - 1), 0)]; - } - *optr++ = sum / (1 << FILTER_BITS); - } - } else { - // Initial part. - for (x = 0, y = offset + RS_SCALE_EXTRA_OFF; x < x1; ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) - sum += filter[k] * input[AOMMAX(int_pel - interp_taps / 2 + 1 + k, 0)]; - *optr++ = sum / (1 << FILTER_BITS); - } - // Middle part. - for (; x <= x2; ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) - sum += filter[k] * input[int_pel - interp_taps / 2 + 1 + k]; - *optr++ = sum / (1 << FILTER_BITS); - } - // End part. 
- for (; x < out_length; ++x, y += delta) { - int_pel = y >> RS_SCALE_SUBPEL_BITS; - sub_pel = (y >> RS_SCALE_EXTRA_BITS) & RS_SUBPEL_MASK; - const int16_t *filter = &interp_filters[sub_pel * interp_taps]; - sum = 0; - for (k = 0; k < interp_taps; ++k) - sum += filter[k] * - input[AOMMIN(int_pel - interp_taps / 2 + 1 + k, in_length - 1)]; - *optr++ = sum / (1 << FILTER_BITS); - } - } -} - static void interpolate(const uint8_t *const input, int in_length, uint8_t *output, int out_length) { const InterpKernel *interp_filters = @@ -410,15 +325,6 @@ SUBPEL_TAPS); } -static void interpolate_double_prec(const double *const input, int in_length, - double *output, int out_length) { - const InterpKernel *interp_filters = - choose_interp_filter(in_length, out_length); - - interpolate_core_double_prec(input, in_length, output, out_length, - &interp_filters[0][0], SUBPEL_TAPS); -} - int32_t av1_get_upscale_convolve_step(int in_length, int out_length) { return ((in_length << RS_SCALE_SUBPEL_BITS) + out_length / 2) / out_length; } @@ -600,12 +506,6 @@ } } -static void upscale_multistep_double_prec(const double *const input, int length, - double *output, int olength) { - assert(length < olength); - interpolate_double_prec(input, length, output, olength); -} - static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { int i; uint8_t *iptr = img; @@ -624,27 +524,7 @@ } } -static void fill_col_to_arr_double_prec(double *img, int stride, int len, - double *arr) { - int i; - double *iptr = img; - double *aptr = arr; - for (i = 0; i < len; ++i, iptr += stride) { - *aptr++ = *iptr; - } -} - -static void fill_arr_to_col_double_prec(double *img, int stride, int len, - double *arr) { - int i; - double *iptr = img; - double *aptr = arr; - for (i = 0; i < len; ++i, iptr += stride) { - *iptr = *aptr++; - } -} - -bool av1_resize_plane(const uint8_t *const input, int height, int width, +bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride) { int i; @@ -679,38 +559,6 @@ return mem_status; } -bool av1_upscale_plane_double_prec(const double *const input, int height, - int width, int in_stride, double *output, - int height2, int width2, int out_stride) { - int i; - bool mem_status = true; - double *intbuf = (double *)aom_malloc(sizeof(double) * width2 * height); - double *arrbuf = (double *)aom_malloc(sizeof(double) * height); - double *arrbuf2 = (double *)aom_malloc(sizeof(double) * height2); - if (intbuf == NULL || arrbuf == NULL || arrbuf2 == NULL) { - mem_status = false; - goto Error; - } - assert(width > 0); - assert(height > 0); - assert(width2 > 0); - assert(height2 > 0); - for (i = 0; i < height; ++i) - upscale_multistep_double_prec(input + in_stride * i, width, - intbuf + width2 * i, width2); - for (i = 0; i < width2; ++i) { - fill_col_to_arr_double_prec(intbuf + i, width2, height, arrbuf); - upscale_multistep_double_prec(arrbuf, height, arrbuf2, height2); - fill_arr_to_col_double_prec(output + i, out_stride, height2, arrbuf2); - } - -Error: - aom_free(intbuf); - aom_free(arrbuf); - aom_free(arrbuf2); - return mem_status; -} - static bool upscale_normative_rect(const uint8_t *const input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, @@ -1033,7 +881,7 @@ } } -void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, +void av1_highbd_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int 
width2, int out_stride, int bd) { int i; @@ -1132,10 +980,9 @@ } #endif // CONFIG_AV1_HIGHBITDEPTH -void av1_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +void av1_resize_frame420(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride)) @@ -1148,10 +995,9 @@ abort(); } -bool av1_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame422(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride)) @@ -1165,10 +1011,9 @@ return true; } -bool av1_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame444(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth) { if (!av1_resize_plane(y, height, width, y_stride, oy, oheight, owidth, oy_stride)) @@ -1183,8 +1028,8 @@ } #if CONFIG_AV1_HIGHBITDEPTH -void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame420(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -1197,8 +1042,8 @@ owidth / 2, ouv_stride, bd); } -void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame422(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -1211,8 +1056,8 @@ owidth / 2, ouv_stride, bd); } -void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame444(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -1247,9 +1092,11 @@ uint8_t *dst_buffer = dst->buffers[i]; const int dst_stride = dst->strides[is_uv]; for (int y = 0; y < dst_h; y += 16) { - const int y_q4 = y * 16 * src_h / dst_h + phase_scaler; + const int y_q4 = + src_h == dst_h ? 0 : y * 16 * src_h / dst_h + phase_scaler; for (int x = 0; x < dst_w; x += 16) { - const int x_q4 = x * 16 * src_w / dst_w + phase_scaler; + const int x_q4 = + src_w == dst_w ? 
0 : x * 16 * src_w / dst_w + phase_scaler; const uint8_t *src_ptr = src_buffer + y * src_h / dst_h * src_stride + x * src_w / dst_w; uint8_t *dst_ptr = dst_buffer + y * dst_stride + x; @@ -1276,7 +1123,7 @@ bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, - const int num_planes) { + int num_planes) { // TODO(dkovalev): replace YV12_BUFFER_CONFIG with aom_image_t // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet @@ -1396,8 +1243,7 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, const InterpFilter filter, const int phase, const bool use_optimized_scaler, - const bool for_psnr, const int border_in_pixels, - const int num_pyramid_levels) { + const bool for_psnr, const int border_in_pixels, const bool alloc_pyramid) { // If scaling is performed for the sole purpose of calculating PSNR, then our // target dimensions are superres upscaled width/height. Otherwise our target // dimensions are coded width/height. @@ -1417,7 +1263,7 @@ scaled, scaled_width, scaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, border_in_pixels, cm->features.byte_alignment, NULL, NULL, NULL, - num_pyramid_levels, 0)) + alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled buffer"); @@ -1513,7 +1359,7 @@ // TODO(afergs): aom_ vs av1_ functions? Which can I use? // Upscale decoded image. void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, - int num_pyramid_levels) { + bool alloc_pyramid) { const int num_planes = av1_num_planes(cm); if (!av1_superres_scaled(cm)) return; const SequenceHeader *const seq_params = cm->seq_params; @@ -1528,7 +1374,7 @@ if (aom_alloc_frame_buffer( ©_buffer, aligned_width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, byte_alignment, 0, 0)) + AOM_BORDER_IN_PIXELS, byte_alignment, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate copy buffer for superres upscaling"); @@ -1561,7 +1407,7 @@ cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, byte_alignment, fb, cb, cb_priv, - num_pyramid_levels, 0)) { + alloc_pyramid, 0)) { unlock_buffer_pool(pool); aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, @@ -1578,7 +1424,7 @@ frame_to_show, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - AOM_BORDER_IN_PIXELS, byte_alignment, num_pyramid_levels, 0)) + AOM_BORDER_IN_PIXELS, byte_alignment, alloc_pyramid, 0)) aom_internal_error( cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate current frame buffer for superres upscaling"); diff -Nru aom-3.8.2/av1/common/resize.h aom-3.9.0/av1/common/resize.h --- aom-3.8.2/av1/common/resize.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/resize.h 2024-05-07 19:57:02.840000000 +0000 @@ -20,47 +20,41 @@ extern "C" { #endif -bool av1_resize_plane(const uint8_t *const input, int height, int width, +bool av1_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride); -bool av1_upscale_plane_double_prec(const double *const input, int height, - int width, int in_stride, double *output, - int height2, int width2, int out_stride); // 
TODO(aomedia:3228): In libaom 4.0.0, remove av1_resize_frame420 from // av1/exports_com and delete this function. -void av1_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +void av1_resize_frame420(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -bool av1_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame422(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -bool av1_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, - int uv_stride, int height, int width, uint8_t *oy, - int oy_stride, uint8_t *ou, uint8_t *ov, +bool av1_resize_frame444(const uint8_t *y, int y_stride, const uint8_t *u, + const uint8_t *v, int uv_stride, int height, int width, + uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth); -void av1_highbd_resize_plane(const uint8_t *const input, int height, int width, +void av1_highbd_resize_plane(const uint8_t *input, int height, int width, int in_stride, uint8_t *output, int height2, int width2, int out_stride, int bd); -void av1_highbd_resize_frame420(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame420(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd); -void av1_highbd_resize_frame422(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame422(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, int owidth, int bd); -void av1_highbd_resize_frame444(const uint8_t *const y, int y_stride, - const uint8_t *const u, const uint8_t *const v, +void av1_highbd_resize_frame444(const uint8_t *y, int y_stride, + const uint8_t *u, const uint8_t *v, int uv_stride, int height, int width, uint8_t *oy, int oy_stride, uint8_t *ou, uint8_t *ov, int ouv_stride, int oheight, @@ -76,12 +70,11 @@ YV12_BUFFER_CONFIG *av1_realloc_and_scale_if_required( AV1_COMMON *cm, YV12_BUFFER_CONFIG *unscaled, YV12_BUFFER_CONFIG *scaled, const InterpFilter filter, const int phase, const bool use_optimized_scaler, - const bool for_psnr, const int border_in_pixels, - const int num_pyramid_levels); + const bool for_psnr, const int border_in_pixels, const bool alloc_pyramid); bool av1_resize_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int bd, - const int num_planes); + int num_planes); // Calculates the scaled dimensions from the given original dimensions and the // resize scale denominator. 
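Editor's note on the av1_resize_and_extend_frame hunk in resize.c above: the Q4 (1/16-pel) sampling position is now forced to zero whenever a plane is not actually being scaled, so phase_scaler no longer pushes an unscaled plane through a nonzero-phase filter. A small sketch of that guard using the same formula; scaled_pos_q4() is a hypothetical name used only for illustration:

// Source position for destination coordinate x, in Q4 (1/16-pel) units.
static int scaled_pos_q4(int x, int src_len, int dst_len, int phase_scaler) {
  // No scaling: drop the phase offset so the 1:1 copy is not phase-shifted,
  // mirroring the guard added in the hunk above.
  if (src_len == dst_len) return 0;
  // Otherwise map x onto the source grid in 1/16-pel steps and add the phase.
  return x * 16 * src_len / dst_len + phase_scaler;
}
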
@@ -98,7 +91,7 @@ void av1_calculate_unscaled_superres_size(int *width, int *height, int denom); void av1_superres_upscale(AV1_COMMON *cm, BufferPool *const pool, - int num_pyramid_levels); + bool alloc_pyramid); // Returns 1 if a superres upscaled frame is scaled and 0 otherwise. static INLINE int av1_superres_scaled(const AV1_COMMON *cm) { diff -Nru aom-3.8.2/av1/common/restoration.c aom-3.9.0/av1/common/restoration.c --- aom-3.8.2/av1/common/restoration.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/restoration.c 2024-05-07 19:57:02.842000000 +0000 @@ -11,20 +11,24 @@ */ #include +#include #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" +#include "aom/internal/aom_codec_internal.h" #include "aom_mem/aom_mem.h" +#include "aom_dsp/aom_dsp_common.h" +#include "aom_mem/aom_mem.h" +#include "aom_ports/mem.h" +#include "aom_util/aom_pthread.h" + #include "av1/common/av1_common_int.h" +#include "av1/common/convolve.h" +#include "av1/common/enums.h" #include "av1/common/resize.h" #include "av1/common/restoration.h" #include "av1/common/thread_common.h" -#include "aom_dsp/aom_dsp_common.h" -#include "aom_mem/aom_mem.h" - -#include "aom_ports/mem.h" // The 's' values are calculated based on original 'r' and 'e' values in the // spec using GenSgrprojVtable(). @@ -90,7 +94,7 @@ // Index 1 corresponds to r[1], e[1] int sgrproj_mtable[SGRPROJ_PARAMS][2]; -static void GenSgrprojVtable() { +static void GenSgrprojVtable(void) { for (int i = 0; i < SGRPROJ_PARAMS; ++i) { const sgr_params_type *const params = &av1_sgr_params[i]; for (int j = 0; j < 2; ++j) { @@ -109,14 +113,15 @@ } #endif -void av1_loop_restoration_precal() { +void av1_loop_restoration_precal(void) { #if 0 GenSgrprojVtable(); #endif } -static void extend_frame_lowbd(uint8_t *data, int width, int height, int stride, - int border_horz, int border_vert) { +static void extend_frame_lowbd(uint8_t *data, int width, int height, + ptrdiff_t stride, int border_horz, + int border_vert) { uint8_t *data_p; int i; for (i = 0; i < height; ++i) { @@ -136,7 +141,8 @@ #if CONFIG_AV1_HIGHBITDEPTH static void extend_frame_highbd(uint16_t *data, int width, int height, - int stride, int border_horz, int border_vert) { + ptrdiff_t stride, int border_horz, + int border_vert) { uint16_t *data_p; int i, j; for (i = 0; i < height; ++i) { @@ -988,8 +994,10 @@ int unit_h = limits->v_end - limits->v_start; int unit_w = limits->h_end - limits->h_start; - uint8_t *data8_tl = data8 + limits->v_start * stride + limits->h_start; - uint8_t *dst8_tl = dst8 + limits->v_start * dst_stride + limits->h_start; + uint8_t *data8_tl = + data8 + limits->v_start * (ptrdiff_t)stride + limits->h_start; + uint8_t *dst8_tl = + dst8 + limits->v_start * (ptrdiff_t)dst_stride + limits->h_start; if (unit_rtype == RESTORE_NONE) { copy_rest_unit(unit_w, unit_h, data8_tl, stride, dst8_tl, dst_stride, @@ -1074,7 +1082,8 @@ if (aom_realloc_frame_buffer( lr_ctxt->dst, frame_width, frame_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0) != AOM_CODEC_OK) + cm->features.byte_alignment, NULL, NULL, NULL, false, + 0) != AOM_CODEC_OK) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate restoration dst buffer"); @@ -1349,7 +1358,7 @@ const int is_uv = plane > 0; const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); const int src_stride = frame->strides[is_uv] << use_highbd; - const uint8_t *src_rows 
= src_buf + row * src_stride; + const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride; uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below; @@ -1404,7 +1413,7 @@ const int is_uv = plane > 0; const uint8_t *src_buf = REAL_PTR(use_highbd, frame->buffers[plane]); const int src_stride = frame->strides[is_uv] << use_highbd; - const uint8_t *src_rows = src_buf + row * src_stride; + const uint8_t *src_rows = src_buf + row * (ptrdiff_t)src_stride; uint8_t *bdry_buf = is_above ? boundaries->stripe_boundary_above : boundaries->stripe_boundary_below; diff -Nru aom-3.8.2/av1/common/restoration.h aom-3.9.0/av1/common/restoration.h --- aom-3.8.2/av1/common/restoration.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/restoration.h 2024-05-07 19:57:02.845000000 +0000 @@ -410,7 +410,7 @@ void *lr_ctxt); /*!\cond */ -void av1_loop_restoration_precal(); +void av1_loop_restoration_precal(void); struct AV1LrSyncData; diff -Nru aom-3.8.2/av1/common/thread_common.c aom-3.9.0/av1/common/thread_common.c --- aom-3.8.2/av1/common/thread_common.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/thread_common.c 2024-05-07 19:57:02.866000000 +0000 @@ -14,12 +14,19 @@ #include "config/aom_scale_rtcd.h" #include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" +#include "aom_util/aom_pthread.h" +#include "aom_util/aom_thread.h" #include "av1/common/av1_loopfilter.h" +#include "av1/common/blockd.h" +#include "av1/common/cdef.h" #include "av1/common/entropymode.h" +#include "av1/common/enums.h" #include "av1/common/thread_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" +#include "av1/common/restoration.h" // Set up nsync by width. static INLINE int get_sync_range(int width) { @@ -57,7 +64,6 @@ void av1_loop_filter_alloc(AV1LfSync *lf_sync, AV1_COMMON *cm, int rows, int width, int num_workers) { lf_sync->rows = rows; - lf_sync->lf_mt_exit = false; #if CONFIG_MULTITHREAD { int i, j; @@ -234,7 +240,12 @@ if (sig) { pthread_mutex_lock(&lf_sync->mutex_[plane][r]); - lf_sync->cur_sb_col[plane][r] = cur; + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. + lf_sync->cur_sb_col[plane][r] = AOMMAX(lf_sync->cur_sb_col[plane][r], cur); pthread_cond_broadcast(&lf_sync->cond_[plane][r]); pthread_mutex_unlock(&lf_sync->mutex_[plane][r]); @@ -373,9 +384,7 @@ error_info = ((LFWorkerData *)worker->data2)->error_info; } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); } // Row-based multi-threaded loopfilter hook @@ -551,7 +560,13 @@ if (sig) { pthread_mutex_lock(&loop_res_sync->mutex_[plane][r]); - loop_res_sync->cur_sb_col[plane][r] = cur; + // When a thread encounters an error, cur_sb_col[plane][r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // cur_sb_col[plane][r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
+ loop_res_sync->cur_sb_col[plane][r] = + AOMMAX(loop_res_sync->cur_sb_col[plane][r], cur); pthread_cond_broadcast(&loop_res_sync->cond_[plane][r]); pthread_mutex_unlock(&loop_res_sync->mutex_[plane][r]); @@ -601,7 +616,8 @@ } #endif // CONFIG_MULTITHREAD CHECK_MEM_ERROR(cm, lr_sync->lrworkerdata, - aom_malloc(num_workers * sizeof(*(lr_sync->lrworkerdata)))); + aom_calloc(num_workers, sizeof(*(lr_sync->lrworkerdata)))); + lr_sync->num_workers = num_workers; for (int worker_idx = 0; worker_idx < num_workers; ++worker_idx) { if (worker_idx < num_workers - 1) { @@ -616,9 +632,6 @@ } } - lr_sync->num_workers = num_workers; - lr_sync->lr_mt_exit = false; - for (int j = 0; j < num_planes; j++) { CHECK_MEM_ERROR( cm, lr_sync->cur_sb_col[j], @@ -898,9 +911,7 @@ error_info = ((LRWorkerData *)worker->data2)->error_info; } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); } static void foreach_rest_unit_in_planes_mt(AV1LrStruct *lr_ctxt, @@ -932,6 +943,7 @@ av1_loop_restoration_alloc(lr_sync, cm, num_workers, num_rows_lr, num_planes, cm->width); } + lr_sync->lr_mt_exit = false; // Initialize cur_sb_col to -1 for all SB rows. for (i = 0; i < num_planes; i++) { @@ -985,6 +997,7 @@ cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; } static AOM_INLINE void launch_cdef_workers(AVxWorker *const workers, @@ -1021,9 +1034,7 @@ error_info = ((AV1CdefWorkerData *)worker->data2)->error_info; } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); } // Updates the row index of the next job to be processed. diff -Nru aom-3.8.2/av1/common/thread_common.h aom-3.9.0/av1/common/thread_common.h --- aom-3.8.2/av1/common/thread_common.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/thread_common.h 2024-05-07 19:57:02.868000000 +0000 @@ -16,6 +16,7 @@ #include "av1/common/av1_loopfilter.h" #include "av1/common/cdef.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #ifdef __cplusplus @@ -269,6 +270,7 @@ av1_loop_filter_dealloc(lf_sync); av1_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } + lf_sync->lf_mt_exit = false; // Initialize cur_sb_col to -1 for all SB rows. for (int i = 0; i < MAX_MB_PLANE; i++) { @@ -314,15 +316,21 @@ } } -static AOM_INLINE int check_planes_to_loop_filter(const struct loopfilter *lf, - int *planes_to_lf, - int plane_start, - int plane_end) { +static AOM_INLINE void set_planes_to_loop_filter(const struct loopfilter *lf, + int planes_to_lf[MAX_MB_PLANE], + int plane_start, + int plane_end) { // For each luma and chroma plane, whether to filter it or not. planes_to_lf[0] = (lf->filter_level[0] || lf->filter_level[1]) && plane_start <= 0 && 0 < plane_end; planes_to_lf[1] = lf->filter_level_u && plane_start <= 1 && 1 < plane_end; planes_to_lf[2] = lf->filter_level_v && plane_start <= 2 && 2 < plane_end; +} + +static AOM_INLINE int check_planes_to_loop_filter( + const struct loopfilter *lf, int planes_to_lf[MAX_MB_PLANE], + int plane_start, int plane_end) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); // If the luma plane is purposely not filtered, neither are the chroma // planes. 
if (!planes_to_lf[0] && plane_start <= 0 && 0 < plane_end) return 0; diff -Nru aom-3.8.2/av1/common/tile_common.c aom-3.9.0/av1/common/tile_common.c --- aom-3.8.2/av1/common/tile_common.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/tile_common.c 2024-05-07 19:57:02.869000000 +0000 @@ -177,46 +177,16 @@ cm->seq_params->mib_size_log2); } -PixelRect av1_get_tile_rect(const TileInfo *tile_info, const AV1_COMMON *cm, - int is_uv) { - PixelRect r; - - // Calculate position in the Y plane - r.left = tile_info->mi_col_start * MI_SIZE; - r.right = tile_info->mi_col_end * MI_SIZE; - r.top = tile_info->mi_row_start * MI_SIZE; - r.bottom = tile_info->mi_row_end * MI_SIZE; - - // If upscaling is enabled, the tile limits need scaling to match the - // upscaled frame where the restoration units live. To do this, scale up the - // top-left and bottom-right of the tile. - if (av1_superres_scaled(cm)) { - av1_calculate_unscaled_superres_size(&r.left, &r.top, - cm->superres_scale_denominator); - av1_calculate_unscaled_superres_size(&r.right, &r.bottom, - cm->superres_scale_denominator); - } - - const int frame_w = cm->superres_upscaled_width; - const int frame_h = cm->superres_upscaled_height; - - // Make sure we don't fall off the bottom-right of the frame. - r.right = AOMMIN(r.right, frame_w); - r.bottom = AOMMIN(r.bottom, frame_h); - - // Convert to coordinates in the appropriate plane - const int ss_x = is_uv && cm->seq_params->subsampling_x; - const int ss_y = is_uv && cm->seq_params->subsampling_y; - - r.left = ROUND_POWER_OF_TWO(r.left, ss_x); - r.right = ROUND_POWER_OF_TWO(r.right, ss_x); - r.top = ROUND_POWER_OF_TWO(r.top, ss_y); - r.bottom = ROUND_POWER_OF_TWO(r.bottom, ss_y); - - return r; -} - -void av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { +// Section 7.3.1 of the AV1 spec says, on pages 200-201: +// It is a requirement of bitstream conformance that the following conditions +// are met: +// ... +// * TileHeight is equal to (use_128x128_superblock ? 128 : 64) for all +// tiles (i.e. the tile is exactly one superblock high) +// * TileWidth is identical for all tiles and is an integer multiple of +// TileHeight (i.e. the tile is an integer number of superblocks wide) +// ... 
+bool av1_get_uniform_tile_size(const AV1_COMMON *cm, int *w, int *h) { const CommonTileParams *const tiles = &cm->tiles; if (tiles->uniform_spacing) { *w = tiles->width; @@ -226,7 +196,10 @@ const int tile_width_sb = tiles->col_start_sb[i + 1] - tiles->col_start_sb[i]; const int tile_w = tile_width_sb * cm->seq_params->mib_size; - assert(i == 0 || tile_w == *w); // ensure all tiles have same dimension + // ensure all tiles have same dimension + if (i != 0 && tile_w != *w) { + return false; + } *w = tile_w; } @@ -234,10 +207,14 @@ const int tile_height_sb = tiles->row_start_sb[i + 1] - tiles->row_start_sb[i]; const int tile_h = tile_height_sb * cm->seq_params->mib_size; - assert(i == 0 || tile_h == *h); // ensure all tiles have same dimension + // ensure all tiles have same dimension + if (i != 0 && tile_h != *h) { + return false; + } *h = tile_h; } } + return true; } int av1_is_min_tile_width_satisfied(const AV1_COMMON *cm) { diff -Nru aom-3.8.2/av1/common/tile_common.h aom-3.9.0/av1/common/tile_common.h --- aom-3.8.2/av1/common/tile_common.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/tile_common.h 2024-05-07 19:57:02.869000000 +0000 @@ -12,13 +12,14 @@ #ifndef AOM_AV1_COMMON_TILE_COMMON_H_ #define AOM_AV1_COMMON_TILE_COMMON_H_ +#include + +#include "config/aom_config.h" + #ifdef __cplusplus extern "C" { #endif -#include "config/aom_config.h" -#include "aom_dsp/rect.h" - struct AV1Common; struct SequenceHeader; struct CommonTileParams; @@ -43,10 +44,6 @@ int av1_get_sb_rows_in_tile(const struct AV1Common *cm, const TileInfo *tile); int av1_get_sb_cols_in_tile(const struct AV1Common *cm, const TileInfo *tile); -// Return the pixel extents of the given tile -PixelRect av1_get_tile_rect(const TileInfo *tile_info, - const struct AV1Common *cm, int is_uv); - // Define tile maximum width and area // There is no maximum height since height is limited by area and width limits // The minimum tile width or height is fixed at one superblock @@ -56,7 +53,9 @@ #define MAX_TILE_AREA_LEVEL_7_AND_ABOVE (4096 * 4608) #endif -void av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); +// Gets the width and height (in units of MI_SIZE) of the tiles in a tile list. +// Returns true on success, false on failure. +bool av1_get_uniform_tile_size(const struct AV1Common *cm, int *w, int *h); void av1_get_tile_limits(struct AV1Common *const cm); void av1_calculate_tile_cols(const struct SequenceHeader *const seq_params, int cm_mi_rows, int cm_mi_cols, diff -Nru aom-3.8.2/av1/common/warped_motion.c aom-3.9.0/av1/common/warped_motion.c --- aom-3.8.2/av1/common/warped_motion.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/warped_motion.c 2024-05-07 19:57:02.884000000 +0000 @@ -291,9 +291,7 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { int32_t tmp[15 * 8]; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; @@ -306,6 +304,10 @@ (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. 
+ assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + for (int i = p_row; i < p_row + p_height; i += 8) { for (int j = p_col; j < p_col + p_width; j += 8) { // Calculate the center of this 8x8 block, diff -Nru aom-3.8.2/av1/common/x86/av1_convolve_scale_sse4.c aom-3.9.0/av1/common/x86/av1_convolve_scale_sse4.c --- aom-3.8.2/av1/common/x86/av1_convolve_scale_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/av1_convolve_scale_sse4.c 2024-05-07 19:57:02.887000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff -Nru aom-3.8.2/av1/common/x86/cdef_block_sse2.c aom-3.9.0/av1/common/x86/cdef_block_sse2.c --- aom-3.8.2/av1/common/x86/cdef_block_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/cdef_block_sse2.c 1970-01-01 00:00:00.000000000 +0000 @@ -1,40 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#include "aom_dsp/aom_simd.h" -#define SIMD_FUNC(name) name##_sse2 -#include "av1/common/cdef_block_simd.h" - -void cdef_find_dir_dual_sse2(const uint16_t *img1, const uint16_t *img2, - int stride, int32_t *var_out_1st, - int32_t *var_out_2nd, int coeff_shift, - int *out_dir_1st_8x8, int *out_dir_2nd_8x8) { - // Process first 8x8. - *out_dir_1st_8x8 = cdef_find_dir(img1, stride, var_out_1st, coeff_shift); - - // Process second 8x8. - *out_dir_2nd_8x8 = cdef_find_dir(img2, stride, var_out_2nd, coeff_shift); -} - -void cdef_copy_rect8_8bit_to_16bit_sse2(uint16_t *dst, int dstride, - const uint8_t *src, int sstride, - int width, int height) { - int j = 0; - for (int i = 0; i < height; i++) { - for (j = 0; j < (width & ~0x7); j += 8) { - v64 row = v64_load_unaligned(&src[i * sstride + j]); - v128_store_unaligned(&dst[i * dstride + j], v128_unpack_u8_s16(row)); - } - for (; j < width; j++) { - dst[i * dstride + j] = src[i * sstride + j]; - } - } -} diff -Nru aom-3.8.2/av1/common/x86/cdef_block_ssse3.c aom-3.9.0/av1/common/x86/cdef_block_ssse3.c --- aom-3.8.2/av1/common/x86/cdef_block_ssse3.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/cdef_block_ssse3.c 2024-05-07 19:57:02.900000000 +0000 @@ -9,6 +9,17 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +// Include SSSE3 CDEF code only for 32-bit x86, to support Valgrind. +// For normal use, we require SSE4.1, so cdef_*_sse4_1 will be used instead of +// these functions. 
However, 32-bit Valgrind does not support SSE4.1, so we +// include a fallback to SSSE3 to improve performance + +#include "config/aom_config.h" + +#if !AOM_ARCH_X86 +#error "cdef_block_ssse3.c is included for compatibility with 32-bit x86 only" +#endif // !AOM_ARCH_X86 + #include "aom_dsp/aom_simd.h" #define SIMD_FUNC(name) name##_ssse3 #include "av1/common/cdef_block_simd.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_convolve_2d_avx2.c aom-3.9.0/av1/common/x86/highbd_convolve_2d_avx2.c --- aom-3.8.2/av1/common/x86/highbd_convolve_2d_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_convolve_2d_avx2.c 2024-05-07 19:57:02.906000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/synonyms.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_convolve_2d_sse4.c aom-3.9.0/av1/common/x86/highbd_convolve_2d_sse4.c --- aom-3.8.2/av1/common/x86/highbd_convolve_2d_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_convolve_2d_sse4.c 2024-05-07 19:57:02.908000000 +0000 @@ -13,7 +13,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_convolve_2d_ssse3.c aom-3.9.0/av1/common/x86/highbd_convolve_2d_ssse3.c --- aom-3.8.2/av1/common/x86/highbd_convolve_2d_ssse3.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_convolve_2d_ssse3.c 2024-05-07 19:57:02.908000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_jnt_convolve_avx2.c aom-3.9.0/av1/common/x86/highbd_jnt_convolve_avx2.c --- aom-3.8.2/av1/common/x86/highbd_jnt_convolve_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_jnt_convolve_avx2.c 2024-05-07 19:57:02.922000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_avx2.h" #include "aom_dsp/x86/convolve_common_intrin.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_jnt_convolve_sse4.c aom-3.9.0/av1/common/x86/highbd_jnt_convolve_sse4.c --- aom-3.8.2/av1/common/x86/highbd_jnt_convolve_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_jnt_convolve_sse4.c 2024-05-07 19:57:02.923000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/convolve_sse2.h" #include "aom_dsp/x86/convolve_sse4_1.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_warp_affine_avx2.c aom-3.9.0/av1/common/x86/highbd_warp_affine_avx2.c --- aom-3.8.2/av1/common/x86/highbd_warp_affine_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_warp_affine_avx2.c 2024-05-07 19:57:02.926000000 +0000 @@ -22,9 +22,7 @@ ConvolveParams *conv_params, int16_t alpha, int16_t beta, int16_t gamma, int16_t delta) { __m256i tmp[15]; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? 
conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; @@ -37,6 +35,10 @@ (void)max_bits_horiz; assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL)); + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + const __m256i clip_pixel = _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert); diff -Nru aom-3.8.2/av1/common/x86/highbd_warp_plane_sse4.c aom-3.9.0/av1/common/x86/highbd_warp_plane_sse4.c --- aom-3.8.2/av1/common/x86/highbd_warp_plane_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_warp_plane_sse4.c 2024-05-07 19:57:02.927000000 +0000 @@ -302,9 +302,7 @@ int16_t beta, int16_t gamma, int16_t delta) { __m128i tmp[15]; int i, j, k; - const int reduce_bits_horiz = - conv_params->round_0 + - AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0); + const int reduce_bits_horiz = conv_params->round_0; const int reduce_bits_vert = conv_params->is_compound ? conv_params->round_1 : 2 * FILTER_BITS - reduce_bits_horiz; @@ -313,6 +311,10 @@ assert(!(bd == 12 && reduce_bits_horiz < 5)); assert(IMPLIES(conv_params->do_average, conv_params->is_compound)); + // Check that, even with 12-bit input, the intermediate values will fit + // into an unsigned 16-bit intermediate array. + assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16); + const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz; const __m128i clip_pixel = _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255)); diff -Nru aom-3.8.2/av1/common/x86/highbd_wiener_convolve_avx2.c aom-3.9.0/av1/common/x86/highbd_wiener_convolve_avx2.c --- aom-3.8.2/av1/common/x86/highbd_wiener_convolve_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_wiener_convolve_avx2.c 2024-05-07 19:57:02.928000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" diff -Nru aom-3.8.2/av1/common/x86/highbd_wiener_convolve_ssse3.c aom-3.9.0/av1/common/x86/highbd_wiener_convolve_ssse3.c --- aom-3.8.2/av1/common/x86/highbd_wiener_convolve_ssse3.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/highbd_wiener_convolve_ssse3.c 2024-05-07 19:57:02.929000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "av1/common/convolve.h" #include "aom_dsp/aom_dsp_common.h" diff -Nru aom-3.8.2/av1/common/x86/jnt_convolve_avx2.c aom-3.9.0/av1/common/x86/jnt_convolve_avx2.c --- aom-3.8.2/av1/common/x86/jnt_convolve_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/jnt_convolve_avx2.c 2024-05-07 19:57:02.929000000 +0000 @@ -12,7 +12,7 @@ #include #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/aom_filter.h" diff -Nru aom-3.8.2/av1/common/x86/jnt_convolve_sse2.c aom-3.9.0/av1/common/x86/jnt_convolve_sse2.c --- aom-3.8.2/av1/common/x86/jnt_convolve_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/jnt_convolve_sse2.c 2024-05-07 19:57:02.931000000 +0000 @@ -11,7 +11,7 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" @@ -375,232 +375,3 @@ } while (j < w); } } - -void 
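Aside: the assert added to the high-bitdepth warp kernels above encodes the worst-case width of the horizontally filtered intermediates. A worked instance with the constants that actually occur here (FILTER_BITS is 7 in libaom, and the neighbouring assert(!(bd == 12 && reduce_bits_horiz < 5)) guarantees round_0 >= 5 for 12-bit input), written as a tiny self-contained check:

#include <assert.h>
#include <stdio.h>

/* Worked instance of: bd + FILTER_BITS + 2 - round_0 <= 16, i.e. the
 * intermediate values fit in a uint16_t array even for 12-bit input.
 * round_0 = 5 is the minimum the surrounding asserts allow for bd == 12. */
int main(void) {
  const int FILTER_BITS = 7;
  const int bd = 12;
  const int round_0 = 5;
  const int intermediate_bits = bd + FILTER_BITS + 2 - round_0;  /* = 16 */
  assert(intermediate_bits <= 16);
  printf("intermediate width: %d bits\n", intermediate_bits);
  return 0;
}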
av1_dist_wtd_convolve_2d_sse2(const uint8_t *src, int src_stride, - uint8_t *dst0, int dst_stride0, int w, int h, - const InterpFilterParams *filter_params_x, - const InterpFilterParams *filter_params_y, - const int subpel_x_qn, const int subpel_y_qn, - ConvolveParams *conv_params) { - CONV_BUF_TYPE *dst = conv_params->dst; - int dst_stride = conv_params->dst_stride; - const int bd = 8; - - DECLARE_ALIGNED(16, int16_t, - im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]); - int im_h = h + filter_params_y->taps - 1; - int im_stride = MAX_SB_SIZE; - int i, j; - const int fo_vert = filter_params_y->taps / 2 - 1; - const int fo_horiz = filter_params_x->taps / 2 - 1; - const int do_average = conv_params->do_average; - const int use_dist_wtd_comp_avg = conv_params->use_dist_wtd_comp_avg; - const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz; - - const __m128i zero = _mm_setzero_si128(); - - const int w0 = conv_params->fwd_offset; - const int w1 = conv_params->bck_offset; - const __m128i wt0 = _mm_set1_epi16(w0); - const __m128i wt1 = _mm_set1_epi16(w1); - const __m128i wt = _mm_unpacklo_epi16(wt0, wt1); - - const int offset_0 = - bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const int offset = (1 << offset_0) + (1 << (offset_0 - 1)); - const __m128i offset_const = _mm_set1_epi16(offset); - const int rounding_shift = - 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1; - const __m128i rounding_const = _mm_set1_epi16((1 << rounding_shift) >> 1); - - /* Horizontal filter */ - { - const int16_t *x_filter = av1_get_interp_filter_subpel_kernel( - filter_params_x, subpel_x_qn & SUBPEL_MASK); - const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0); - - for (i = 0; i < im_h; ++i) { - for (j = 0; j < w; j += 8) { - __m128i temp_lo, temp_hi; - const __m128i data = - _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); - - const __m128i src_lo = _mm_unpacklo_epi8(data, zero); - const __m128i src_hi = _mm_unpackhi_epi8(data, zero); - - // Filter even-index pixels - const __m128i res_0 = _mm_madd_epi16(src_lo, coeff_01); - temp_lo = _mm_srli_si128(src_lo, 4); - temp_hi = _mm_slli_si128(src_hi, 12); - const __m128i src_2 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - temp_lo = _mm_srli_si128(src_lo, 8); - temp_hi = _mm_slli_si128(src_hi, 8); - const __m128i src_4 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - temp_lo = _mm_srli_si128(src_lo, 12); - temp_hi = _mm_slli_si128(src_hi, 4); - const __m128i src_6 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), - _mm_add_epi32(res_2, res_6)); - res_even = - 
_mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift); - - // Filter odd-index pixels - temp_lo = _mm_srli_si128(src_lo, 2); - temp_hi = _mm_slli_si128(src_hi, 14); - const __m128i src_1 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - temp_lo = _mm_srli_si128(src_lo, 6); - temp_hi = _mm_slli_si128(src_hi, 10); - const __m128i src_3 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - temp_lo = _mm_srli_si128(src_lo, 10); - temp_hi = _mm_slli_si128(src_hi, 6); - const __m128i src_5 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - temp_lo = _mm_srli_si128(src_lo, 14); - temp_hi = _mm_slli_si128(src_hi, 2); - const __m128i src_7 = _mm_or_si128(temp_hi, temp_lo); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), - _mm_add_epi32(res_3, res_7)); - res_odd = - _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift); - - // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 - __m128i res = _mm_packs_epi32(res_even, res_odd); - _mm_store_si128((__m128i *)&im_block[i * im_stride + j], res); - } - } - } - - /* Vertical filter */ - { - const int16_t *y_filter = av1_get_interp_filter_subpel_kernel( - filter_params_y, subpel_y_qn & SUBPEL_MASK); - const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter); - - // coeffs 0 1 0 1 2 3 2 3 - const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); - // coeffs 4 5 4 5 6 7 6 7 - const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); - - // coeffs 0 1 0 1 0 1 0 1 - const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); - // coeffs 2 3 2 3 2 3 2 3 - const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); - // coeffs 4 5 4 5 4 5 4 5 - const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); - // coeffs 6 7 6 7 6 7 6 7 - const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); - - const __m128i round_const = _mm_set1_epi32( - ((1 << conv_params->round_1) >> 1) - - (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1))); - const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_1); - - for (i = 0; i < h; ++i) { - for (j = 0; j < w; j += 8) { - // Filter even-index pixels - const int16_t *data = &im_block[i * im_stride + j]; - const __m128i src_0 = - _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_2 = - _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_4 = - _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const __m128i src_6 = - _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); - const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); - const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); - const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); - - const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), - _mm_add_epi32(res_4, res_6)); - - // Filter odd-index pixels - const __m128i src_1 = - _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride), - *(__m128i *)(data + 1 * im_stride)); - const __m128i src_3 = - _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride), - *(__m128i *)(data + 3 * im_stride)); - const __m128i src_5 = - _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride), - *(__m128i *)(data + 5 * im_stride)); - const 
__m128i src_7 = - _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride), - *(__m128i *)(data + 7 * im_stride)); - - const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); - const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); - const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); - const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); - - const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), - _mm_add_epi32(res_5, res_7)); - - // Rearrange pixels back into the order 0 ... 7 - const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); - const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); - - const __m128i res_lo_round = - _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift); - const __m128i res_hi_round = - _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift); - - const __m128i res_16b = _mm_packs_epi32(res_lo_round, res_hi_round); - const __m128i res_unsigned = _mm_add_epi16(res_16b, offset_const); - - // Accumulate values into the destination buffer - if (do_average) { - const __m128i data_ref_0 = - _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])); - - const __m128i comp_avg_res = - comp_avg(&data_ref_0, &res_unsigned, &wt, use_dist_wtd_comp_avg); - - const __m128i round_result = convolve_rounding( - &comp_avg_res, &offset_const, &rounding_const, rounding_shift); - - const __m128i res_8 = _mm_packus_epi16(round_result, round_result); - - if (w > 4) - _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_8); - else - *(int *)(&dst0[i * dst_stride0 + j]) = _mm_cvtsi128_si32(res_8); - } else { - _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_unsigned); - } - } - } - } -} diff -Nru aom-3.8.2/av1/common/x86/jnt_convolve_ssse3.c aom-3.9.0/av1/common/x86/jnt_convolve_ssse3.c --- aom-3.8.2/av1/common/x86/jnt_convolve_ssse3.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/jnt_convolve_ssse3.c 2024-05-07 19:57:02.932000000 +0000 @@ -11,7 +11,7 @@ #include -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/aom_filter.h" #include "aom_dsp/x86/convolve_sse2.h" diff -Nru aom-3.8.2/av1/common/x86/reconinter_sse4.c aom-3.9.0/av1/common/x86/reconinter_sse4.c --- aom-3.8.2/av1/common/x86/reconinter_sse4.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/common/x86/reconinter_sse4.c 2024-05-07 19:57:02.933000000 +0000 @@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/blend.h" #include "av1/common/blockd.h" +#include "config/av1_rtcd.h" static INLINE __m128i calc_mask(const __m128i mask_base, const __m128i s0, const __m128i s1) { diff -Nru aom-3.8.2/av1/decoder/decodeframe.c aom-3.9.0/av1/decoder/decodeframe.c --- aom-3.8.2/av1/decoder/decodeframe.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/decoder/decodeframe.c 2024-05-07 19:57:02.945000000 +0000 @@ -14,20 +14,23 @@ #include #include "config/aom_config.h" -#include "config/aom_dsp_rtcd.h" #include "config/aom_scale_rtcd.h" -#include "config/av1_rtcd.h" #include "aom/aom_codec.h" +#include "aom/aom_image.h" +#include "aom/internal/aom_codec_internal.h" #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/binary_codes_reader.h" #include "aom_dsp/bitreader.h" #include "aom_dsp/bitreader_buffer.h" +#include "aom_dsp/txfm_common.h" #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_ports/mem_ops.h" #include "aom_scale/aom_scale.h" +#include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #if 
CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG @@ -35,33 +38,41 @@ #endif // CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG #include "av1/common/alloccommon.h" +#include "av1/common/av1_common_int.h" +#include "av1/common/blockd.h" #include "av1/common/cdef.h" #include "av1/common/cfl.h" -#if CONFIG_INSPECTION -#include "av1/decoder/inspection.h" -#endif +#include "av1/common/common_data.h" #include "av1/common/common.h" #include "av1/common/entropy.h" #include "av1/common/entropymode.h" #include "av1/common/entropymv.h" +#include "av1/common/enums.h" #include "av1/common/frame_buffers.h" #include "av1/common/idct.h" +#include "av1/common/mv.h" #include "av1/common/mvref_common.h" +#include "av1/common/obmc.h" #include "av1/common/pred_common.h" #include "av1/common/quant_common.h" #include "av1/common/reconinter.h" #include "av1/common/reconintra.h" #include "av1/common/resize.h" +#include "av1/common/restoration.h" +#include "av1/common/scale.h" #include "av1/common/seg_common.h" #include "av1/common/thread_common.h" #include "av1/common/tile_common.h" #include "av1/common/warped_motion.h" -#include "av1/common/obmc.h" + #include "av1/decoder/decodeframe.h" #include "av1/decoder/decodemv.h" #include "av1/decoder/decoder.h" #include "av1/decoder/decodetxb.h" #include "av1/decoder/detokenize.h" +#if CONFIG_INSPECTION +#include "av1/decoder/inspection.h" +#endif #define ACCT_STR __func__ @@ -1935,8 +1946,8 @@ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_DEC_BORDER_IN_PIXELS, cm->features.byte_alignment, - &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0, - 0)) { + &cm->cur_frame->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, + false, 0)) { unlock_buffer_pool(pool); aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -2293,7 +2304,11 @@ const int tile_col_size_bytes = pbi->tile_col_size_bytes; const int tile_size_bytes = pbi->tile_size_bytes; int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + aom_internal_error( + &pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Not all the tiles in the tile list have the same size."); + } const int tile_copy_mode = ((AOMMAX(tile_width, tile_height) << MI_SIZE_LOG2) <= 256) ? 
1 : 0; // Read tile column sizes for all columns (we need the last tile buffer) @@ -2302,8 +2317,16 @@ size_t tile_col_size; if (!is_last) { + if (tile_col_size_bytes > data_end - data) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "Not enough data to read tile_col_size"); + } tile_col_size = mem_get_varsize(data, tile_col_size_bytes); data += tile_col_size_bytes; + if (tile_col_size > (size_t)(data_end - data)) { + aom_internal_error(&pbi->error, AOM_CODEC_CORRUPT_FRAME, + "tile_col_data_end[%d] is out of bound", c); + } tile_col_data_end[c] = data + tile_col_size; } else { tile_col_size = data_end - data; @@ -2440,6 +2463,7 @@ const int n_tiles) { AV1_COMMON *const cm = &pbi->common; aom_free(pbi->tile_data); + pbi->allocated_tiles = 0; CHECK_MEM_ERROR(cm, pbi->tile_data, aom_memalign(32, n_tiles * sizeof(*pbi->tile_data))); pbi->allocated_tiles = n_tiles; @@ -3180,18 +3204,16 @@ pthread_mutex_lock(pbi->row_mt_mutex_); #endif frame_row_mt_info->row_mt_exit = 1; - +#if CONFIG_MULTITHREAD + pthread_cond_broadcast(pbi->row_mt_cond_); + pthread_mutex_unlock(pbi->row_mt_mutex_); +#endif // If any SB row (erroneous row) processed by a thread encounters an // internal error, there is a need to indicate other threads that decoding // of the erroneous row is complete. This ensures that other threads which // wait upon the completion of SB's present in erroneous row are not waiting // indefinitely. signal_decoding_done_for_erroneous_row(pbi, &thread_data->td->dcb.xd); - -#if CONFIG_MULTITHREAD - pthread_cond_broadcast(pbi->row_mt_cond_); - pthread_mutex_unlock(pbi->row_mt_mutex_); -#endif return 0; } thread_data->error_info.setjmp = 1; @@ -4769,7 +4791,7 @@ seq_params->max_frame_height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, features->byte_alignment, - &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, 0, + &buf->raw_frame_buffer, pool->get_fb_cb, pool->cb_priv, false, 0)) { decrease_ref_count(buf, pool); unlock_buffer_pool(pool); diff -Nru aom-3.8.2/av1/decoder/decodemv.h aom-3.9.0/av1/decoder/decodemv.h --- aom-3.8.2/av1/decoder/decodemv.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/decoder/decodemv.h 2024-05-07 19:57:02.958000000 +0000 @@ -20,6 +20,8 @@ extern "C" { #endif +int av1_neg_deinterleave(int diff, int ref, int max); + void av1_read_mode_info(AV1Decoder *const pbi, DecoderCodingBlock *dcb, aom_reader *r, int x_mis, int y_mis); diff -Nru aom-3.8.2/av1/decoder/decoder.c aom-3.9.0/av1/decoder/decoder.c --- aom-3.8.2/av1/decoder/decoder.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/decoder/decoder.c 2024-05-07 19:57:02.958000000 +0000 @@ -21,6 +21,7 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" #include "aom_scale/aom_scale.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #include "av1/common/alloccommon.h" @@ -79,9 +80,10 @@ static void dec_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; aom_free(mi_params->mi_grid_base); mi_params->mi_grid_base = NULL; - mi_params->mi_alloc_size = 0; + mi_params->mi_grid_size = 0; aom_free(mi_params->tx_type_map); mi_params->tx_type_map = NULL; } diff -Nru aom-3.8.2/av1/decoder/dthread.h aom-3.9.0/av1/decoder/dthread.h --- aom-3.8.2/av1/decoder/dthread.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/decoder/dthread.h 2024-05-07 19:57:02.964000000 +0000 @@ -14,7 +14,6 @@ #include "config/aom_config.h" -#include 
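Aside: the two checks added around mem_get_varsize() above follow the usual pattern for parsing length-prefixed records from an untrusted buffer: first verify that the size field itself fits in the remaining data, then verify that the payload it describes does, before advancing. A small standalone sketch of that pattern; read_varsize is a local stand-in, not libaom's mem_get_varsize():

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Little-endian read of an n-byte size field, 1 <= nbytes <= 4.
 * Stand-in for mem_get_varsize(); not the libaom implementation. */
static size_t read_varsize(const uint8_t *p, int nbytes) {
  size_t v = 0;
  for (int i = 0; i < nbytes; i++) v |= (size_t)p[i] << (8 * i);
  return v;
}

/* Returns true and advances *data past the record on success; false if either
 * the size field or the payload would run past data_end. */
static bool skip_length_prefixed_record(const uint8_t **data,
                                        const uint8_t *data_end, int nbytes) {
  if (nbytes > data_end - *data) return false;  /* size field out of bounds */
  const size_t payload = read_varsize(*data, nbytes);
  *data += nbytes;
  if (payload > (size_t)(data_end - *data)) return false;  /* payload bound */
  *data += payload;
  return true;
}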
"aom_util/aom_thread.h" #include "aom/internal/aom_codec_internal.h" #ifdef __cplusplus diff -Nru aom-3.8.2/av1/decoder/obu.c aom-3.9.0/av1/decoder/obu.c --- aom-3.8.2/av1/decoder/obu.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/decoder/obu.c 2024-05-07 19:57:02.968000000 +0000 @@ -76,7 +76,7 @@ return 0; } -static uint32_t read_temporal_delimiter_obu() { return 0; } +static uint32_t read_temporal_delimiter_obu(void) { return 0; } // Returns a boolean that indicates success. static int read_bitstream_level(AV1_LEVEL *seq_level_idx, @@ -367,16 +367,13 @@ return header_size + tg_payload_size; } -static void alloc_tile_list_buffer(AV1Decoder *pbi) { +static void alloc_tile_list_buffer(AV1Decoder *pbi, int tile_width_in_pixels, + int tile_height_in_pixels) { // The resolution of the output frame is read out from the bitstream. The data // are stored in the order of Y plane, U plane and V plane. As an example, for // image format 4:2:0, the output frame of U plane and V plane is 1/4 of the // output frame. AV1_COMMON *const cm = &pbi->common; - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_width_in_pixels = tile_width * MI_SIZE; - const int tile_height_in_pixels = tile_height * MI_SIZE; const int output_frame_width = (pbi->output_frame_width_in_tiles_minus_1 + 1) * tile_width_in_pixels; const int output_frame_height = @@ -396,7 +393,7 @@ cm->seq_params->subsampling_y, (cm->seq_params->use_highbitdepth && (cm->seq_params->bit_depth > AOM_BITS_8)), - 0, cm->features.byte_alignment, 0, 0)) + 0, cm->features.byte_alignment, false, 0)) aom_internal_error(&pbi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate the tile list output buffer"); } @@ -424,13 +421,10 @@ return; } -static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, - int tile_idx) { +static void copy_decoded_tile_to_tile_list_buffer(AV1Decoder *pbi, int tile_idx, + int tile_width_in_pixels, + int tile_height_in_pixels) { AV1_COMMON *const cm = &pbi->common; - int tile_width, tile_height; - av1_get_uniform_tile_size(cm, &tile_width, &tile_height); - const int tile_width_in_pixels = tile_width * MI_SIZE; - const int tile_height_in_pixels = tile_height * MI_SIZE; const int ssy = cm->seq_params->subsampling_y; const int ssx = cm->seq_params->subsampling_x; const int num_planes = av1_num_planes(cm); @@ -501,13 +495,31 @@ pbi->output_frame_width_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->output_frame_height_in_tiles_minus_1 = aom_rb_read_literal(rb, 8); pbi->tile_count_minus_1 = aom_rb_read_literal(rb, 16); + + // The output frame is used to store the decoded tile list. The decoded tile + // list has to fit into 1 output frame. + if ((pbi->tile_count_minus_1 + 1) > + (pbi->output_frame_width_in_tiles_minus_1 + 1) * + (pbi->output_frame_height_in_tiles_minus_1 + 1)) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + if (pbi->tile_count_minus_1 > MAX_TILES - 1) { pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; return 0; } + int tile_width, tile_height; + if (!av1_get_uniform_tile_size(cm, &tile_width, &tile_height)) { + pbi->error.error_code = AOM_CODEC_CORRUPT_FRAME; + return 0; + } + const int tile_width_in_pixels = tile_width * MI_SIZE; + const int tile_height_in_pixels = tile_height * MI_SIZE; + // Allocate output frame buffer for the tile list. 
- alloc_tile_list_buffer(pbi); + alloc_tile_list_buffer(pbi, tile_width_in_pixels, tile_height_in_pixels); uint32_t tile_list_info_bytes = 4; tile_list_payload_size += tile_list_info_bytes; @@ -558,7 +570,8 @@ assert(data <= data_end); // Copy the decoded tile to the tile list output buffer. - copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx); + copy_decoded_tile_to_tile_list_buffer(pbi, tile_idx, tile_width_in_pixels, + tile_height_in_pixels); tile_idx++; } diff -Nru aom-3.8.2/av1/encoder/allintra_vis.c aom-3.9.0/av1/encoder/allintra_vis.c --- aom-3.8.2/av1/encoder/allintra_vis.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/allintra_vis.c 2024-05-07 19:57:02.970000000 +0000 @@ -13,6 +13,8 @@ #include "config/aom_config.h" +#include "aom_util/aom_pthread.h" + #if CONFIG_TFLITE #include "tensorflow/lite/c/c_api.h" #include "av1/encoder/deltaq4_model.c" @@ -270,13 +272,14 @@ const int coeff_count = block_size * block_size; const int mb_step = mi_size_wide[bsize]; const BitDepthInfo bd_info = get_bit_depth_info(xd); - const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; + const MultiThreadInfo *const mt_info = &cpi->mt_info; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &mt_info->intra_mt; AV1EncRowMultiThreadSync *const intra_row_mt_sync = &cpi->ppi->intra_row_mt_sync; const int mi_cols = cm->mi_params.mi_cols; const int mt_thread_id = mi_row / mb_step; // TODO(chengchen): test different unit step size - const int mt_unit_step = mi_size_wide[BLOCK_64X64]; + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; const int mt_unit_cols = (mi_cols + (mt_unit_step >> 1)) / mt_unit_step; int mt_unit_col = 0; const int is_high_bitdepth = is_cur_buf_hbd(xd); @@ -293,6 +296,18 @@ if (mi_col % mt_unit_step == 0) { intra_mt->intra_sync_read_ptr(intra_row_mt_sync, mt_thread_id, mt_unit_col); +#if CONFIG_MULTITHREAD + const int num_workers = + AOMMIN(mt_info->num_mod_workers[MOD_AI], mt_info->num_workers); + if (num_workers > 1) { + const AV1EncRowMultiThreadInfo *const enc_row_mt = &mt_info->enc_row_mt; + pthread_mutex_lock(enc_row_mt->mutex_); + const bool exit = enc_row_mt->mb_wiener_mt_exit; + pthread_mutex_unlock(enc_row_mt->mutex_); + // Stop further processing in case any worker has encountered an error. 
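Aside: the allintra_vis.c change above (its `if (exit) break;` follows in the next hunk line) reads a shared mb_wiener_mt_exit flag under the row-multithreading mutex so that every worker stops cleanly once any of them records an error. A minimal standalone sketch of that flag pattern with plain pthreads; worker_ctx and its fields are hypothetical names, not libaom's:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Hypothetical shared state; libaom keeps the equivalent flag in its
 * row-multithreading info and guards it with enc_row_mt->mutex_. */
typedef struct {
  pthread_mutex_t mutex;
  bool exit_flag;  /* set by the first worker that hits an error */
} worker_ctx;

static void *worker(void *arg) {
  worker_ctx *ctx = (worker_ctx *)arg;
  for (int unit = 0; unit < 1000; ++unit) {
    pthread_mutex_lock(&ctx->mutex);
    const bool stop = ctx->exit_flag;
    pthread_mutex_unlock(&ctx->mutex);
    if (stop) break;  /* another worker failed; stop processing new units */
    /* ... process one unit; on error:
     * pthread_mutex_lock(&ctx->mutex); ctx->exit_flag = true;
     * pthread_mutex_unlock(&ctx->mutex); break; */
  }
  return NULL;
}

int main(void) {
  worker_ctx ctx;
  ctx.exit_flag = false;
  pthread_mutex_init(&ctx.mutex, NULL);
  pthread_t t;
  pthread_create(&t, NULL, worker, &ctx);
  pthread_join(t, NULL);
  pthread_mutex_destroy(&ctx.mutex);
  puts("done");
  return 0;
}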
+ if (exit) break; + } +#endif } PREDICTION_MODE best_mode = DC_PRED; @@ -575,7 +590,7 @@ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, - NULL, cpi->image_pyramid_levels, 0)) + NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); av1_alloc_mb_wiener_var_pred_buf(&cpi->common, &cpi->td); diff -Nru aom-3.8.2/av1/encoder/allintra_vis.h aom-3.9.0/av1/encoder/allintra_vis.h --- aom-3.8.2/av1/encoder/allintra_vis.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/allintra_vis.h 2024-05-07 19:57:02.972000000 +0000 @@ -20,6 +20,8 @@ #include "av1/encoder/block.h" #include "av1/encoder/encoder.h" +#define MB_WIENER_MT_UNIT_SIZE BLOCK_64X64 + void av1_init_mb_wiener_var_buffer(AV1_COMP *cpi); void av1_calc_mb_wiener_var_row(AV1_COMP *const cpi, MACROBLOCK *x, diff -Nru aom-3.8.2/av1/encoder/aq_cyclicrefresh.c aom-3.9.0/av1/encoder/aq_cyclicrefresh.c --- aom-3.8.2/av1/encoder/aq_cyclicrefresh.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/aq_cyclicrefresh.c 2024-05-07 19:57:02.973000000 +0000 @@ -15,6 +15,7 @@ #include "av1/common/pred_common.h" #include "av1/common/seg_common.h" #include "av1/encoder/aq_cyclicrefresh.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/ratectrl.h" #include "av1/encoder/segmentation.h" #include "av1/encoder/tokenize.h" @@ -295,6 +296,7 @@ const CommonModeInfoParams *const mi_params = &cm->mi_params; CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; unsigned char *const seg_map = cpi->enc_seg.map; + unsigned char *const active_map_4x4 = cpi->active_map.map; int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame; int xmis, ymis, x, y; uint64_t sb_sad = 0; @@ -302,7 +304,12 @@ uint64_t thresh_sad = INT64_MAX; const int mi_rows = mi_params->mi_rows, mi_cols = mi_params->mi_cols; const int mi_stride = mi_cols; - memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + // Don't set seg_map to 0 if active_maps is enabled. Active_maps will set + // seg_map to either 7 or 0 (AM_SEGMENT_ID_INACTIVE/ACTIVE), and cyclic + // refresh set below (segment 1 or 2) will only be set for ACTIVE blocks. + if (!cpi->active_map.enabled) { + memset(seg_map, CR_SEGMENT_ID_BASE, mi_rows * mi_cols); + } sb_cols = (mi_cols + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sb_rows = (mi_rows + cm->seq_params->mib_size - 1) / cm->seq_params->mib_size; sbs_in_frame = sb_cols * sb_rows; @@ -357,7 +364,10 @@ // for possible boost/refresh (segment 1). The segment id may get // reset to 0 later if block gets coded anything other than low motion. // If the block_sad (sb_sad) is very low label it for refresh anyway. - if (cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) { + // If active_maps is enabled, only allow for setting on ACTIVE blocks. + if ((cr->map[bl_index2] == 0 || sb_sad < thresh_sad_low) && + (!cpi->active_map.enabled || + active_map_4x4[bl_index2] == AM_SEGMENT_ID_ACTIVE)) { sum_map += 4; } else if (cr->map[bl_index2] < 0) { cr->map[bl_index2]++; @@ -380,7 +390,8 @@ cr->sb_index = i; if (cr->target_num_seg_blocks == 0) { // Disable segmentation, seg_map is already set to 0 above. - av1_disable_segmentation(&cm->seg); + // Don't disable if active_map is being used. + if (!cpi->active_map.enabled) av1_disable_segmentation(&cm->seg); } } @@ -423,8 +434,6 @@ // function av1_cyclic_reset_segment_skip(). 
Skipping over // 4x4 will therefore have small bdrate loss (~0.2%), so // we use it only for speed > 9 for now. - // Also if loop-filter deltas is applied via segment, then - // we need to set cr->skip_over4x4 = 1. cr->skip_over4x4 = (cpi->oxcf.speed > 9) ? 1 : 0; // should we enable cyclic refresh on this frame. @@ -450,6 +459,15 @@ else cr->percent_refresh = 10 + cr->percent_refresh_adjustment; + if (cpi->active_map.enabled) { + // Scale down the percent_refresh to target the active blocks only. + cr->percent_refresh = + cr->percent_refresh * (100 - cpi->rc.percent_blocks_inactive) / 100; + if (cr->percent_refresh == 0) { + cr->apply_cyclic_refresh = 0; + } + } + cr->max_qdelta_perc = 60; cr->time_for_refresh = 0; cr->use_block_sad_scene_det = @@ -543,10 +561,14 @@ if (resolution_change) av1_cyclic_refresh_reset_resize(cpi); if (!cr->apply_cyclic_refresh) { - // Set segmentation map to 0 and disable. - unsigned char *const seg_map = cpi->enc_seg.map; - memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); - av1_disable_segmentation(&cm->seg); + // Don't disable and set seg_map to 0 if active_maps is enabled, unless + // whole frame is set as inactive (since we only apply cyclic_refresh to + // active blocks). + if (!cpi->active_map.enabled || cpi->rc.percent_blocks_inactive == 100) { + unsigned char *const seg_map = cpi->enc_seg.map; + memset(seg_map, 0, cm->mi_params.mi_rows * cm->mi_params.mi_cols); + av1_disable_segmentation(&cm->seg); + } if (frame_is_intra_only(cm) || scene_change_detected || cpi->ppi->rtc_ref.bias_recovery_frame) { cr->sb_index = 0; @@ -574,9 +596,11 @@ cr->thresh_rate_sb = INT64_MAX; } // Set up segmentation. - // Clear down the segment map. av1_enable_segmentation(&cm->seg); - av1_clearall_segfeatures(seg); + if (!cpi->active_map.enabled) { + // Clear down the segment map, only if active_maps is not enabled. + av1_clearall_segfeatures(seg); + } // Note: setting temporal_update has no effect, as the seg-map coding method // (temporal or spatial) is determined in @@ -644,6 +668,10 @@ int av1_cyclic_refresh_disable_lf_cdef(AV1_COMP *const cpi) { CYCLIC_REFRESH *const cr = cpi->cyclic_refresh; const int qindex = cpi->common.quant_params.base_qindex; + if (cpi->active_map.enabled && + cpi->rc.percent_blocks_inactive > + cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef) + return 1; if (cpi->rc.frames_since_key > 30 && cr->percent_refresh > 0 && cr->counter_encode_maxq_scene_change > 300 / cr->percent_refresh && cpi->rc.frame_source_sad < 1000 && diff -Nru aom-3.8.2/av1/encoder/arm/neon/av1_error_neon.c aom-3.9.0/av1/encoder/arm/neon/av1_error_neon.c --- aom-3.8.2/av1/encoder/arm/neon/av1_error_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/av1_error_neon.c 2024-05-07 19:57:02.978000000 +0000 @@ -1,4 +1,5 @@ /* + * Copyright (c) 2015 The WebM project authors. All Rights Reserved. * Copyright (c) 2019, Alliance for Open Media. All Rights Reserved. 
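Aside: when an active map is in use, the aq_cyclicrefresh.c changes above scale the per-frame refresh percentage down to the active portion of the frame and switch cyclic refresh off entirely when nothing remains. A worked instance of that integer arithmetic with made-up inputs:

#include <stdio.h>

/* Mirrors: percent_refresh = percent_refresh *
 *          (100 - percent_blocks_inactive) / 100,
 * with cyclic refresh disabled when the result reaches 0. */
static int scaled_percent_refresh(int percent_refresh,
                                  int percent_blocks_inactive) {
  return percent_refresh * (100 - percent_blocks_inactive) / 100;
}

int main(void) {
  printf("%d\n", scaled_percent_refresh(10, 40));  /* 6: refresh 6% of frame */
  printf("%d\n", scaled_percent_refresh(10, 95));  /* 0: refresh disabled */
  return 0;
}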
* * Use of this source code is governed by a BSD-style license @@ -15,73 +16,80 @@ #include "aom_dsp/aom_dsp_common.h" #include "aom_dsp/arm/mem_neon.h" +#include "aom_dsp/arm/sum_neon.h" int64_t av1_block_error_neon(const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz) { - int64x2_t error = vdupq_n_s64(0); - int64x2_t sqcoeff = vdupq_n_s64(0); + uint64x2_t err_u64 = vdupq_n_u64(0); + int64x2_t ssz_s64 = vdupq_n_s64(0); - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = load_tran_low_to_s16q(coeff); - const int16x8_t d = load_tran_low_to_s16q(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - - const int16x4_t coeff_lo = vget_low_s16(c); - const int16x4_t coeff_hi = vget_high_s16(c); - const int32x4_t sqcoeff0 = vmull_s16(coeff_lo, coeff_lo); - const int32x4_t sqcoeff1 = vmlal_s16(sqcoeff0, coeff_hi, coeff_hi); - const int64x2_t sqcoeff2 = - vaddl_s32(vget_low_s32(sqcoeff1), vget_high_s32(sqcoeff1)); - sqcoeff = vaddq_s64(sqcoeff, sqcoeff2); - - coeff += 8; - dqcoeff += 8; - block_size -= 8; + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + // We can't do the same here as we're operating on signed integers, so we + // can only accumulate 2 squares. 
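Aside: the rewritten av1_block_error_neon/_lp_neon above (and the SVE versions that follow) all compute the same two quantities; processing 16 coefficients per iteration only changes how the accumulation is staged. A scalar reference of what is being computed, written for the int16_t (_lp) coefficient type as an assumption; the tran_low_t path is analogous, and unlike the SIMD kernels this reference does not require block_size to be a multiple of 16:

#include <stdint.h>

/* Sum of squared differences between dequantized and original coefficients,
 * plus (via *ssz) the sum of squared original coefficients. */
static int64_t block_error_ref(const int16_t *coeff, const int16_t *dqcoeff,
                               intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  for (intptr_t i = 0; i < block_size; i++) {
    const int64_t diff = (int64_t)coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  if (ssz) *ssz = sqcoeff;
  return error;
}

int main(void) {
  const int16_t coeff[4] = { 10, -3, 0, 25 };
  const int16_t dqcoeff[4] = { 8, -3, 1, 24 };
  int64_t ssz;
  const int64_t err = block_error_ref(coeff, dqcoeff, 4, &ssz);
  /* err = 4 + 0 + 1 + 1 = 6; ssz = 100 + 9 + 0 + 625 = 734 */
  return (err == 6 && ssz == 734) ? 0 : 1;
}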
+ int32x4_t ssz0 = vmull_s16(vget_low_s16(c0), vget_low_s16(c0)); + ssz0 = vmlal_s16(ssz0, vget_high_s16(c0), vget_high_s16(c0)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz0); + + int32x4_t ssz1 = vmull_s16(vget_low_s16(c1), vget_low_s16(c1)); + ssz1 = vmlal_s16(ssz1, vget_high_s16(c1), vget_high_s16(c1)); + ssz_s64 = vpadalq_s32(ssz_s64, ssz1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); -#if AOM_ARCH_AARCH64 - *ssz = vaddvq_s64(sqcoeff); - return vaddvq_s64(error); -#else - *ssz = vgetq_lane_s64(sqcoeff, 0) + vgetq_lane_s64(sqcoeff, 1); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); -#endif + *ssz = horizontal_add_s64x2(ssz_s64); + return (int64_t)horizontal_add_u64x2(err_u64); } int64_t av1_block_error_lp_neon(const int16_t *coeff, const int16_t *dqcoeff, int block_size) { - int64x2_t error = vdupq_n_s64(0); + uint64x2_t err_u64 = vdupq_n_u64(0); - assert(block_size >= 8); - assert((block_size % 8) == 0); + assert(block_size >= 16); + assert((block_size % 16) == 0); do { - const int16x8_t c = vld1q_s16(coeff); - const int16x8_t d = vld1q_s16(dqcoeff); - const int16x8_t diff = vsubq_s16(c, d); - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - // diff is 15-bits, the squares 30, so we can store 2 in 31-bits before - // accumulating them in 64-bits. - const int32x4_t err0 = vmull_s16(diff_lo, diff_lo); - const int32x4_t err1 = vmlal_s16(err0, diff_hi, diff_hi); - const int64x2_t err2 = vaddl_s32(vget_low_s32(err1), vget_high_s32(err1)); - error = vaddq_s64(error, err2); - coeff += 8; - dqcoeff += 8; - block_size -= 8; + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const uint16x8_t diff0 = vreinterpretq_u16_s16(vabdq_s16(c0, d0)); + const uint16x8_t diff1 = vreinterpretq_u16_s16(vabdq_s16(c1, d1)); + + // By operating on unsigned integers we can store up to 4 squared diff in a + // 32-bit element before having to widen to 64 bits. + uint32x4_t err = vmull_u16(vget_low_u16(diff0), vget_low_u16(diff0)); + err = vmlal_u16(err, vget_high_u16(diff0), vget_high_u16(diff0)); + err = vmlal_u16(err, vget_low_u16(diff1), vget_low_u16(diff1)); + err = vmlal_u16(err, vget_high_u16(diff1), vget_high_u16(diff1)); + err_u64 = vpadalq_u32(err_u64, err); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; } while (block_size != 0); - return vgetq_lane_s64(error, 0) + vgetq_lane_s64(error, 1); + return (int64_t)horizontal_add_u64x2(err_u64); } diff -Nru aom-3.8.2/av1/encoder/arm/neon/av1_error_sve.c aom-3.9.0/av1/encoder/arm/neon/av1_error_sve.c --- aom-3.8.2/av1/encoder/arm/neon/av1_error_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/av1_error_sve.c 2024-05-07 19:57:02.978000000 +0000 @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include +#include + +#include "config/aom_config.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/mem_neon.h" + +int64_t av1_block_error_sve(const tran_low_t *coeff, const tran_low_t *dqcoeff, + intptr_t block_size, int64_t *ssz) { + int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + int64x2_t sqcoeff[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + assert(block_size >= 16); + assert((block_size % 16) == 0); + + do { + const int16x8_t c0 = load_tran_low_to_s16q(coeff); + const int16x8_t c1 = load_tran_low_to_s16q(coeff + 8); + const int16x8_t d0 = load_tran_low_to_s16q(dqcoeff); + const int16x8_t d1 = load_tran_low_to_s16q(dqcoeff + 8); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + sqcoeff[0] = aom_sdotq_s16(sqcoeff[0], c0, c0); + sqcoeff[1] = aom_sdotq_s16(sqcoeff[1], c1, c1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + *ssz = vaddvq_s64(vaddq_s64(sqcoeff[0], sqcoeff[1])); + return vaddvq_s64(vaddq_s64(error[0], error[1])); +} + +int64_t av1_block_error_lp_sve(const int16_t *coeff, const int16_t *dqcoeff, + int block_size) { + if (block_size % 32 == 0) { + int64x2_t error[4] = { vdupq_n_s64(0), vdupq_n_s64(0), vdupq_n_s64(0), + vdupq_n_s64(0) }; + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t c2 = vld1q_s16(coeff + 16); + const int16x8_t c3 = vld1q_s16(coeff + 24); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + const int16x8_t d2 = vld1q_s16(dqcoeff + 16); + const int16x8_t d3 = vld1q_s16(dqcoeff + 24); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + const int16x8_t diff2 = vsubq_s16(c2, d2); + const int16x8_t diff3 = vsubq_s16(c3, d3); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + error[2] = aom_sdotq_s16(error[2], diff2, diff2); + error[3] = aom_sdotq_s16(error[3], diff3, diff3); + + coeff += 32; + dqcoeff += 32; + block_size -= 32; + } while (block_size != 0); + + error[0] = vaddq_s64(error[0], error[1]); + error[2] = vaddq_s64(error[2], error[3]); + error[0] = vaddq_s64(error[0], error[2]); + return vaddvq_s64(error[0]); + } + assert(block_size == 16); + + int64x2_t error[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + do { + const int16x8_t c0 = vld1q_s16(coeff); + const int16x8_t c1 = vld1q_s16(coeff + 8); + const int16x8_t d0 = vld1q_s16(dqcoeff); + const int16x8_t d1 = vld1q_s16(dqcoeff + 8); + + const int16x8_t diff0 = vsubq_s16(c0, d0); + const int16x8_t diff1 = vsubq_s16(c1, d1); + + error[0] = aom_sdotq_s16(error[0], diff0, diff0); + error[1] = aom_sdotq_s16(error[1], diff1, diff1); + + coeff += 16; + dqcoeff += 16; + block_size -= 16; + } while (block_size != 0); + + return vaddvq_s64(vaddq_s64(error[0], error[1])); +} diff -Nru aom-3.8.2/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c aom-3.9.0/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c --- aom-3.8.2/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c 2024-05-07 19:57:02.979000000 +0000 @@ -1598,44 +1598,6 @@ int32_t *output, int stride, int cos_bit); -static const col_transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = { - fdct4x4_col_neon, // DCT_DCT - 
fadst4x4_col_neon, // ADST_DCT - fdct4x4_col_neon, // DCT_ADST - fadst4x4_col_neon, // ADST_ADST - fadst4x4_col_neon, // FLIPADST_DCT - fdct4x4_col_neon, // DCT_FLIPADST - fadst4x4_col_neon, // FLIPADST_FLIPADST - fadst4x4_col_neon, // ADST_FLIPADST - fadst4x4_col_neon, // FLIPADST_ADST - fidentity4x4_col_neon, // IDTX - fdct4x4_col_neon, // V_DCT - fidentity4x4_col_neon, // H_DCT - fadst4x4_col_neon, // V_ADST - fidentity4x4_col_neon, // H_ADST - fadst4x4_col_neon, // V_FLIPADST - fidentity4x4_col_neon // H_FLIPADST -}; - -static const row_transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = { - fdct4x4_row_neon, // DCT_DCT - fdct4x4_row_neon, // ADST_DCT - fadst4x4_row_neon, // DCT_ADST - fadst4x4_row_neon, // ADST_ADST - fdct4x4_row_neon, // FLIPADST_DCT - fadst4x4_row_neon, // DCT_FLIPADST - fadst4x4_row_neon, // FLIPADST_FLIPADST - fadst4x4_row_neon, // ADST_FLIPADST - fadst4x4_row_neon, // FLIPADST_ADST - fidentity4x4_row_neon, // IDTX - fidentity4x4_row_neon, // V_DCT - fdct4x4_row_neon, // H_DCT - fidentity4x4_row_neon, // V_ADST - fadst4x4_row_neon, // H_ADST - fidentity4x4_row_neon, // V_FLIPADST - fadst4x4_row_neon // H_FLIPADST -}; - static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = { fdct4x8_col_neon, // DCT_DCT fadst4x8_col_neon, // ADST_DCT @@ -1943,21 +1905,96 @@ static void lowbd_fwd_txfm2d_4x4_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; - int16x4_t buf0[4], buf1[4]; - const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type]; - const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type]; int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 4); - col_txfm(input, buf0, stride, 13); - transpose_arrays_s16_4x4(buf0, buf1); - if (lr_flip) { - flip_buf_4_neon(buf1, buf0, 4); - row_txfm(buf0, output, 4, 13); - } else { - row_txfm(buf1, output, 4, 13); + int16x4_t buf0[4], buf1[4]; + switch (tx_type) { + case DCT_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_ADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case ADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case FLIPADST_DCT: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case DCT_FLIPADST: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case ADST_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; + case FLIPADST_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case IDTX: + fidentity4x4_col_neon(input, 
buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case V_DCT: + fdct4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_DCT: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fdct4x4_row_neon(buf1, output, 4, 13); + break; + case V_ADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_ADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fadst4x4_row_neon(buf1, output, 4, 13); + break; + case V_FLIPADST: + fadst4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + fidentity4x4_row_neon(buf1, output, 4, 13); + break; + case H_FLIPADST: + fidentity4x4_col_neon(input, buf0, stride, 13); + transpose_arrays_s16_4x4(buf0, buf1); + flip_buf_4_neon(buf1, buf0, 4); + fadst4x4_row_neon(buf0, output, 4, 13); + break; } } @@ -2040,22 +2077,113 @@ static void lowbd_fwd_txfm2d_8x8_neon(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd) { (void)bd; - int16x8_t buf0[8], buf1[8]; - const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type]; - const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type]; int ud_flip, lr_flip; - get_flip_cfg(tx_type, &ud_flip, &lr_flip); ud_adjust_input_and_stride(ud_flip, &input, &stride, 8); - col_txfm(input, buf0, stride, 13); - shift_right_1_round_s16_x8(buf0, buf0, 8); - transpose_arrays_s16_8x8(buf0, buf1); - if (lr_flip) { - flip_buf_8_neon(buf1, buf0, 8); - row_txfm(buf0, output, 8, 13); - } else { - row_txfm(buf1, output, 8, 13); + int16x8_t buf0[8], buf1[8]; + + switch (tx_type) { + case DCT_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_ADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case ADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case FLIPADST_DCT: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case DCT_FLIPADST: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case FLIPADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; + case ADST_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + 
break; + case FLIPADST_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case IDTX: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case V_DCT: + fdct8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_DCT: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fdct8x8_row_neon(buf1, output, 8, 13); + break; + case V_ADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_ADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fadst8x8_row_neon(buf1, output, 8, 13); + break; + case V_FLIPADST: + fadst8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + fidentity8x8_row_neon(buf1, output, 8, 13); + break; + case H_FLIPADST: + fidentity8x8_col_neon(input, buf0, stride, 13); + shift_right_1_round_s16_x8(buf0, buf0, 8); + transpose_arrays_s16_8x8(buf0, buf1); + flip_buf_8_neon(buf1, buf0, 8); + fadst8x8_row_neon(buf0, output, 8, 13); + break; } } @@ -2376,8 +2504,8 @@ } } -static void fdct32_new_neon(const int32x4_t *input, int32x4_t *output, - int cos_bit) { +static void fdct32_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); @@ -2598,8 +2726,8 @@ output[31] = buf0[31]; } -static void fdct64_new_neon(const int32x4_t *input, int32x4_t *output, - int cos_bit) { +static void fdct64_neon(const int32x4_t *input, int32x4_t *output, + int cos_bit) { const int16_t *cospi = cospi_arr_q13(cos_bit); const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]); @@ -2853,8 +2981,8 @@ bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } - fdct64_new_neon(bufA, bufA, 10); - fdct64_new_neon(bufB, bufB, 10); + fdct64_neon(bufA, bufA, 10); + fdct64_neon(bufB, bufB, 10); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32); @@ -2883,8 +3011,8 @@ bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } - fdct64_new_neon(bufA, bufA, 11); - fdct64_new_neon(bufB, bufB, 11); + fdct64_neon(bufA, bufA, 11); + fdct64_neon(bufB, bufB, 11); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); @@ -2918,8 +3046,8 @@ bufA[j] = vmovl_s16(vget_low_s16(buf[j])); bufB[j] = vmovl_s16(vget_high_s16(buf[j])); } - fdct32_new_neon(bufA, bufA, 11); - fdct32_new_neon(bufB, bufB, 11); + fdct32_neon(bufA, bufA, 11); + fdct32_neon(bufB, bufB, 11); shift_right_2_round_s32_x4(bufA, bufA, 32); shift_right_2_round_s32_x4(bufB, bufB, 32); round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32); diff -Nru aom-3.8.2/av1/encoder/arm/neon/cnn_neon.c aom-3.9.0/av1/encoder/arm/neon/cnn_neon.c --- 
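Aside: the 4x4 and 8x8 forward-transform paths above drop the per-tx_type function-pointer tables in favor of explicit switch statements. A plausible reading (an inference, not stated in the diff) is that the direct calls give the compiler full visibility of the callee for each case and let unreferenced kernels be discarded. A generic sketch contrasting the two dispatch styles, with hypothetical kernel names:

#include <stdint.h>

/* Hypothetical kernels standing in for the fdct/fadst/fidentity transforms;
 * only the dispatch mechanism is being illustrated. */
static void kernel_a(const int16_t *in, int32_t *out) { out[0] = in[0]; }
static void kernel_b(const int16_t *in, int32_t *out) { out[0] = -in[0]; }

typedef void (*txfm_1d_fn)(const int16_t *in, int32_t *out);

/* Old style: indirect call through a table indexed by transform type. */
static const txfm_1d_fn table_dispatch[2] = { kernel_a, kernel_b };

/* New style: a switch naming each kernel directly, so every call site is a
 * direct call and kernels no case mentions can be dropped entirely. */
static void switch_dispatch(int tx_type, const int16_t *in, int32_t *out) {
  switch (tx_type) {
    case 0: kernel_a(in, out); break;
    case 1: kernel_b(in, out); break;
    default: break;
  }
}

int main(void) {
  int16_t in = 7;
  int32_t out = 0;
  table_dispatch[1](&in, &out);   /* indirect call: out = -7 */
  switch_dispatch(0, &in, &out);  /* direct call:   out =  7 */
  return out == 7 ? 0 : 1;
}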
aom-3.8.2/av1/encoder/arm/neon/cnn_neon.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/cnn_neon.c 2024-05-07 19:57:02.984000000 +0000 @@ -0,0 +1,1144 @@ +/* + * Copyright (c) 2023, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include +#include +#include +#include + +#include "config/aom_config.h" +#include "config/av1_rtcd.h" + +#include "aom_dsp/aom_dsp_common.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/av1_common_int.h" +#include "av1/encoder/cnn.h" +#include "av1/encoder/partition_cnn_weights.h" + +// The CNN weights used in av1_cnn_convolve_no_maxpool_padding_valid are +// declared (av1_intra_mode_cnn_partition_cnn_layer_[01234]_kernel) in +// partition_cnn_weights.h. However, to enable linear memory access, rearrange +// the weight tables here. +static const float weights_layer_1[] = { + 0.228403f, 0.031690f, -0.251710f, -0.046230f, 0.413294f, -0.236732f, + -0.038291f, 0.210766f, 0.427196f, -0.384319f, -0.439463f, 0.366015f, + 0.112263f, -0.144168f, -0.075017f, 0.119629f, 0.325200f, -0.678246f, + -0.370826f, -0.341362f, -0.503392f, 0.400884f, 0.465214f, -0.360847f, + 0.187100f, -0.190757f, -0.131906f, 0.121492f, -0.303556f, -0.007658f, + 0.380077f, -0.066394f, -0.016043f, -1.490730f, -0.120682f, 0.132062f, + 0.086185f, -0.042766f, -0.087069f, 0.029426f, 0.309583f, -0.029985f, + -0.297429f, -0.018139f, -0.688828f, 0.756607f, 0.706410f, -0.696826f, + -0.087793f, -0.023304f, -0.012332f, -0.018043f, -0.410268f, 0.352143f, + 0.391284f, -0.363178f, -0.295034f, 0.160246f, -0.149446f, 0.260145f, + -0.252249f, 0.190826f, 0.251206f, -0.270796f, -0.979219f, 0.884880f, + 0.962057f, -0.847601f, -0.011053f, 0.118765f, -0.028428f, -0.020138f, + 0.400274f, -0.382845f, -0.462766f, 0.390654f, 0.361223f, -0.320068f, + -0.372084f, 0.313196f, 0.241933f, -0.416614f, -0.008722f, -0.255078f, + 0.078730f, -0.381935f, -0.204577f, 0.159768f, 0.071853f, -0.126294f, + -0.036186f, -0.007900f, 0.380071f, -0.298882f, 0.387941f, -0.267350f, + -0.586802f, 0.477785f, -0.000013f, 0.197296f, -0.079154f, -0.005811f, + -0.044300f, -0.021192f, -0.020879f, -0.005265f, 0.082277f, -0.139132f, + -0.239237f, 0.440234f, -0.542342f, 0.378360f, -0.070974f, 0.272702f, + -0.278939f, -0.044948f, -0.134197f, -0.007172f, -0.353628f, -0.128091f, + 0.357458f, -0.037614f, -0.144983f, 0.220623f, -0.003394f, -0.070166f, + 0.200370f, -0.166037f, 0.224448f, -0.012990f, -0.098853f, 0.008613f, + -0.017669f, 0.070641f, 0.174530f, -0.119822f, -0.065096f, 0.118487f, + -0.024764f, -0.050466f, 0.066631f, -0.075896f, -0.062363f, 0.212604f, + -0.377322f, 0.306306f, -0.399733f, 0.238624f, 0.233571f, -0.344080f, + 0.462491f, -0.565210f, -0.035074f, -0.010459f, 0.084382f, 0.052294f, + 0.065714f, 0.013716f, 0.135036f, 0.000588f, 0.181079f, -0.566344f, + 0.395561f, -0.398509f, 0.450017f, -1.462710f, 1.138280f, -0.447774f, + 0.247936f, -0.417067f, 0.165997f, -0.458632f, -0.018527f, 0.308461f, + 0.541266f, 0.162257f, 0.601786f, -1.275840f, -0.373404f, -0.589747f, + 0.026539f, -0.219327f, 0.142972f, -0.018496f, 0.075204f, -0.775190f, + 0.237307f, -0.348252f, 
0.117792f, -0.094332f, 0.363101f, -0.065025f, + 0.816662f, 0.590110f, 0.752202f, -0.308599f, 0.258337f, -0.842085f, + 0.695788f, -0.205615f, 0.093930f, -0.392536f, 0.463093f, -0.432456f, + 0.041660f, -0.827264f, 0.309128f, -0.354658f, 0.451957f, -1.406640f, + 0.773192f, -0.892943f, 0.134856f, -0.467808f, 0.306003f, -0.226560f, + 0.086865f, -0.104102f, 0.148098f, -0.082658f, 0.316655f, -1.028310f, + 0.741566f, -0.345326f, 0.052379f, -0.275613f, 0.191765f, -0.162391f, + 0.000976f, 0.093061f, 0.068649f, 0.033582f, 0.239727f, -0.647769f, + 0.218493f, -0.397120f, 0.268229f, -0.303424f, 0.185393f, -0.314189f, + 0.101728f, -0.163083f, -0.084989f, 0.136783f, -0.264346f, 0.465914f, + 0.220395f, -0.252968f, -0.326661f, 0.271483f, 0.374717f, -0.311570f, + -0.082119f, 0.020870f, 0.091975f, -0.030582f, -0.487148f, 0.198912f, + 0.024554f, -0.749363f, -0.102267f, 0.097787f, 0.141459f, -0.110706f, + 0.079467f, -0.082570f, -0.347567f, 0.341043f, -0.137871f, 0.112319f, + 0.064733f, -0.082869f, 0.269999f, -0.408184f, -0.183443f, 0.180608f, + 0.223345f, -0.357376f, -0.244593f, 0.355348f, -0.072701f, -0.034311f, + 0.096544f, 0.016407f, 0.417550f, -0.367772f, -0.484535f, 0.405977f, + 0.314243f, -0.099622f, -0.192218f, -0.012780f, 0.434551f, -0.399047f, + -0.531499f, 0.484513f, -0.691352f, 0.872823f, 1.207720f, -1.377490f, + 0.006872f, -0.041453f, 0.007845f, 0.007463f, 0.467299f, -0.476372f, + -0.452606f, 0.452357f, 0.447332f, -0.365632f, -0.332435f, 0.300284f, + -0.290504f, 0.255410f, 0.310921f, -0.293717f, -0.616299f, 0.594207f, + 0.461347f, -0.449439f, 0.278455f, 0.285085f, -1.201340f, -0.016463f, + 0.549095f, 0.610375f, -4.608530f, -1.727390f, 0.150404f, -0.012846f, + -0.481148f, -0.182257f, 0.918796f, 0.213872f, 1.050410f, 0.681526f, + -0.458777f, -0.710395f, -2.347200f, -0.277197f, 0.213294f, 0.337551f, + -0.177710f, -0.152136f, 0.167666f, 0.308403f, -1.248500f, -0.565367f, + 0.122054f, 0.087874f, -0.476556f, -0.083548f, -0.358734f, -0.073131f, + -0.146320f, -2.241960f, 0.697639f, 0.545581f, -1.889700f, -0.267725f, + 0.433045f, 0.298224f, -0.338508f, 0.250226f, 0.405675f, 0.447201f, + -1.184690f, -0.473447f, 0.307403f, 0.711236f, -3.191560f, -1.663980f, + 0.165201f, 0.101360f, -0.624451f, -0.173269f, 0.089795f, 0.227478f, + -0.136664f, 0.007907f, 0.131079f, 0.605374f, -2.991620f, -1.723790f, + 0.082428f, 0.006781f, -0.348732f, -0.019271f, -0.032040f, -0.067078f, + -0.437166f, -0.144472f, 0.069844f, 0.194625f, -0.162284f, -0.374656f, + 0.056472f, -0.236524f, -0.114241f, -0.029161f, -0.222078f, -0.053435f, + -0.313938f, -0.555472f, 1.037550f, 0.689968f, 0.575694f, 0.065826f, + -0.659979f, -0.881351f, -0.626417f, -0.953975f, -0.576106f, -0.258708f, + 0.263004f, -0.229847f, 0.463835f, 1.390960f, -2.614480f, -1.272910f, + 0.065780f, -0.058603f, 0.015612f, 0.104703f, 0.198028f, 0.262792f, + 0.253616f, -0.079126f, -0.587381f, -0.739021f, -0.822676f, -0.795512f, + 0.193644f, 0.234643f, -0.034407f, 0.421478f, -0.572610f, -0.290714f, + -0.257803f, -0.644835f, -0.536938f, -0.375899f, -0.651077f, -0.522576f, + 0.562564f, 0.834616f, 0.513893f, 0.649689f, 0.356530f, 0.400716f, + 0.300606f, 0.290505f, 0.584608f, 0.671574f, 0.564584f, 0.419870f, + 0.062061f, 0.018263f, 0.009831f, 0.084103f, -0.128281f, -0.018818f, + -0.187244f, 0.067210f, 0.437147f, 0.442029f, 0.444939f, 0.226661f, + 0.541609f, 0.444280f, 0.302795f, 0.633026f, -0.180374f, 0.265197f, + 0.210404f, -0.118916f, -0.294013f, -0.692627f, -0.402347f, -0.356287f, + 0.387578f, 0.385496f, 0.789542f, 0.690396f, -0.203542f, -0.688546f, + 0.045319f, -0.448747f, 
-0.157148f, 0.152581f, 0.022360f, 0.058358f, + 0.593007f, 1.131860f, 0.289006f, 1.015560f, 0.144942f, -0.411577f, + 0.264794f, -0.085791f, 0.156996f, 0.200340f, 0.169264f, 0.267615f, + -0.361015f, -0.601842f, -0.442217f, -0.781086f, 0.112938f, 0.385305f, + 0.482454f, 0.470268f, 1.193390f, 0.589642f, 0.127638f, -0.640946f, + 0.540310f, 0.741498f, 0.686937f, 0.435879f, 0.534523f, 0.693119f, + 0.817577f, 0.783109f, 0.021681f, -0.004973f, 0.201236f, -0.086311f, + 0.028628f, 0.227871f, 0.462751f, 0.126832f, -0.389997f, -0.553965f, + -0.343953f, -0.448517f, 0.053129f, -0.115083f, 0.018138f, -0.067131f, + -0.293468f, -0.220700f, 0.074348f, -0.273153f, 0.263637f, 0.122049f, + 0.153025f, 0.076292f, 0.142320f, 0.286734f, 0.100542f, 0.308660f, + -0.759591f, -0.750938f, -0.788799f, -0.853076f, -0.588019f, -0.990063f, + -0.692327f, -0.722904f, 0.084736f, 0.151068f, 0.159606f, 0.147715f, + 1.610180f, 1.950330f, 1.765670f, 2.265110f, 0.008262f, 0.185584f, + 0.039337f, 0.164721f, 0.479446f, 0.314083f, 0.043969f, 0.291320f, + 0.003400f, -0.551190f, 0.060158f, -0.147591f, 0.089117f, 0.042994f, + 0.042802f, 0.127392f, -0.066172f, 0.078370f, 0.051408f, 0.014004f, + 0.086726f, 0.133334f, -0.046733f, 0.155100f, -0.118223f, -0.100778f, + -0.225245f, -0.460397f, 0.892644f, 1.003770f, 0.405155f, 0.517477f, + 0.184585f, 0.279090f, -0.036477f, 0.198703f, 0.027139f, -0.055728f, + -0.022396f, -0.147319f, 2.275540f, 2.014990f, 2.296800f, 2.081730f, + -0.088713f, 0.105729f, -0.027871f, -0.095047f, 0.012429f, 0.014244f, + -0.014755f, -0.003017f, 1.332700f, 1.300040f, 1.464250f, 1.305030f, + 0.032568f, 0.118042f, 0.079632f, -0.089405f, 0.163905f, 0.146608f, + 0.026502f, 0.065307f, -0.056909f, -0.065052f, 0.069851f, -0.082958f, + 0.023419f, -0.026293f, 0.037616f, -0.048096f, -0.073701f, -0.208295f, + -0.782095f, 0.000523f, 0.374131f, 0.420946f, 0.466151f, 0.349651f, + -0.679275f, -0.745827f, -0.379918f, -0.900107f, 0.044070f, -0.347536f, + -1.224390f, 0.740113f, -0.779966f, 0.510920f, -0.968597f, -0.095630f, + 0.120805f, 0.676803f, -0.164827f, 0.172996f, -0.106720f, 0.197527f, + 0.337561f, 0.571094f, -0.279090f, -0.396697f, -0.253083f, -0.690170f, + -0.363291f, 0.516921f, 0.489391f, -0.920628f, 0.497572f, 0.483864f, + -0.125696f, -0.338123f, -0.041517f, -0.534630f, -0.388465f, -0.784554f, + 0.215227f, 0.055088f, 0.179638f, 0.086997f, 0.569313f, 0.572926f, + 0.137182f, -0.045485f, 0.118087f, 0.210383f, 0.212664f, 0.482443f, + 0.151921f, 0.307947f, -0.084656f, -0.386206f, 0.542277f, -0.207005f, + 0.073792f, -1.013240f, 0.303581f, 0.270527f, 0.265985f, 0.332702f, + 0.848609f, 0.686757f, 0.767212f, 0.316901f, -0.502460f, -0.567092f, + -0.484799f, -0.173350f, -0.426863f, 0.222375f, -0.200267f, -0.523758f, + 0.265180f, -0.175648f, -0.229754f, 0.148740f, 0.402515f, 0.028243f, + -0.366109f, 0.157232f, -0.131564f, 0.055136f, 0.211046f, -0.115542f, + 0.322379f, -0.137768f, -0.247832f, 0.070394f, 0.058530f, -0.295023f, + -0.196022f, -0.109097f, 0.261285f, -0.273585f, -0.240632f, 0.258326f, + -0.077364f, 0.071405f, -0.014766f, -0.008751f, -0.203622f, 0.177818f, + 0.116726f, -0.116735f, -0.723616f, -0.700154f, 0.145082f, -0.184949f, + -0.287076f, 0.150405f, 0.258075f, -0.157764f, -0.120909f, 0.105459f, + 0.113288f, -0.092963f, 0.328183f, -0.300115f, -0.361289f, 0.319792f, + -0.048875f, 0.135673f, 0.132539f, -0.162481f, 0.002109f, 0.065048f, + -0.135969f, 0.061558f, 1.510670f, -0.884925f, -0.827022f, 0.190311f, + -0.060088f, -0.033362f, 0.013354f, 0.002847f, 0.353479f, -0.462538f, + -0.319638f, 0.424484f, 0.199540f, -0.073843f, 
-0.140621f, 0.072133f, + -0.098662f, 0.070613f, 0.031150f, -0.021869f, -0.511253f, 0.503412f, + 0.565963f, -0.576146f, -1.081700f, 0.047670f, 0.266687f, 0.524804f, + -2.361150f, 0.147823f, 0.594717f, 0.956842f, -1.048220f, 0.127083f, + 0.079581f, 0.065419f, 0.176783f, 0.653953f, 0.260967f, 0.537892f, + -1.207580f, 0.245983f, -0.727067f, 0.071755f, -0.343025f, -0.173435f, + 0.215289f, 0.268578f, -1.158560f, 0.039263f, -0.132888f, 0.217132f, + -0.622195f, -0.071256f, 0.317333f, 0.157614f, -1.588250f, 0.316432f, + -0.736720f, -0.041698f, -1.959280f, 0.083451f, 0.570584f, 0.327620f, + -1.262200f, -0.026738f, 0.231198f, 0.326861f, -1.644200f, -0.143833f, + -0.079495f, 0.493026f, -2.488090f, -0.034046f, 0.165884f, 1.074260f, + -1.076980f, 0.248198f, -0.017987f, 0.421900f, -0.105860f, 0.076710f, + 0.002072f, 0.070264f, -1.734750f, 0.227145f, 0.209220f, 0.851459f, + -0.142369f, 0.066502f, 0.027816f, 0.044321f, -0.186591f, -0.100340f, + 0.115580f, 0.192252f, -0.892114f, 0.209531f, -0.308243f, 0.367968f, + -0.721770f, 0.220224f, -0.062744f, 0.133754f, 0.040416f, 0.190428f, + -0.035428f, 0.162974f, 0.116427f, 0.669393f, 0.278891f, 0.856676f, + 1.060390f, 0.936983f, 0.863355f, 0.990560f, -0.147111f, -0.217883f, + 0.355794f, -0.186530f, -0.275614f, -0.095719f, 0.167346f, 0.359078f, + -0.079223f, -0.581596f, -0.213134f, -0.431123f, -0.516443f, -0.388628f, + -0.643821f, -0.202345f, 0.426230f, 0.516923f, 0.548131f, 0.555973f, + 0.022286f, 0.361170f, 0.980065f, 0.648400f, -0.056813f, -0.100310f, + -0.439481f, -0.166454f, 0.412449f, 0.509400f, 0.316208f, 0.470293f, + -0.827838f, -1.078380f, -1.047040f, -1.074560f, 0.274555f, -0.316736f, + 0.128818f, 0.228566f, -0.520967f, -0.731674f, -0.687887f, -0.536388f, + -0.031187f, 0.041404f, 0.047821f, 0.064397f, 0.054230f, 0.105059f, + -0.178671f, 0.176847f, -0.394797f, -0.260255f, -0.333734f, -0.162345f, + -0.444650f, -0.928438f, -0.705840f, -0.833162f, 0.306737f, 0.429699f, + 0.417298f, 0.478469f, 0.420903f, 0.676871f, 0.429677f, 0.616921f, + -0.805199f, -0.643391f, -0.304100f, 0.797599f, -0.172157f, 0.429085f, + -0.750676f, 0.149227f, -0.207898f, -0.022534f, -0.341448f, -0.247976f, + 0.095325f, -0.561120f, 0.599694f, -0.025236f, 0.292346f, -0.312001f, + 0.517478f, 0.301457f, -0.106415f, 0.226263f, -0.184163f, -0.114419f, + -0.322702f, 0.172541f, 0.445573f, 0.157213f, 0.670704f, 0.102174f, + -0.234667f, -0.293311f, 0.769852f, 0.038028f, -0.036741f, -0.228060f, + -0.253335f, 0.424054f, -0.597980f, 0.221007f, -0.114741f, -0.411557f, + -0.592201f, 0.442684f, 0.115491f, -0.106896f, -0.028110f, 0.354751f, + -0.248375f, 0.242570f, -0.155856f, 0.280528f, -0.198742f, 0.588725f, + 0.371065f, 0.078197f, 0.114706f, -0.448021f, 0.065255f, 0.133741f, + -0.227522f, -0.047339f, -0.052849f, 0.309480f, 0.597185f, 0.209182f, + 0.226108f, -0.601036f, -0.431672f, -0.172601f, -0.000174f, 0.194292f, + -0.133937f, 0.130676f, 0.059372f, 0.091381f, 0.098751f, -0.150996f, + 0.170514f, -0.085494f, 0.336576f, 0.484004f, 0.033862f, 0.277473f, + -0.231482f, -0.328385f, -0.332739f, -0.626957f, 0.510167f, 0.575861f, + 0.421494f, 0.482540f, -0.636377f, -0.864661f, -0.694180f, -0.420014f, + -0.132781f, 0.017599f, 0.003538f, 0.486934f, 0.133878f, -0.094622f, + 0.016132f, 0.010117f, 0.156680f, -0.022201f, -0.014621f, 0.228445f, + 0.190826f, 0.171580f, 0.579923f, 0.245428f, 0.322713f, 0.480101f, + 0.406320f, 0.412229f, 0.002334f, -0.022349f, 0.074571f, -0.043828f, + 0.290453f, 0.451749f, 0.530376f, 0.271879f, 0.095144f, 0.169450f, + 0.049482f, 0.114605f, -0.635634f, -0.700768f, -0.558538f, 
-0.537625f, + 0.190255f, -0.308237f, -0.053703f, 0.212489f, 0.056520f, -0.040019f, + 0.089822f, -0.014155f, -0.376004f, -0.448752f, -0.526717f, -0.571440f, + 0.116482f, 0.162321f, 0.147895f, 0.280527f, 0.159037f, -0.095958f, + 0.007931f, -0.086630f, 0.285625f, 0.514914f, 0.208908f, 0.519251f, + 0.309368f, 0.379777f, 0.350565f, 0.487487f, -0.541494f, -0.421836f, + -0.390001f, -0.500696f, -0.905736f, -0.150439f, -0.942304f, -0.566771f, + 0.484233f, 0.767417f, 0.410477f, 0.670196f, 0.070210f, 0.488836f, + 0.372805f, 0.197631f, 0.337892f, 0.524423f, 0.777219f, -0.260955f, + -0.112981f, -0.060088f, -0.200250f, -0.195671f, 0.007584f, 0.252096f, + 0.235511f, 0.366612f, -0.304979f, -0.211068f, -0.420683f, -0.085370f, + 0.085762f, -0.097549f, -0.802509f, -0.468079f, -0.192787f, -0.069670f, + -0.235162f, -0.077772f, -0.441671f, -0.348479f, -0.431434f, -0.108256f, + -0.133779f, 0.017032f, 0.001964f, -0.120647f, -0.187663f, -0.194985f, + -0.231742f, -0.175288f, -0.162639f, 0.245110f, 0.049951f, 0.104229f, + -0.159634f, -0.076545f, -0.022496f, -0.036532f, -0.147028f, -0.034215f, + 0.028213f, -0.059669f, -0.078259f, 0.062993f, -0.124066f, -0.137362f, + -0.129977f, -0.010532f, -0.049090f, -0.189401f, 0.495471f, 0.615778f, + 0.451437f, 0.803526f, 0.523532f, 0.841339f, 0.699528f, 0.745129f, + 0.246264f, -0.198290f, -0.283620f, 0.189917f, -0.018306f, -0.419097f, + 0.280363f, -0.098085f, 0.138972f, -0.140867f, -0.117025f, 0.098585f, + 0.130979f, 0.268133f, -0.161731f, -0.176629f, -0.357677f, -0.126379f, + 0.553128f, -0.126821f, -0.001511f, -0.010081f, -0.031162f, 0.079203f, + -0.157731f, 0.072865f, 0.535830f, -0.529989f, -0.570075f, 0.295795f, + 0.595613f, -0.449278f, -0.669756f, 0.941452f, 0.356897f, -0.723720f, + -0.115203f, -0.134479f, 0.133048f, 0.109860f, -0.024250f, -0.049732f, + 0.020098f, 0.048356f, -0.048293f, 0.108754f, 0.062548f, -0.238315f, + 0.182700f, 0.312011f, -0.244377f, -0.118012f, 0.012276f, 0.006089f, + 0.098068f, -0.079280f, -0.423987f, -0.411931f, -0.027425f, 0.870280f, + 0.022825f, -0.024481f, -0.036320f, -0.111189f, 0.364539f, -0.244896f, + -0.373060f, 0.266345f, -0.141778f, 0.277549f, 0.059834f, -0.178242f, + -0.686222f, 0.594535f, 0.354546f, -0.272516f, 1.060730f, -1.059810f, + -0.948126f, 0.993267f, 0.116597f, -0.227574f, -0.436144f, -0.333309f, + -0.575746f, -0.828102f, 0.284561f, 0.351668f, -0.080164f, -0.762518f, + -0.511108f, -0.212855f, 0.293892f, -0.548664f, 0.072057f, 0.006748f, + 1.485110f, 0.124687f, 0.727211f, 1.557560f, -0.064383f, -0.022242f, + 0.002921f, -0.151505f, 0.270926f, 0.173632f, -0.640644f, 0.422410f, + -0.240699f, -0.361980f, -0.279864f, -0.055165f, -1.084140f, 0.231705f, + 0.366172f, -0.347698f, -0.097565f, -0.747227f, -0.243033f, 0.941545f, + -0.207460f, -0.353913f, 0.104303f, -0.403151f, 0.203177f, 0.335893f, + -0.229033f, 0.029096f, -0.409634f, -0.179599f, -0.442397f, 0.649114f, + 0.460774f, 0.170906f, -0.043857f, 0.402066f, -0.226896f, -0.199624f, + 0.016650f, 0.207894f, 0.056954f, 0.220329f, 0.374060f, 0.130361f, + -0.303960f, -0.078863f, 0.195410f, 0.729438f, 0.246818f, 0.287730f, + 0.484876f, 0.111488f, -0.168647f, -0.087878f, -0.070089f, -0.341329f, + -0.330280f, 0.259943f, -0.364205f, 0.256555f, -0.756804f, -0.086915f, + 0.777351f, 0.006136f, 0.110348f, 0.248743f, 0.209326f, -0.362741f, + -0.184416f, 0.422446f, 0.565193f, 0.310072f, -0.011212f, -0.765226f, + 0.039466f, 0.301288f, 0.172907f, -1.539450f, 0.606202f, 0.477469f, + 0.045894f, -0.222180f, -0.013192f, -0.064077f, -0.241551f, 0.192914f, + 0.028004f, -0.540538f, 0.437440f, 0.179087f, 
-0.753204f, -0.001374f, + 1.185930f, -0.151182f, 1.238580f, -1.389900f, 0.277954f, 0.422208f, + 0.041553f, -0.542284f, 0.139019f, -0.148580f, -0.130705f, 0.361830f, + 0.322953f, -0.092371f, 0.120180f, -0.355299f, -0.028057f, 0.128114f, + 0.250947f, -0.349926f, -0.684633f, 0.246175f, 0.186731f, -0.676313f, + 0.060535f, 0.333371f, -0.021172f, -0.421266f, -0.079650f, 0.031359f, + -0.303658f, -0.298286f, 0.119016f, 0.655585f, 0.200175f, -0.887182f, + -0.197539f, -0.318883f, -0.130250f, 0.522487f, -0.092616f, 0.405930f, + -0.281678f, 0.089728f, 0.081814f, -0.781745f, 0.348878f, 0.082274f, + -0.914136f, 1.098810f, 0.855321f, -1.078170f, -0.268018f, 0.246440f, + 0.238347f, -0.027228f, 0.074111f, -0.061197f, -0.063582f, 0.089462f, + -0.040347f, 0.117082f, 0.122772f, -0.162816f, -0.148668f, -0.342856f, + -0.495604f, -1.453630f, -0.045273f, -0.030463f, 0.043766f, 0.047978f, + 0.016910f, -0.009700f, 0.006288f, -0.042556f, 0.632896f, -0.845744f, + -0.516844f, 0.709439f, 0.486166f, -1.203050f, -0.978381f, 0.631876f, + 0.000705f, 0.123858f, -0.001187f, -0.172312f, -0.422668f, 0.241838f, + 0.437400f, -0.268186f, -0.513259f, 0.450209f, 0.542629f, -0.453810f, + -0.207119f, 0.072598f, 0.085066f, -0.018986f, -0.149512f, 0.149521f, + 0.182105f, -0.227200f, -0.363240f, 0.172670f, -0.502932f, 0.689256f, + 0.093760f, -0.090207f, -0.066803f, 0.056759f, -0.002243f, -0.050662f, + -0.059324f, 0.152943f, -0.701150f, 0.712540f, 0.660349f, -0.654970f, + 0.351772f, -0.303383f, -0.311177f, 0.247653f, 0.013035f, 0.034648f, + -0.137832f, 0.041197f, 0.410265f, 0.345129f, 0.653338f, 0.047050f, + 0.140399f, 0.018613f, -0.012431f, -0.113632f, -0.029928f, 0.051564f, + -0.031349f, 0.151944f, -0.160340f, 0.326798f, -0.458067f, 0.636235f, + 0.243184f, 0.514072f, 2.414450f, 1.421980f, -0.001474f, -0.141389f, + -0.104817f, -0.141882f, -0.026395f, 0.053014f, 0.143885f, -0.207774f, + -0.563846f, -0.242514f, -0.436574f, -0.456796f, -0.520646f, 0.282550f, + -0.684924f, 0.061105f, -0.315884f, -0.392624f, 0.009805f, -0.256597f, + -0.146732f, 0.331039f, 0.362342f, 0.270851f, 0.067679f, -0.071331f, + -0.222423f, 0.081286f, -0.208192f, -0.193816f, -0.008201f, -0.309340f, + 0.167556f, 0.106071f, 0.172254f, -0.163790f, -0.142205f, -0.043182f, + 0.096145f, 0.145037f, -0.066015f, -0.073194f, 0.132237f, -0.088522f, + -0.044292f, -0.487128f, 0.033389f, -0.573548f, 0.185449f, 0.273593f, + 0.147503f, 0.457049f, -0.021539f, 0.090786f, 0.009147f, 0.000899f, + 0.018088f, 0.115791f, -0.079165f, 0.139388f, +}; + +static const float weights_layer_2[] = { + 0.153048f, 0.112901f, 0.136781f, 0.154580f, 0.091610f, 0.045165f, + 0.088490f, 0.116991f, -0.463766f, -0.596567f, -0.567008f, -0.630565f, + 0.141874f, 0.095726f, 0.175427f, 0.145027f, -0.969824f, -1.018190f, + -1.073300f, -1.041130f, -0.070545f, -0.123600f, -0.114967f, -0.169453f, + -0.267458f, -0.147730f, -0.161419f, -0.164894f, -0.117508f, -0.204389f, + -0.122695f, -0.163107f, -0.003903f, -0.030470f, -0.037433f, -0.059568f, + 0.138243f, 0.091019f, 0.160372f, 0.141650f, -0.544565f, -0.620004f, + -0.504503f, -0.429979f, -0.099491f, -0.096384f, -0.155265f, -0.188536f, + 0.084923f, 0.038345f, 0.066706f, 0.122083f, 0.267087f, 0.184419f, + 0.261478f, 0.255746f, -0.245894f, -0.114980f, -0.193880f, -0.227785f, + 0.087536f, 0.095712f, 0.106105f, 0.099353f, -0.059473f, -0.173247f, + -0.202386f, -0.076010f, 0.125928f, 0.100793f, 0.119638f, 0.129623f, + 0.136593f, 0.102984f, 0.156550f, 0.140558f, 0.122524f, 0.051596f, + 0.084164f, 0.123630f, 0.072542f, 0.096063f, 0.083236f, 0.087630f, + 0.025900f, 0.023738f, 
0.036385f, 0.053077f, -0.029501f, 0.010544f, + -0.010026f, -0.051268f, 0.086302f, 0.109909f, 0.101385f, 0.127513f, + -0.031869f, 0.005340f, -0.056267f, -0.032955f, 0.032748f, 0.023162f, + 0.092118f, -0.001780f, -0.123612f, -0.183433f, -0.202377f, -0.317516f, + 0.129052f, 0.208112f, 0.145582f, 0.175502f, 0.018476f, 0.036349f, + 0.072417f, 0.061194f, 0.086985f, 0.117086f, 0.072465f, 0.129068f, + 0.020182f, 0.052114f, 0.017878f, 0.010478f, -0.001381f, -0.034644f, + 0.025135f, -0.037748f, 0.004973f, 0.024778f, 0.041816f, 0.032111f, + 0.080268f, 0.124998f, 0.105719f, 0.177047f, -0.072114f, -0.011864f, + -0.076846f, -0.089840f, 0.069993f, 0.089362f, 0.088035f, 0.120621f, + 0.065916f, 0.100946f, -0.006784f, -0.007751f, 0.122039f, 0.126482f, + 0.078629f, 0.140299f, 0.074034f, 0.092464f, 0.089798f, 0.108968f, + 0.075729f, 0.057128f, 0.013570f, 0.021195f, 0.068901f, 0.054022f, + 0.029781f, 0.031404f, -0.209998f, -0.208731f, -0.198310f, -0.212454f, + -0.579168f, -0.490190f, -0.607567f, -0.520541f, 0.083863f, 0.056612f, + 0.030366f, 0.061790f, -0.004874f, -0.057203f, -0.060429f, -0.049145f, + 0.080086f, 0.138602f, 0.223796f, 0.133279f, -0.495954f, -0.612093f, + -0.545393f, -0.562310f, 0.070672f, 0.037702f, 0.139013f, 0.080192f, + -0.111387f, -0.048165f, 0.074359f, -0.042125f, 0.113633f, 0.106579f, + 0.042633f, 0.102734f, -0.068220f, 0.128423f, -0.181821f, -0.013260f, + -0.108563f, -0.138667f, -0.109304f, -0.131909f, -0.168667f, -0.126870f, + -0.132533f, -0.167096f, -0.184741f, -0.140890f, -0.125361f, -0.150632f, + 0.309013f, 0.364376f, 0.361102f, 0.271566f, 0.116552f, 0.091160f, + 0.096846f, 0.095954f, 0.046972f, 0.080489f, 0.028766f, -0.012223f, + 0.071379f, 0.041535f, -0.000668f, 0.033698f, -0.013493f, -0.027535f, + -0.025804f, -0.012267f, -0.097465f, -0.099232f, -0.208863f, -0.225201f, + -0.475608f, 0.077358f, -0.002872f, 0.163890f, -0.420298f, 0.072114f, + 0.121601f, -0.016727f, 0.573853f, -0.080196f, 0.193053f, 0.053012f, + -0.454179f, 0.058563f, 0.067265f, 0.141154f, 0.412541f, 0.086933f, + 0.030407f, -0.030413f, 0.478757f, -0.097731f, 0.277072f, -0.086393f, + 0.552604f, -0.334201f, 0.091765f, -0.270262f, -1.395060f, 0.271837f, + -0.005335f, 0.240499f, 0.175442f, -0.326329f, -0.019353f, -0.270338f, + -0.459273f, 0.096183f, 0.153046f, 0.135818f, 0.759028f, -0.177673f, + -0.099966f, 0.103363f, 0.697289f, -0.234184f, -0.048706f, -0.116099f, + -0.282575f, 0.025655f, -0.184759f, 0.040658f, -0.558267f, 0.214087f, + -0.095620f, 0.200522f, 0.278996f, 0.031959f, 0.122936f, -0.209196f, + -0.308217f, 0.092917f, 0.113269f, 0.136274f, -0.037046f, 0.017263f, + -0.194183f, 0.089133f, -0.161244f, 0.042799f, 0.030557f, 0.153545f, + -0.355048f, 0.070928f, -0.152852f, 0.102875f, -0.193649f, 0.007916f, + -0.062952f, 0.050602f, 0.073671f, 0.143045f, -5.978970f, -7.013850f, + 0.058713f, 0.076116f, 0.026445f, -0.056599f, -0.005966f, 0.032234f, + 0.006753f, -0.024528f, 0.120308f, 0.179939f, -6.624630f, -7.638680f, + 0.026359f, 0.020758f, 0.194274f, 0.051489f, -0.008491f, -0.028248f, + -0.061328f, -0.134423f, -0.103951f, -0.110877f, 0.042263f, 0.127016f, + 0.012473f, -0.008595f, 0.031357f, 0.087476f, -0.084022f, -0.015590f, + -0.313546f, 0.120072f, 0.123880f, 0.162148f, -6.596560f, -7.358830f, + 0.004797f, -0.003415f, 0.048455f, 0.026737f, -0.103702f, 0.034416f, + -0.003475f, -0.236827f, 0.005378f, 0.048413f, 0.054612f, -0.079359f, + 0.043707f, 0.001085f, 0.023380f, 0.007785f, 0.025938f, -0.052856f, + -0.033421f, 0.022643f, 0.034161f, 0.127681f, -5.019490f, -5.233580f, + -0.128630f, 0.087741f, -0.239834f, -0.377876f, 
0.128082f, 0.142730f, + -0.086819f, -0.350927f, 0.089849f, 0.155776f, -6.155120f, -5.721720f, + 0.056110f, 0.008761f, 0.045579f, 0.016762f, -0.134076f, -0.101551f, + -0.096058f, -0.117146f, 0.003527f, -0.056942f, -0.005578f, 0.071287f, + 0.023776f, -0.028003f, -0.075390f, -0.191160f, -0.089672f, -0.104372f, + -0.104750f, -0.080813f, -0.249824f, -0.124479f, -0.243593f, -0.244284f, + -0.554911f, -0.549095f, -0.564693f, -0.475107f, -0.121771f, -0.143441f, + -0.171170f, -0.120920f, 0.109831f, 0.079708f, 0.327295f, 0.308907f, + -0.178785f, -0.428316f, -0.418882f, -0.366750f, -0.139296f, -0.129645f, + -0.081237f, -0.101533f, -0.006256f, -0.146756f, -0.322110f, -0.338865f, + -0.306085f, -0.319592f, -0.454803f, -0.363560f, -0.018557f, 0.006605f, + -0.131198f, -0.077708f, 0.138160f, 0.119611f, 0.271098f, 0.232168f, + 0.027812f, 0.035390f, -0.202503f, -0.091172f, -0.142020f, -0.159929f, + -0.106404f, -0.107433f, -0.381743f, -0.353222f, -0.484159f, -0.469926f, + -0.234659f, -0.315674f, -0.178327f, -0.213485f, -0.096207f, -0.190944f, + -0.118917f, -0.161288f, 0.015996f, 0.060737f, 0.051390f, 0.060876f, + 0.229289f, 0.282418f, 0.250945f, 0.197273f, 0.045131f, -0.008305f, + 0.072024f, 0.044547f, -0.050010f, 0.055504f, 0.001343f, -0.014445f, + 0.254909f, 0.309091f, 0.228249f, 0.274843f, 0.089778f, -0.046581f, + 0.072714f, 0.126814f, -0.048931f, -0.045743f, -0.151333f, -0.004490f, + 0.179966f, 0.058150f, -0.178622f, -0.088159f, -0.074416f, -0.005821f, + -0.011799f, -0.002225f, -0.069361f, -0.098937f, -0.081575f, -0.034796f, + 0.253792f, 0.301039f, 0.219163f, 0.256027f, 0.058007f, -0.041431f, + 0.040674f, 0.009019f, -0.099670f, -0.099077f, -0.039437f, 0.017946f, + 0.060717f, 0.045796f, 0.109664f, 0.032138f, -0.071094f, 0.023697f, + 0.011335f, -0.030465f, 0.068677f, 0.039345f, -0.045078f, 0.084037f, + 0.135517f, 0.190417f, 0.175578f, 0.155286f, -0.044505f, 0.010826f, + 0.006717f, -0.134715f, 0.068022f, 0.110095f, 0.079966f, 0.034481f, + 0.185804f, 0.188273f, 0.227283f, 0.135935f, 0.033447f, 0.031571f, + -0.014766f, -0.024565f, 0.021792f, 0.017675f, -0.001333f, -0.040069f, + -0.049384f, -0.045256f, -0.014013f, -0.000107f, -0.096928f, -0.111495f, + -0.051225f, -0.060449f, 0.071446f, 0.017294f, -0.004822f, 0.006932f, + 0.020884f, 0.089425f, 0.061097f, -0.038708f, -0.184029f, -0.089541f, + -0.158035f, -0.214607f, -0.377947f, -0.318586f, -0.336977f, -0.323908f, + 0.181612f, 0.140018f, 0.233524f, 0.193366f, -0.254507f, -0.271902f, + -0.197144f, -0.119539f, 0.042162f, 0.000320f, 0.014708f, -0.014228f, + -0.081119f, -0.089326f, 0.001763f, 0.081009f, -0.142618f, -0.160650f, + -0.214597f, -0.202143f, -0.053495f, -0.012819f, -0.071468f, -0.010883f, + 0.072570f, 0.071507f, 0.091045f, 0.083155f, -0.271237f, -0.289211f, + -0.272345f, -0.299411f, 0.031697f, -0.029795f, -0.030045f, -0.013604f, + -0.106843f, -0.045212f, -0.122459f, -0.096936f, 0.059793f, 0.006157f, + 0.028092f, 0.040589f, -0.014560f, -0.008975f, -0.051404f, -0.014309f, + -0.016883f, 0.018332f, 0.040114f, 0.050348f, 0.044921f, -0.002445f, + -0.112396f, 0.014395f, 0.115160f, 0.145350f, -0.166814f, -0.121449f, + 0.155573f, -0.099446f, -0.161661f, 0.187251f, 0.004711f, 0.024318f, + -0.060871f, -0.028311f, -0.098274f, 0.322030f, -0.069242f, -0.153173f, + -0.227428f, -0.293965f, 0.228491f, 0.111413f, -1.354720f, -0.344235f, + 0.866715f, 0.872344f, 0.078789f, -0.384865f, 0.162388f, 0.109018f, + -0.191549f, -0.002638f, 0.305053f, 0.087337f, 0.066506f, -0.055810f, + -0.010984f, -0.056160f, -0.114617f, -0.058478f, 0.022059f, -0.124368f, + -0.130989f, 0.369432f, 
-0.248898f, -0.003955f, -0.021578f, 0.115991f, + -0.114163f, -0.065232f, 0.339857f, -0.225997f, 0.006282f, -0.125395f, + 0.235082f, -0.347785f, 0.662321f, -0.529182f, 0.153297f, -0.001326f, + -0.026725f, -0.024677f, -0.088065f, -0.116127f, 0.080896f, 0.212542f, + 0.208421f, 0.032047f, -0.211395f, 0.074997f, 0.096659f, 0.096423f, + -0.078643f, 0.106556f, -0.123860f, 0.075609f, 0.066008f, -0.097275f, + -1.000020f, -0.780154f, -0.856922f, -0.964007f, 0.083135f, -0.018922f, + -0.266214f, -0.151480f, 0.051538f, 0.017802f, 0.066774f, -0.021341f, + -0.869494f, -0.935252f, -0.895836f, -0.853871f, -0.160490f, 0.085850f, + -0.029670f, -0.056675f, 0.159989f, 0.166872f, 0.129970f, 0.194377f, + 0.153294f, 0.199593f, 0.037692f, 0.103391f, 0.029335f, -0.085324f, + -0.079326f, -0.077216f, 0.501561f, 0.366168f, 0.330196f, 0.296432f, + -0.977282f, -0.844295f, -1.014870f, -1.098990f, -0.099858f, -0.129552f, + 0.090051f, -0.013378f, 0.081330f, 0.194911f, 0.286501f, 0.177363f, + -0.148250f, -0.111700f, -0.243081f, -0.102918f, 0.161069f, -0.012655f, + -0.071722f, -0.020329f, -0.077828f, -0.041716f, 0.109247f, 0.062229f, + -0.759722f, -0.742756f, -0.563713f, -0.631187f, 0.005911f, 0.268154f, + -0.263769f, 0.087149f, -0.163623f, -0.359600f, -0.464577f, -0.369352f, + -0.515784f, -0.475822f, -0.523485f, -0.649813f, -0.112419f, -0.029285f, + 0.021061f, -0.041515f, 0.149133f, -0.254428f, 0.115776f, -0.061892f, + 0.103675f, -0.283363f, 0.005005f, 0.022034f, -0.178454f, 0.035836f, + -0.113702f, -0.217823f, 0.209407f, -0.296257f, 0.187976f, -0.157370f, + -0.127190f, 0.251780f, 0.055633f, 0.294111f, -0.067773f, 0.467190f, + -0.192625f, -0.071084f, -0.445284f, 0.511090f, -0.319728f, 0.267971f, + 0.494929f, -0.586727f, 0.454543f, -0.520675f, -0.085900f, 0.325989f, + -0.131006f, -0.069501f, 0.199927f, -0.218919f, 0.170055f, -0.106538f, + 0.133312f, 0.127629f, -0.561625f, 0.595666f, -0.090927f, 0.363348f, + -0.249246f, 0.063068f, -0.016458f, -0.291045f, -0.040509f, 0.017866f, + 0.304871f, -0.459214f, 0.214390f, -0.238740f, -0.456541f, 0.545848f, + -0.218026f, 0.202475f, 0.128490f, -0.036417f, 0.173885f, -0.049385f, + 0.235514f, -0.132587f, -0.015066f, 0.164638f, 0.196873f, -0.125330f, + 0.216912f, -0.109398f, 0.121602f, -0.209374f, 0.164400f, -0.123049f, + 0.195520f, -0.212932f, -0.015180f, -0.005784f, 0.049726f, -5.822150f, + 0.124536f, 0.040689f, -0.018560f, -3.155020f, 0.014690f, 0.076202f, + -0.154008f, 1.070630f, -0.071606f, 0.051026f, 0.138285f, -5.836340f, + 0.162173f, 0.085890f, -0.186166f, 0.093221f, 0.019240f, -0.017053f, + -0.090144f, 0.236254f, -0.125344f, 0.056235f, -0.089813f, -0.252281f, + -0.127406f, -0.155088f, 0.009972f, -0.066449f, 0.044222f, 0.025943f, + -0.164921f, 0.165463f, -0.001132f, -0.038386f, 0.115194f, -5.757100f, + 0.163386f, 0.061226f, 0.024626f, 0.132750f, 0.107279f, -0.001622f, + -0.107860f, -0.356009f, -0.138935f, -0.145173f, -0.061198f, -0.646138f, + 0.034279f, 0.078187f, 0.108138f, -0.490444f, 0.074719f, 0.034984f, + -0.109303f, 0.741785f, -0.066939f, 0.015558f, 0.114229f, -4.001080f, + 0.130772f, 0.044675f, -0.165162f, -0.274810f, -0.042987f, -0.048579f, + 0.156603f, -1.288370f, 0.076198f, 0.035065f, 0.032043f, -5.002520f, + 0.086900f, -0.010886f, 0.030850f, -0.782259f, 0.056211f, -0.097759f, + 0.118988f, 0.106638f, 0.091419f, 0.079920f, 0.062325f, 0.097116f, + 0.126035f, 0.122530f, -0.278299f, -0.083314f, -0.300563f, -0.197946f, + 0.081664f, 0.089925f, 0.074754f, 0.074628f, 0.102338f, 0.088845f, + 0.105841f, 0.102381f, 0.003087f, 0.061599f, 0.098326f, 0.040119f, + -0.005298f, -0.028834f, 
0.059938f, -0.013668f, -0.585882f, -0.631436f, + -0.742673f, -0.736666f, 0.025071f, 0.066851f, 0.075046f, 0.091360f, + 0.099045f, 0.098261f, 0.106413f, 0.099487f, -0.016742f, -0.097334f, + -0.086152f, -0.212444f, -0.028043f, -0.007362f, 0.003914f, -0.055864f, + 0.034756f, 0.081361f, 0.080183f, 0.061319f, 0.193396f, 0.173716f, + 0.207765f, 0.231701f, -0.074565f, -0.073257f, -0.086470f, -0.083114f, + 0.081489f, 0.078477f, 0.033452f, 0.058835f, -0.069665f, -0.031691f, + -0.111255f, -0.167754f, 0.184179f, 0.174673f, 0.160288f, 0.190893f, + 0.110930f, 0.103495f, 0.098408f, 0.102918f, 0.053764f, 0.089994f, + 0.140308f, 0.124867f, 0.074176f, 0.117460f, -0.160775f, -0.144132f, + -0.099373f, -0.035913f, 0.081237f, 0.062247f, -0.166421f, 0.062125f, + 0.276479f, 0.060955f, 0.066627f, 0.455347f, 0.219953f, 0.109912f, + 0.273931f, 0.233153f, 0.102236f, 0.447606f, -0.352243f, 0.499236f, + -0.931206f, 0.248595f, 0.254047f, 0.061542f, 0.268804f, 0.309517f, + -0.084414f, -0.245828f, -0.144882f, -0.296579f, -0.091628f, -0.142202f, + -0.541764f, -0.407470f, 0.053481f, 0.238955f, 0.150188f, -0.060598f, + 0.196118f, -0.215617f, -0.086238f, -0.263420f, 0.206877f, 0.241788f, + -0.122544f, -0.448790f, 0.286917f, 0.112063f, -0.268408f, -0.041770f, + 0.089161f, 0.355811f, -0.078245f, -0.148490f, -0.407301f, -1.296870f, + -0.633421f, 0.124253f, 0.275402f, 0.223048f, 0.077016f, 0.160766f, + 0.115374f, 0.061053f, -0.231872f, -0.515052f, -0.278331f, -0.235912f, + -0.416372f, -0.284106f, -0.055942f, 0.110698f, -0.428288f, -0.298137f, + -0.018101f, 0.102677f, -0.019639f, 0.013479f, 0.038549f, 0.048682f, + 0.128684f, 0.116416f, 0.044852f, 0.008133f, 0.061597f, 0.083582f, + 0.014953f, 0.063716f, -0.155318f, -0.061732f, 0.084855f, 0.129505f, + 0.068249f, 0.193775f, -0.088631f, -0.446398f, -0.075710f, -0.061327f, + 0.278715f, 0.540366f, 0.618715f, 0.538374f, -0.037843f, 0.062370f, + -0.033184f, 0.119901f, -0.008641f, -0.064789f, 0.087498f, 0.043486f, + 0.247085f, 0.419992f, 0.299935f, 0.234276f, 0.089283f, 0.070357f, + 0.068888f, 0.134311f, 0.109823f, 0.072431f, 0.081676f, 0.091366f, + -1.707980f, -2.213110f, -2.149930f, -1.556870f, 0.226598f, 0.191675f, + 0.192207f, 0.159566f, -0.070194f, -0.136070f, -0.015172f, -0.204272f, + -0.162191f, -0.043313f, -0.158007f, -0.227210f, 0.040398f, 0.043014f, + 0.039439f, -0.035439f, 0.245558f, 0.439691f, 0.219659f, 0.138210f, + -0.048129f, 0.004954f, -0.102860f, -0.185376f, 0.035548f, 0.006821f, + 0.079199f, 0.032901f, 0.039218f, 0.068113f, 0.023075f, -0.037582f, + 0.225181f, 0.164562f, 0.106718f, 0.032684f, 0.013402f, 0.018797f, + 0.076606f, 0.046512f, -0.070024f, 0.099921f, -0.051231f, 0.074167f, + 0.173313f, 0.220212f, 0.142665f, 0.069809f, -0.195130f, -0.007912f, + -0.006764f, -0.063687f, 0.306374f, 0.402035f, 0.273759f, 0.449469f, + 0.114597f, 0.210745f, 0.355326f, 0.271307f, -0.109943f, -0.171912f, + -0.070726f, -0.128932f, 0.138770f, 0.164971f, 0.308516f, 0.332536f, + 0.081537f, 0.096939f, 0.054136f, 0.052226f, 0.109489f, 0.010223f, + 0.168072f, -0.106279f, 0.525568f, 0.704816f, 0.588942f, 0.473398f, + 0.149497f, 0.120835f, 0.080049f, 0.151340f, -0.182038f, -0.191091f, + -0.196505f, -0.198309f, -0.801819f, -1.441620f, -1.107780f, -1.025650f, + 0.035750f, 0.018049f, -0.029033f, -0.067255f, 0.192049f, 0.009664f, + -0.043741f, 0.051557f, 0.082815f, 0.069547f, -0.073379f, 0.010584f, + 0.192128f, 0.208586f, 0.141904f, 0.100763f, 0.046183f, 0.044776f, + -0.033611f, -0.005812f, 0.012966f, 0.030301f, 0.100665f, 0.103641f, + -0.294776f, -0.361573f, -0.420156f, -0.388743f, 0.239287f, 
0.191975f, + 0.089644f, 0.117591f, 0.069563f, 0.021480f, 0.100287f, 0.174159f, + -0.013571f, 0.090960f, 0.010232f, -0.034760f, -0.077205f, 0.060632f, + -0.145527f, -0.391110f, -0.143052f, -0.236448f, -0.103902f, -0.188463f, + 0.071311f, -0.080171f, 0.021987f, 0.041767f, -0.419487f, -0.515479f, + -0.205470f, -0.732132f, 0.150901f, 0.107202f, 0.156307f, 0.143672f, + 0.474682f, 0.178137f, 0.150063f, 0.414515f, 0.559891f, 0.697019f, + 0.541231f, 0.505310f, -0.478101f, -0.444267f, -0.586539f, -0.445996f, + -0.451873f, -0.530085f, -0.447980f, -0.364955f, 0.372435f, 0.318894f, + 0.351211f, 0.193961f, 0.212295f, 0.212842f, 0.220003f, 0.243743f, + -0.388628f, -0.789620f, -0.536618f, -0.430691f, 0.247004f, 0.266489f, + 0.261033f, 0.263692f, 0.050089f, 0.048958f, 0.065207f, 0.120180f, + -0.526230f, -0.481969f, -0.422411f, -0.272292f, 0.155593f, 0.229614f, + 0.139579f, 0.171805f, -0.251924f, -0.302067f, -0.126157f, -0.346650f, + -1.195450f, -1.281100f, -0.987911f, -1.478440f, 0.285667f, 0.284802f, + 0.301887f, 0.259556f, -0.194127f, -0.090440f, -0.257959f, -0.259572f, + -0.012273f, -0.049993f, -0.099431f, 0.012506f, 0.081526f, 0.166279f, + 0.042594f, 0.185121f, 0.148830f, 0.073161f, 0.201728f, 0.125747f, + -0.295065f, -0.187585f, -0.333066f, -0.312291f, 0.253458f, 0.321585f, + 0.178844f, 0.219944f, -0.763475f, -0.943374f, -0.816825f, -0.709901f, + -0.166132f, 0.129186f, 0.015405f, -0.065623f, -0.246006f, -0.340385f, + -0.118155f, -0.384905f, -0.233883f, -0.400666f, -0.228597f, -0.228428f, + -0.559083f, -0.377784f, -0.541458f, -0.542870f, 0.067400f, 0.122987f, + 0.180901f, 0.186004f, -0.482910f, -0.424823f, -0.477831f, -0.394719f, + 0.091558f, 0.049248f, 0.049370f, 0.160429f, 0.133641f, 0.096625f, + 0.104429f, 0.100782f, -0.238252f, -0.221459f, -0.196974f, -0.250393f, + -3.071750f, -2.418450f, -0.861410f, -1.051580f, 0.071263f, 0.118014f, + -0.028430f, -0.072073f, -0.074463f, 0.034168f, 0.044089f, -0.091109f, + -3.153840f, -2.945850f, -1.977360f, -1.498850f, -0.083429f, 0.131835f, + -0.063865f, -0.065785f, -0.069346f, -0.015520f, -0.119551f, 0.044881f, + -0.105280f, 0.127516f, 0.005255f, -0.142777f, 0.061055f, -0.117250f, + 0.020454f, 0.157879f, -0.213812f, -0.151783f, 0.028583f, 0.137759f, + -3.248250f, -3.005940f, -1.510540f, -1.475390f, 0.081874f, -0.171465f, + -0.135690f, -0.001989f, -0.227574f, -0.132799f, -0.359742f, -0.137197f, + 0.066324f, 0.039194f, -0.050857f, 0.095166f, 0.044475f, 0.011221f, + 0.054904f, 0.061414f, -0.039189f, 0.123751f, -0.017171f, -0.008494f, + -2.598220f, -2.832670f, -1.622030f, -1.201990f, 0.154313f, -0.021436f, + 0.042190f, 0.143947f, -0.090623f, 0.086853f, 0.143137f, 0.099821f, + -1.732820f, -1.429730f, -0.775125f, -0.648036f, 0.082176f, 0.079448f, + -0.040575f, 0.024511f, -0.064105f, -0.117122f, -0.190323f, -0.182589f, + -0.076430f, -0.095615f, -0.112513f, -0.101581f, 0.143037f, 0.148180f, + 0.430958f, 0.359225f, 0.001403f, -0.080541f, -0.295001f, -0.156706f, + 0.426623f, 0.475597f, 0.455210f, 0.454352f, 0.074365f, 0.099440f, + 0.066348f, -0.007078f, 0.008335f, -0.097116f, -0.133687f, -0.110535f, + 0.204145f, 0.281478f, 0.078886f, 0.112857f, -0.103620f, -0.068247f, + 0.191147f, 0.227593f, -0.011816f, -0.058755f, -0.149477f, -0.101828f, + 0.079878f, 0.304949f, 0.557555f, 0.305288f, -0.150955f, -0.118610f, + 0.052073f, 0.064707f, -0.121728f, -0.151132f, -0.193987f, -0.175046f, + 0.043655f, 0.105270f, -0.120715f, -0.040976f, 0.047776f, -0.004443f, + 0.149606f, 0.111240f, -0.047502f, -0.064146f, -0.151858f, -0.151872f, + -0.160207f, -0.113846f, -0.081585f, -0.006708f, 
-0.203760f, -0.068597f, + -0.179979f, -0.127779f, -0.062460f, -0.064513f, -0.121479f, -0.111122f, + -0.212384f, -0.229157f, -0.283428f, -0.184891f, +}; + +static const float weights_layer_3[] = { + -0.039388f, 0.033048f, -0.113003f, -0.011642f, 0.170478f, 0.145713f, + 0.040189f, -0.280129f, -0.049050f, -0.043788f, -0.157425f, 0.323829f, + -0.250725f, -0.166349f, 0.101650f, -0.049690f, 0.205606f, 0.281131f, + 0.623204f, 0.993452f, -0.015115f, -0.138995f, 0.009473f, 0.157673f, + -0.024687f, -0.067214f, 0.125566f, -0.317619f, 0.057002f, 0.031202f, + -0.018167f, 0.068542f, 0.011609f, -0.020233f, -0.000428f, -0.035956f, + -0.843274f, -0.800587f, -0.214917f, -0.221250f, 0.031255f, -0.077330f, + -0.074902f, -0.063979f, -0.055562f, 0.679495f, 0.146609f, 1.315330f, + -0.118399f, -0.034539f, -0.050377f, 0.172867f, -0.204607f, -0.034930f, + 0.176014f, 0.089747f, -0.003889f, 0.044980f, 0.002386f, -0.141723f, + -0.035828f, -0.204701f, 0.099813f, 0.123580f, 0.209851f, -0.110989f, + -0.043655f, -0.461118f, -0.139664f, 0.026855f, -0.081714f, 0.207623f, + 0.089942f, 0.253082f, 0.680568f, 0.811360f, -0.090528f, -0.116818f, + -0.432361f, -0.075588f, -0.269924f, -0.276810f, -0.289192f, -0.282570f, + 0.245566f, 0.267216f, 0.238622f, 0.286528f, -0.157605f, -0.200401f, + -0.138924f, -0.185006f, 0.215203f, 0.203316f, 0.209532f, 0.293135f, + 0.928046f, 0.733323f, -0.094120f, 0.036918f, -0.126643f, -0.083371f, + -0.147530f, -0.153195f, 0.097097f, 0.101852f, 0.109160f, 0.105129f, + -0.051869f, -0.064359f, -0.073469f, -0.059591f, 0.102431f, 0.109444f, + 0.113614f, 0.105617f, 0.383311f, 0.325783f, 0.393234f, 0.382508f, + 0.194720f, 0.189672f, 0.217477f, 0.177786f, 0.326461f, 0.114789f, + 0.317061f, 0.048291f, -0.061143f, -0.134641f, -0.067895f, -0.108446f, + 0.082592f, 0.029918f, -0.006580f, 0.015533f, -0.053583f, -0.055540f, + -0.063395f, -0.023157f, -0.064955f, -0.073981f, -0.115452f, -0.086626f, + -0.036616f, 0.008454f, 0.012029f, -0.008039f, -0.207395f, -0.216419f, + -0.205363f, -0.249099f, 0.343308f, 0.413215f, -0.009918f, -0.109978f, + -0.059711f, -0.045089f, -0.029130f, -0.038483f, -0.070323f, -0.099409f, + -0.008849f, -0.063527f, 0.175963f, 0.185335f, 0.149151f, 0.199997f, + -0.027516f, -0.039812f, -0.027760f, -0.047910f, -0.007337f, 0.071065f, + 0.086225f, 0.125539f, 0.151390f, 0.215488f, 0.203450f, 0.045380f, + 0.095761f, 0.107809f, 0.103918f, 0.122383f, 0.116287f, 0.135455f, + 0.115446f, 0.155673f, -0.044648f, -0.027455f, -0.015473f, -0.026657f, + 0.089852f, 0.077459f, 0.077631f, 0.082507f, -0.102761f, -0.054669f, + -0.132223f, -0.024768f, 0.111573f, 0.060467f, 0.107883f, 0.056621f, + 0.219357f, -0.161153f, 0.074379f, -0.118743f, -0.169931f, -0.153995f, + -0.220003f, -0.200186f, 0.032318f, -0.060687f, -0.087550f, -0.038022f, + 0.026633f, -0.005534f, 0.029532f, 0.027081f, 0.011926f, 0.058412f, + 0.010631f, 0.003068f, -0.014911f, 0.063070f, 0.065271f, 0.089550f, + 0.012885f, 0.005320f, -0.037494f, -0.019849f, -0.009624f, -0.059090f, + -0.021222f, -0.088033f, -0.055261f, -0.055113f, -0.047598f, -0.055478f, + -0.023648f, -0.046827f, -0.036572f, -0.057655f, 0.104194f, 0.179800f, + 0.175751f, 0.192851f, -0.016950f, -0.073650f, -0.028592f, -0.088219f, + 0.011130f, 0.061825f, 0.025643f, 0.034183f, 0.095548f, 0.001457f, + -0.132869f, 0.032981f, -0.140178f, -0.105343f, -0.161799f, -0.161983f, + 0.177746f, 0.132903f, 0.135627f, 0.152489f, -0.012532f, -0.068747f, + -0.085849f, -0.095434f, 0.087037f, 0.139497f, 0.111899f, 0.100189f, + -0.024649f, -0.092003f, 0.020783f, -0.115807f, 0.092039f, 0.093943f, + 0.109466f, 
0.049639f, -0.133727f, 0.128430f, -0.050546f, 0.190632f, + 0.123733f, 0.082305f, 0.114878f, 0.122572f, 0.201618f, 0.137588f, + 0.065582f, 0.125161f, -0.095179f, -0.120719f, -0.127126f, -0.101961f, + -0.118120f, -0.104833f, -0.179632f, -0.131764f, -0.138096f, -0.147861f, + -0.131512f, -0.153905f, -0.201816f, -0.206641f, -0.196707f, -0.160013f, + -0.212605f, -0.093998f, -0.186258f, -0.076137f, -0.065340f, -0.006969f, + -0.071383f, -0.075005f, +}; + +static const float weights_layer_4[] = { + -0.016102f, -0.022836f, 0.624049f, 0.273485f, 0.222800f, -0.290175f, + -0.518415f, 0.413484f, -0.264495f, 0.498083f, -0.450145f, -0.106419f, + 0.095103f, -0.187451f, 0.145933f, -0.371542f, -0.088871f, 0.184017f, + -0.429625f, -0.110882f, 0.292781f, 0.289588f, 0.185127f, 0.326017f, + -0.432009f, -0.342663f, -0.312206f, 0.004004f, -1.114290f, 0.028497f, + -0.264944f, -0.419611f, 0.046336f, 0.138232f, -0.869528f, 0.425557f, + -0.954838f, -0.186830f, -0.464622f, -0.757107f, -0.432686f, -0.125978f, + -0.402633f, -0.172266f, -0.041749f, -0.822238f, -0.118486f, 0.238617f, + -0.198037f, 0.146347f, 0.405257f, 0.513303f, -0.078876f, -0.300385f, + -0.010293f, -0.183962f, 0.155738f, 0.186797f, -0.086814f, 0.000179f, + 0.123467f, 0.362523f, 0.068805f, 0.371834f, 0.038122f, -0.117867f, + -0.120445f, -0.422322f, -0.131402f, 0.285449f, 0.038957f, 0.008844f, + -0.020197f, 0.187723f, 0.190433f, 0.146532f, -0.091068f, -0.270865f, + -0.194231f, -0.226777f, 0.013548f, 0.248351f, 0.537685f, 0.056316f, + -0.171540f, -0.003865f, 0.406439f, 0.126507f, 0.192780f, 0.149335f, + -0.149602f, 0.255202f, -0.015426f, 0.032335f, -1.791330f, -0.894602f, + -0.196641f, -0.282846f, -0.391100f, -0.040969f, 0.049934f, 0.056348f, + -0.041426f, -0.075159f, -0.658335f, -0.827270f, -0.175029f, -0.427235f, + 0.311201f, 0.560413f, 0.363408f, 0.374580f, -0.433531f, -0.180580f, + 0.142142f, 0.194768f, -0.054118f, -0.376541f, -0.366185f, -0.308782f, + -0.273143f, -0.074097f, 0.009000f, -0.182198f, -0.015616f, -0.003882f, + -0.174340f, -0.354866f, 0.527972f, 0.348355f, 0.091381f, -0.419828f, + -0.530529f, 0.159899f, -0.511867f, -0.104237f, -0.286079f, -0.659039f, + -0.266596f, -0.256557f, -0.600437f, -0.446333f, -0.229629f, 0.024931f, + -0.143716f, -0.415754f, -0.003760f, -0.107195f, -0.666165f, -0.697312f, + -0.650255f, -0.703877f, 0.243402f, 0.426710f, 0.217210f, 0.260255f, + 0.027416f, 0.163147f, 0.132188f, 0.142374f, 0.558627f, 0.065717f, + 0.382781f, -1.192240f, 0.195492f, 0.028439f, 0.278252f, -0.491806f, + 0.497701f, -0.448835f, -0.245079f, -0.014336f, -0.174907f, -0.409633f, + 0.207548f, 0.433813f, 0.459889f, 0.431728f, 0.605050f, 0.485520f, + 0.218548f, 0.437307f, 0.027023f, -0.204251f, 0.012100f, 0.150677f, + -1.097980f, 0.086866f, -1.293130f, -0.372575f, -0.876264f, -0.021818f, + 0.322864f, -0.231043f, -0.271608f, 0.132782f, -0.314895f, 0.396800f, + 0.262788f, -0.317212f, -0.666308f, 0.830742f, 0.319409f, -0.564373f, + -0.178656f, 0.306993f, 0.265634f, -0.332480f, -0.491514f, -0.186745f, + -0.063044f, -0.009321f, 0.074944f, -0.372082f, -0.029479f, 0.081548f, + 0.028172f, -0.233148f, -0.337938f, -0.087695f, 0.596556f, 0.559530f, + 0.139332f, 0.107223f, -0.190915f, 0.137401f, -0.150625f, -0.225484f, + -0.191344f, -0.232535f, 0.126510f, 0.296323f, -0.547901f, -0.653080f, + 0.358514f, 0.726289f, -0.421725f, -0.243620f, 0.236206f, 0.390823f, + -0.076560f, -0.282329f, -0.012460f, -0.428484f, 0.349469f, 0.394629f, + 0.421537f, 0.219632f, -0.117550f, -0.087894f, 0.077155f, 0.016000f, + -0.289137f, -0.092937f, -0.014518f, -0.027111f, 0.210329f, 
-0.159678f, + 0.013288f, -0.039268f, 0.008112f, 0.003152f, 0.030084f, -0.039859f, + 0.322028f, -0.407797f, 0.447087f, -0.381562f, 0.529297f, -0.520298f, + 0.562865f, -0.616878f, 0.689389f, 0.754262f, 0.138475f, 0.750697f, + -0.760157f, -0.383740f, 0.074219f, 0.556257f, 0.087827f, -0.511826f, + -0.305507f, -0.638214f, 0.114833f, -0.444022f, 0.526612f, -0.604984f, + -0.100415f, 0.037824f, -0.106264f, 0.337615f, 0.070743f, 0.031129f, + 0.281954f, 0.176144f, -0.032833f, -0.073902f, -0.285492f, -0.803803f, + -0.015589f, 0.186077f, -0.033351f, 0.517269f, -1.878800f, -1.685210f, + -0.416581f, 0.158476f, -0.071929f, -0.624353f, -0.122069f, -0.075065f, + 0.311816f, 0.506305f, 0.383896f, 0.259450f, -0.308232f, -0.094221f, + -0.421885f, -0.293573f, +}; + +static const float weights_layer_5[] = { + 0.131894f, 0.078431f, 0.323121f, -0.230680f, -0.684740f, 0.020895f, + 0.364983f, 0.121656f, 0.132448f, -0.731198f, 0.071148f, 0.739642f, + 0.318437f, -0.033021f, -1.037080f, 0.135335f, 0.383582f, 0.287332f, + 0.054042f, -0.825482f, 0.418533f, 0.305606f, 0.041549f, 0.432422f, + -0.826878f, -0.593536f, 0.105657f, 0.125357f, 0.408567f, -0.293338f, + 0.233905f, -0.039609f, 0.547727f, -0.435806f, 0.036160f, 0.220275f, + -0.020337f, -0.619403f, -0.455858f, 0.681455f, 0.543846f, -0.495084f, + 0.251496f, -0.085686f, 0.091395f, -0.476696f, 0.453628f, -0.109663f, + 0.383493f, -0.456563f, -0.212935f, 0.020567f, -0.719564f, -0.377813f, + -0.737511f, 0.765965f, 0.624309f, -0.063679f, -0.055681f, -0.475969f, + -0.069902f, 0.725690f, 0.641094f, 0.439922f, -0.111544f, -0.309061f, + 0.280091f, 0.381416f, 0.481168f, 0.483543f, -0.901267f, -0.499230f, + 0.043449f, -0.372395f, 0.021216f, -0.002200f, -0.524089f, -0.071485f, + -0.273974f, -0.462654f, 0.042369f, -0.138679f, -0.330060f, 0.021886f, + -0.306075f, -0.011130f, -0.260224f, -0.288435f, -0.104039f, -0.183563f, + 0.118990f, -0.531160f, 0.339632f, -0.028374f, 0.159084f, -0.008824f, + -0.791388f, 0.245242f, 0.356510f, 0.469867f, -0.396949f, -0.476146f, + -0.168472f, 1.068400f, 0.474629f, -0.117554f, -0.142453f, -0.306604f, + 0.348525f, -0.111929f, -0.435384f, 0.019952f, -0.260185f, 0.373376f, + 0.109729f, -0.639168f, 0.033392f, -0.082573f, -0.196018f, 0.301637f, + -0.124210f, -0.202515f, -1.221920f, -0.253690f, -0.144864f, 0.287753f, + -0.161206f, -0.213246f, 0.373968f, 0.141397f, -0.248237f, 0.283090f, + -0.008977f, -0.172960f, -0.234146f, -0.720014f, -0.322451f, 0.181083f, + 0.310659f, -0.422646f, -0.719994f, -0.354339f, 0.352739f, 0.230923f, + 0.427013f, -0.660316f, 0.232140f, 0.685896f, 0.660208f, 0.225748f, + -0.918750f, -0.650790f, -0.674525f, -0.450305f, -0.152529f, 0.498480f, + 0.895092f, 0.688242f, 0.669057f, 0.612669f, 0.593484f, 0.318204f, + -0.169294f, 0.388789f, -0.529777f, -0.219706f, -0.044916f, 0.161697f, + -0.145288f, 0.196153f, -0.022212f, -0.434209f, -0.208115f, -0.117745f, + -0.279029f, -0.009506f, 0.137474f, 0.330148f, 0.439258f, 0.345879f, + -0.845131f, -0.215713f, 0.094463f, 0.638604f, 0.882254f, -0.964082f, + -0.383920f, 0.292645f, 0.266341f, 0.747473f, -0.645631f, -0.538896f, + -0.319764f, 0.521880f, 0.460091f, -0.470898f, -0.778283f, -0.061622f, + -0.142433f, 0.210520f, 0.804197f, 0.285840f, -0.138414f, -0.381846f, + -0.499991f, 0.223648f, 0.439025f, 0.321508f, -0.099560f, -0.622893f, + 0.750925f, 0.740994f, 0.140405f, 0.074631f, -0.270223f, -0.829049f, + -0.753355f, -0.258015f, 0.006285f, -0.730573f, -1.107390f, -0.538015f, + -1.005520f, -0.724115f, -0.440183f, -0.395239f, 0.508768f, 0.204620f, + -0.267331f, 0.001740f, -0.838709f, 0.659333f, 
0.043739f, -0.024099f, + 0.262431f, 0.252433f, -0.265215f, 0.057289f, -0.428192f, -0.114350f, + -0.011475f, 0.463995f, 0.668833f, -0.604556f, -0.122780f, -0.441645f, + 0.145769f, 0.310450f, -1.003500f, 0.936069f, 0.516604f, -0.643386f, + -0.518571f, 0.306130f, 0.337387f, 0.583400f, -0.366025f, -0.560035f, + -0.262332f, 0.465242f, 0.964332f, -0.545410f, -0.637428f, -0.202695f, + 0.378931f, 0.834604f, 0.000970f, -0.553303f, -0.562879f, 0.221665f, + 0.395160f, 0.446281f, -0.184394f, -0.591780f, 0.170595f, 1.164390f, + 0.227068f, -0.150910f, -0.393690f, -0.131151f, 0.309956f, -0.413518f, + -0.768334f, -0.548975f, 0.245384f, -0.256904f, -0.514790f, -0.102616f, + -0.347625f, 0.420456f, 0.037804f, -0.283200f, -0.578815f, 0.319282f, + 0.674622f, -0.011791f, -0.339329f, 0.466705f, 0.563444f, 0.409660f, + 0.445784f, -0.899507f, -0.605116f, 0.622438f, 0.427385f, -0.062509f, + 0.666570f, 0.057105f, 0.357894f, -0.811016f, -0.421715f, -0.458397f, + 0.288955f, 0.005857f, 0.236331f, 0.107957f, 0.587276f, -0.375800f, + 0.323799f, -0.623363f, 0.254122f, -0.198478f, -0.098436f, -0.282531f, + 0.452453f, -0.163349f, -0.413382f, -0.448732f, -0.528770f, -0.457449f, + -0.619619f, -0.265919f, -0.042760f, 0.438730f, 0.501798f, -0.403851f, + 0.519564f, 0.817314f, 0.366203f, 0.492610f, 0.546929f, 0.853094f, + 0.289000f, 0.453941f, -0.076152f, 0.007226f, -0.183717f, -0.506252f, + -0.599989f, -0.576006f, 0.746488f, 0.631466f, -0.475599f, -0.334991f, + -0.879614f, 0.918957f, 0.473471f, -0.043781f, -0.688234f, -0.925875f, + -0.188081f, 0.050918f, 0.116855f, 0.221413f, -0.066680f, -0.674395f, + -0.481985f, 0.247368f, 0.271129f, 0.637979f, -1.006970f, -0.855441f, + 0.144874f, 0.507424f, 1.506960f, -0.338910f, 0.398203f, 0.738000f, + 0.263193f, -0.425908f, 0.358271f, -1.072900f, -0.816209f, -0.425519f, + 0.264373f, 0.694014f, 0.036333f, 0.635532f, 0.518856f, 0.047585f, + -0.854817f, -0.138202f, 0.006811f, -0.052020f, -0.468498f, 0.489080f, + -0.105778f, 0.357038f, -0.782875f, 0.649049f, -0.562652f, -0.544392f, + -0.328526f, -0.402121f, -0.263172f, -0.668459f, -0.526702f, -0.395829f, + 0.190986f, 0.307766f, -1.001830f, -0.293051f, 0.283334f, 0.572450f, + 0.906095f, -1.144300f, 0.180989f, 0.421092f, 0.684571f, 0.527276f, + -0.122287f, 0.575067f, 0.675221f, 0.755029f, 0.094957f, 0.481403f, + 0.825155f, 0.755035f, 0.641420f, 0.034497f, 0.518783f, 0.283800f, + 0.293733f, -0.074778f, -0.268720f, 0.798921f, 0.317714f, -0.236391f, + -0.375071f, -0.414600f, 0.223413f, -0.349044f, -0.191033f, -0.391779f, + -0.596894f, -0.378608f, -0.185920f, -0.822171f, -0.754962f, -0.167706f, + 0.755378f, 0.671847f, 0.969414f, 0.793048f, 1.078610f, -0.418963f, + 0.367648f, 0.217645f, 0.294232f, 0.113027f, 0.060312f, -0.327488f, + -0.305035f, -0.243600f, -0.020588f, -0.326324f, -0.417534f, -0.425868f, + -0.404614f, -0.346750f, -0.339145f, -0.348094f, -0.527290f, -0.617825f, + -0.258342f, -0.200753f, -0.249779f, -0.321039f, -0.023117f, -0.004167f, + -0.206788f, -0.612420f, -0.646428f, -0.548969f, -0.158875f, 0.213814f, + -0.084040f, -0.217365f, -0.511895f, -0.653285f, 0.440971f, 0.455591f, + -0.123900f, 0.134097f, -0.251241f, 0.682463f, 0.740614f, 0.991212f, + 0.565984f, 0.592690f, +}; + +static INLINE float32x4_t add_f32x4_x4(const float32x4_t a[4]) { + float32x4_t sum01 = vaddq_f32(a[0], a[1]); + float32x4_t sum23 = vaddq_f32(a[2], a[3]); + return vaddq_f32(sum01, sum23); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int 
skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width >= 16); + const int in_size = in_height * in_width; + + do { + const float32x4_t bias_v = vdupq_n_f32(bias[0]); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum0[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + float32x4_t sum1[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + const float32x4x2_t in0_lo_0 = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi_0 = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo_0 = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi_0 = + vld2q_f32(in_ptr2 + in_size + in_stride); + + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[0], weights0_lo, 0); + sum0[0] = vmlaq_lane_f32(sum0[0], in0_lo_0.val[1], weights0_lo, 1); + + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[0], weights0_hi, 0); + sum0[1] = vmlaq_lane_f32(sum0[1], in0_hi_0.val[1], weights0_hi, 1); + + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[0], weights1_lo, 0); + sum0[2] = vmlaq_lane_f32(sum0[2], in1_lo_0.val[1], weights1_lo, 1); + + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[0], weights1_hi, 0); + sum0[3] = vmlaq_lane_f32(sum0[3], in1_hi_0.val[1], weights1_hi, 1); + + const float32x4x2_t in0_lo_1 = vld2q_f32(in_ptr2 + 8); + const float32x4x2_t in0_hi_1 = vld2q_f32(in_ptr2 + in_stride + 8); + const float32x4x2_t in1_lo_1 = vld2q_f32(in_ptr2 + in_size + 8); + const float32x4x2_t in1_hi_1 = + vld2q_f32(in_ptr2 + in_size + in_stride + 8); + + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[0], weights0_lo, 0); + sum1[0] = vmlaq_lane_f32(sum1[0], in0_lo_1.val[1], weights0_lo, 1); + + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[0], weights0_hi, 0); + sum1[1] = vmlaq_lane_f32(sum1[1], in0_hi_1.val[1], weights0_hi, 1); + + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[0], weights1_lo, 0); + sum1[2] = vmlaq_lane_f32(sum1[2], in1_lo_1.val[1], weights1_lo, 1); + + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[0], weights1_hi, 0); + sum1[3] = vmlaq_lane_f32(sum1[3], in1_hi_1.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum0)); + vst1q_f32(out_ptr1 + 4, add_f32x4_x4(sum1)); + + out_ptr1 += 8; + in_ptr1 += 8 * skip_width; + w += 8 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +static INLINE void 
av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 2 && filter_width == 2); + assert(skip_width == 2 && skip_height == 2); + assert(in_width == 8); + const int in_size = in_height * in_width; + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *weight_ptr0 = weights; + const float *in_ptr0 = *input; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + const float *weight_ptr1 = weight_ptr0; + const float *in_ptr2 = in_ptr1; + int k = 0; + float32x4_t sum[4] = { bias_v, vdupq_n_f32(0), vdupq_n_f32(0), + vdupq_n_f32(0) }; + + do { + const float32x4_t weights0 = vld1q_f32(weight_ptr1); + const float32x4_t weights1 = vld1q_f32(weight_ptr1 + 4); + const float32x2_t weights0_lo = vget_low_f32(weights0); + const float32x2_t weights0_hi = vget_high_f32(weights0); + const float32x2_t weights1_lo = vget_low_f32(weights1); + const float32x2_t weights1_hi = vget_high_f32(weights1); + + const float32x4x2_t in0_lo = vld2q_f32(in_ptr2); + const float32x4x2_t in0_hi = vld2q_f32(in_ptr2 + in_stride); + const float32x4x2_t in1_lo = vld2q_f32(in_ptr2 + in_size); + const float32x4x2_t in1_hi = vld2q_f32(in_ptr2 + in_size + in_stride); + + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[0], weights0_lo, 0); + sum[0] = vmlaq_lane_f32(sum[0], in0_lo.val[1], weights0_lo, 1); + + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[0], weights0_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0_hi.val[1], weights0_hi, 1); + + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[0], weights1_lo, 0); + sum[2] = vmlaq_lane_f32(sum[2], in1_lo.val[1], weights1_lo, 1); + + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[0], weights1_hi, 0); + sum[3] = vmlaq_lane_f32(sum[3], in1_hi.val[1], weights1_hi, 1); + + weight_ptr1 += 8; + in_ptr2 += 2 * in_size; + k += 2; + } while (k < in_channels); + + vst1q_f32(out_ptr1, add_f32x4_x4(sum)); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++bias; + ++output; + weights += in_channels * filter_height * filter_width; + } while (++start_idx < out_channels); +} + +static INLINE void av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + const float **input, int in_width, int in_height, int in_stride, + const float *bias, const int skip_width, const int skip_height, + const int filter_width, const int filter_height, const int in_channels, + const int out_channels, float **output, int out_stride, int start_idx, + const float *weights) { + assert(filter_height == 5 && filter_width == 5); + assert(skip_width == 4 && skip_height == 4); + assert(in_width >= 16); + assert(in_channels == 1); + (void)in_channels; + + do { + const float32x4_t bias_v = vdupq_n_f32(*bias); + const float *in_ptr0 = *input; + const float *weights_ptr0 = weights; + float *out_ptr0 = *output; + int h = 0; + + do { + const float *in_ptr1 = in_ptr0; + float *out_ptr1 = out_ptr0; + int w = 0; + + do { + float32x4_t sum[2] = { bias_v, vdupq_n_f32(0) }; + + const float32x4_t weight_0_3 = vld1q_f32(weights_ptr0); + const 
float32x4_t weight_4_7 = vld1q_f32(weights_ptr0 + 4); + const float32x4_t weight_8_11 = vld1q_f32(weights_ptr0 + 8); + const float32x4_t weight_12_15 = vld1q_f32(weights_ptr0 + 12); + const float32x4_t weight_16_19 = vld1q_f32(weights_ptr0 + 16); + const float32x4_t weight_20_23 = vld1q_f32(weights_ptr0 + 20); + + const float32x2_t weight_0_3_lo = vget_low_f32(weight_0_3); + const float32x2_t weight_0_3_hi = vget_high_f32(weight_0_3); + const float32x2_t weight_4_7_lo = vget_low_f32(weight_4_7); + const float32x2_t weight_4_7_hi = vget_high_f32(weight_4_7); + const float32x2_t weight_8_11_lo = vget_low_f32(weight_8_11); + const float32x2_t weight_8_11_hi = vget_high_f32(weight_8_11); + const float32x2_t weight_12_15_lo = vget_low_f32(weight_12_15); + const float32x2_t weight_12_15_hi = vget_high_f32(weight_12_15); + const float32x2_t weight_16_19_lo = vget_low_f32(weight_16_19); + const float32x2_t weight_16_19_hi = vget_high_f32(weight_16_19); + const float32x2_t weight_20_23_lo = vget_low_f32(weight_20_23); + const float32x2_t weight_20_23_hi = vget_high_f32(weight_20_23); + + const float32x4x4_t in0 = vld4q_f32(in_ptr1 + 0 * in_stride); + const float32x4x4_t in1 = vld4q_f32(in_ptr1 + 1 * in_stride); + const float32x4x4_t in2 = vld4q_f32(in_ptr1 + 2 * in_stride); + const float32x4x4_t in3 = vld4q_f32(in_ptr1 + 3 * in_stride); + const float32x4x4_t in4 = vld4q_f32(in_ptr1 + 4 * in_stride); + + const float32x4_t in0_4 = vextq_f32( + in0.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 0 * in_stride)), 1); + const float32x4_t in1_4 = vextq_f32( + in1.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 1 * in_stride)), 1); + const float32x4_t in2_4 = vextq_f32( + in2.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 2 * in_stride)), 1); + const float32x4_t in3_4 = vextq_f32( + in3.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 3 * in_stride)), 1); + const float32x4_t in4_4 = vextq_f32( + in4.val[0], vdupq_n_f32(*(in_ptr1 + 16 + 4 * in_stride)), 1); + + // Kernel row 0. + sum[0] = vmlaq_lane_f32(sum[0], in0.val[0], weight_0_3_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0.val[1], weight_0_3_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in0.val[2], weight_0_3_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in0.val[3], weight_0_3_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in0_4, weight_4_7_lo, 0); + + // Kernel row 1. + sum[1] = vmlaq_lane_f32(sum[1], in1.val[0], weight_4_7_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in1.val[1], weight_4_7_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in1.val[2], weight_4_7_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in1.val[3], weight_8_11_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in1_4, weight_8_11_lo, 1); + + // Kernel row 2. + sum[0] = vmlaq_lane_f32(sum[0], in2.val[0], weight_8_11_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in2.val[1], weight_8_11_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in2.val[2], weight_12_15_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in2.val[3], weight_12_15_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in2_4, weight_12_15_hi, 0); + + // Kernel row 3. + sum[1] = vmlaq_lane_f32(sum[1], in3.val[0], weight_12_15_hi, 1); + sum[0] = vmlaq_lane_f32(sum[0], in3.val[1], weight_16_19_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in3.val[2], weight_16_19_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in3.val[3], weight_16_19_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in3_4, weight_16_19_hi, 1); + + // Kernel row 4. 
+ sum[0] = vmlaq_lane_f32(sum[0], in4.val[0], weight_20_23_lo, 0); + sum[1] = vmlaq_lane_f32(sum[1], in4.val[1], weight_20_23_lo, 1); + sum[0] = vmlaq_lane_f32(sum[0], in4.val[2], weight_20_23_hi, 0); + sum[1] = vmlaq_lane_f32(sum[1], in4.val[3], weight_20_23_hi, 1); + sum[0] = vmlaq_f32(sum[0], vdupq_n_f32(*(weights_ptr0 + 24)), in4_4); + + vst1q_f32(out_ptr1, vaddq_f32(sum[0], sum[1])); + + out_ptr1 += 4; + in_ptr1 += 4 * skip_width; + w += 4 * skip_width; + } while (w < in_width - filter_width + 1); + + out_ptr0 += out_stride; + in_ptr0 += skip_height * in_stride; + h += skip_height; + } while (h < in_height - filter_height + 1); + + ++output; + ++bias; + weights += 25; + } while (++start_idx < out_channels); +} + +// Neon variant of av1_cnn_convolve_no_maxpool_padding_valid_c(). +// As per the current encoder, av1_cnn_convolve function gets called for +// block size equal to 64x64. av1_cnn_convolve() uses layer config values +// set by av1_intra_mode_cnn_partition_cnn_config. The following are a few +// details related to each layer's config parameters. +// Layer_Number in_size out_size filter_wd filter_ht skip_wd skip_ht +// 0 64x64 16x16 5 5 4 4 +// 1 16x16 8x8 2 2 2 2 +// 2 8x8 4x4 2 2 2 2 +// 3 4x4 2x2 2 2 2 2 +// 4 2x2 1x1 2 2 2 2 +// Here, +// filter_wd = filter_width and filter_ht = filter_height, +// skip_wd = skip_width and skip_ht = skip_height. +void av1_cnn_convolve_no_maxpool_padding_valid_neon( + const float **input, int in_width, int in_height, int in_stride, + const CNN_LAYER_CONFIG *layer_config, float **output, int out_stride, + int start_idx, int cstep, int channel_step) { + assert((layer_config->skip_height == 1 && layer_config->skip_width == 1) || + !layer_config->maxpool); + assert(layer_config->filter_height > 1 || layer_config->filter_width > 1); + assert(layer_config->pad == PADDING_VALID); + assert(channel_step == 1); + assert(cstep == layer_config->in_channels * layer_config->out_channels); + + if (layer_config->filter_width == 5 && layer_config->filter_height == 5 && + layer_config->skip_width == 4 && layer_config->skip_height == 4) { + av1_cnn_convolve_no_maxpool_padding_valid_5x5_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights_layer_5); + } else if (layer_config->filter_width == 2 && + layer_config->filter_height == 2 && + layer_config->skip_width == 2 && layer_config->skip_height == 2) { + const float *weights = weights_layer_1; + if (layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[2].output_num) { + weights = weights_layer_2; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[3] + .output_num)) { + weights = weights_layer_3; + } else if ((layer_config->output_num == + av1_intra_mode_cnn_partition_cnn_config.layer_config[4] + .output_num)) { + weights = weights_layer_4; + } + if (in_width >= 16) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_large_neon( + input, in_width, in_height, in_stride, layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else if (in_width == 8) { + av1_cnn_convolve_no_maxpool_padding_valid_2x2_neon( + input, in_width, in_height, in_stride, 
layer_config->bias, + layer_config->skip_width, layer_config->skip_height, + layer_config->filter_width, layer_config->filter_height, + layer_config->in_channels, layer_config->out_channels, output, + out_stride, start_idx, weights); + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, + out_stride, start_idx, cstep, channel_step); + } + } else { + av1_cnn_convolve_no_maxpool_padding_valid_c( + input, in_width, in_height, in_stride, layer_config, output, out_stride, + start_idx, cstep, channel_step); + } +} diff -Nru aom-3.8.2/av1/encoder/arm/neon/highbd_pickrst_neon.c aom-3.9.0/av1/encoder/arm/neon/highbd_pickrst_neon.c --- aom-3.8.2/av1/encoder/arm/neon/highbd_pickrst_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/highbd_pickrst_neon.c 2024-05-07 19:57:02.996000000 +0000 @@ -15,7 +15,6 @@ #include "aom_dsp/arm/mem_neon.h" #include "aom_dsp/arm/sum_neon.h" -#include "aom_dsp/arm/transpose_neon.h" #include "av1/encoder/arm/neon/pickrst_neon.h" #include "av1/encoder/pickrst.h" @@ -273,57 +272,724 @@ } } -static int16_t highbd_find_average_neon(const int16_t *src, int src_stride, - int width, int height) { +static INLINE int16x8_t tbl2q(int16x8_t a, int16x8_t b, uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b) } }; + return vreinterpretq_s16_u8(vqtbl2q_u8(table, idx)); +#else + uint8x8x4_t table = { { vreinterpret_u8_s16(vget_low_s16(a)), + vreinterpret_u8_s16(vget_high_s16(a)), + vreinterpret_u8_s16(vget_low_s16(b)), + vreinterpret_u8_s16(vget_high_s16(b)) } }; + return vreinterpretq_s16_u8(vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx)))); +#endif +} + +static INLINE int16x8_t tbl3q(int16x8_t a, int16x8_t b, int16x8_t c, + uint8x16_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x3_t table = { { vreinterpretq_u8_s16(a), vreinterpretq_u8_s16(b), + vreinterpretq_u8_s16(c) } }; + return vreinterpretq_s16_u8(vqtbl3q_u8(table, idx)); +#else + // This is a specific implementation working only for compute stats with + // wiener_win == 5. + uint8x8x3_t table_lo = { { vreinterpret_u8_s16(vget_low_s16(a)), + vreinterpret_u8_s16(vget_high_s16(a)), + vreinterpret_u8_s16(vget_low_s16(b)) } }; + uint8x8x3_t table_hi = { { vreinterpret_u8_s16(vget_low_s16(b)), + vreinterpret_u8_s16(vget_high_s16(b)), + vreinterpret_u8_s16(vget_low_s16(c)) } }; + return vreinterpretq_s16_u8(vcombine_u8( + vtbl3_u8(table_lo, vget_low_u8(idx)), + vtbl3_u8(table_hi, vsub_u8(vget_high_u8(idx), vdup_n_u8(16))))); +#endif +} + +static INLINE int64_t div_shift_s64(int64_t x, int power) { + return (x < 0 ? x + (1ll << power) - 1 : x) >> power; +} + +// The M matrix is accumulated in a bitdepth-dependent number of steps to +// speed up the computation. This function computes the final M from the +// accumulated (src_s64) and the residual parts (src_s32). It also transposes +// the result as the output needs to be column-major. +static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int shift) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift); + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. 
For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in a bitdepth-dependent number of +// steps to speed up the computation. This function computes the final H from +// the accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static INLINE void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int stride, int shift) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-right triangle, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ = div_shift_s64(src_s64[tr_idx] + src_s32[tr_idx], shift); + } + } + } + } +} + +// Load 7x7 matrix into 7 128-bit vectors from consecutive rows, the last load +// address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_s16_8x7(int16x8_t dst[7], const int16_t *src, + ptrdiff_t stride) { + dst[0] = vld1q_s16(src); + src += stride; + dst[1] = vld1q_s16(src); + src += stride; + dst[2] = vld1q_s16(src); + src += stride; + dst[3] = vld1q_s16(src); + src += stride; + dst[4] = vld1q_s16(src); + src += stride; + dst[5] = vld1q_s16(src); + src += stride; + dst[6] = vld1q_s16(src - 1); +} + +static INLINE void highbd_compute_stats_win7_neon( + const uint16_t *dgd, const uint16_t *src, int avg, int width, int height, + int dgd_stride, int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + // Matrix names are capitalized to help readability. 
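// Note (illustrative, not part of the change): acc_transpose_M() and
// update_H() above scale their output with div_shift_s64(x, shift), where the
// callers later in this file pass shift = bit_depth - AOM_BITS_8 (0, 2 or 4).
// The (1 << power) - 1 bias applied to negative inputs makes the shift
// truncate toward zero, i.e. it behaves like the plain C division by 1, 4 or
// 16 that the removed code below performs. A minimal scalar model, assuming
// only <stdint.h> (the helper name div_shift_s64_model is ad hoc):

#include <stdint.h>

static int64_t div_shift_s64_model(int64_t x, int power) {
  return x / ((int64_t)1 << power);  // truncates toward zero, like the helper
}

// For example, div_shift_s64(-7, 2) == div_shift_s64_model(-7, 2) == -1,
// whereas a bare arithmetic shift (-7 >> 2) would round down to -2.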
+ DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7_highbd[192]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, + 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, + 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, + 12, 13, 14, 15, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7_highbd + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7_highbd + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7_highbd + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7_highbd + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7_highbd + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7_highbd + 80); + const uint8x16_t lut6 = vld1q_u8(shuffle_stats7_highbd + 96); + const uint8x16_t lut7 = vld1q_u8(shuffle_stats7_highbd + 112); + const uint8x16_t lut8 = vld1q_u8(shuffle_stats7_highbd + 128); + const uint8x16_t lut9 = vld1q_u8(shuffle_stats7_highbd + 144); + const uint8x16_t lut10 = vld1q_u8(shuffle_stats7_highbd + 160); + const uint8x16_t lut11 = vld1q_u8(shuffle_stats7_highbd + 176); + + // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results + // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can + // be as high as 32768/2048/128 for the compute stats. + const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + int acc_cnt = acc_cnt_max; + const int src_next = src_stride - width; + const int dgd_next = dgd_stride - width; + const int16x8_t avg_s16 = vdupq_n_s16(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + int16x8_t dgd_rows[7]; + load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6; + dgd += 2; + + dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16); + dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16); + dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16); + dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16); + dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16); + dgd_rows[5] = vsubq_s16(dgd_rows[5], avg_s16); + dgd_rows[6] = vsubq_s16(dgd_rows[6], avg_s16); + + // Re-arrange the combined 8x7 matrix to have the 2 whole 7x7 matrices (1 + // for each of the 2 pixels) separated into distinct int16x8_t[6] arrays. 
+ // These arrays contain 48 elements of the 49 (7x7). Compute `dgd - avg` + // for both buffers. Each DGD_AVG buffer contains 49 consecutive elements. + int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + + dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut6); + dgd_avg0[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg1[1] = tbl2q(dgd_rows[1], dgd_rows[2], lut7); + dgd_avg0[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg1[2] = tbl2q(dgd_rows[2], dgd_rows[3], lut8); + dgd_avg0[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut3); + dgd_avg1[3] = tbl2q(dgd_rows[3], dgd_rows[4], lut9); + dgd_avg0[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut4); + dgd_avg1[4] = tbl2q(dgd_rows[4], dgd_rows[5], lut10); + dgd_avg0[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut5); + dgd_avg1[5] = tbl2q(dgd_rows[5], dgd_rows[6], lut11); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. 
The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += + DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; + + // Accumulate into 64-bit after a bit depth dependent number of iterations + // to prevent overflow. + if (--acc_cnt == 0) { + acc_cnt = acc_cnt_max; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); + + // Last element of the row is computed separately. + lh[48] += lh32[48]; + lh32[48] = 0; + + lh += WIENER_WIN2_ALIGN2; + lh32 += WIENER_WIN2_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + int16x8_t dgd_rows[7]; + load_and_pack_s16_8x7(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 6; + ++dgd; + + // Re-arrange the combined 8x7 matrix to have a whole 7x7 matrix tightly + // packed into a int16x8_t[6] array. This array contains 48 elements of + // the 49 (7x7). Compute `dgd - avg` for the whole buffer. The DGD_AVG + // buffer contains 49 consecutive elements. + int16x8_t dgd_avg0[6]; + + dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16); + dgd_avg0[1] = vsubq_s16(tbl2q(dgd_rows[1], dgd_rows[2], lut1), avg_s16); + dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[2], dgd_rows[3], lut2), avg_s16); + dgd_avg0[3] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut3), avg_s16); + dgd_avg0[4] = vsubq_s16(tbl2q(dgd_rows[4], dgd_rows[5], lut4), avg_s16); + dgd_avg0[5] = vsubq_s16(tbl2q(dgd_rows[5], dgd_rows[6], lut5), avg_s16); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + + // The remaining last (49th) element of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
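// Note (illustrative, not part of the change): in scalar terms, the
// single-pixel path below accumulates the same statistics as the paired path
// above: M collects residual-times-source products and H collects the upper
// triangle of the residual outer product. A minimal sketch that ignores the
// row-/column-major bookkeeping, the deferred transpose and the bit-depth
// scaling (the helper name stats_one_pixel is ad hoc):

#include <stdint.h>

static void stats_one_pixel(int64_t *M, int64_t *H, const uint16_t *dgd,
                            int dgd_stride, uint16_t src_px, int avg,
                            int win) {
  const int win2 = win * win;  // 49 for the 7x7 window, 25 for 5x5
  int32_t d[49];
  for (int r = 0; r < win; ++r) {
    for (int c = 0; c < win; ++c) {
      d[r * win + c] = dgd[r * dgd_stride + c] - avg;  // window residuals
    }
  }
  const int32_t s = src_px - avg;
  for (int k = 0; k < win2; ++k) {
    M[k] += (int64_t)d[k] * s;  // cross-correlation
    for (int l = k; l < win2; ++l) {
      H[k * win2 + l] += (int64_t)d[k] * d[l];  // auto-covariance (upper
    }                                           // triangle only)
  }
}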
+ int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]); + update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]); + update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[48] += DGD_AVG0[48] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 49 * 49. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48); + + // The last element of the triangle of H_s32 matrix can be computed as + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + int bit_depth_shift = bit_depth - AOM_BITS_8; + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, bit_depth_shift); + + update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, bit_depth_shift); +} + +// Load 5x5 matrix into 5 128-bit vectors from consecutive rows, the last load +// address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_s16_6x5(int16x8_t dst[5], const int16_t *src, + ptrdiff_t stride) { + dst[0] = vld1q_s16(src); + src += stride; + dst[1] = vld1q_s16(src); + src += stride; + dst[2] = vld1q_s16(src); + src += stride; + dst[3] = vld1q_s16(src); + src += stride; + dst[4] = vld1q_s16(src - 3); +} + +static void highbd_compute_stats_win5_neon(const uint16_t *dgd, + const uint16_t *src, int avg, + int width, int height, + int dgd_stride, int src_stride, + int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, + H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, + H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x3 matrix with consecutive elements from 5x5 + // matrix. 
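// Note (illustrative, not part of the change): the table that follows, like
// shuffle_stats7_highbd above, holds byte indices into the concatenated bytes
// of the int16x8_t inputs passed to tbl2q()/tbl3q(), so each index pair
// (2k, 2k + 1) selects one 16-bit sample. A scalar model of tbl2q() for a
// little-endian target (the helper name tbl2_model is ad hoc):

#include <stdint.h>
#include <string.h>

static void tbl2_model(int16_t dst[8], const int16_t a[8], const int16_t b[8],
                       const uint8_t idx[16]) {
  uint8_t table[32];  // bytes of { a, b }
  memcpy(table, a, 16);
  memcpy(table + 16, b, 16);
  for (int k = 0; k < 8; ++k) {
    uint16_t lo = table[idx[2 * k]];      // low byte of the selected sample
    uint16_t hi = table[idx[2 * k + 1]];  // high byte
    dst[k] = (int16_t)(uint16_t)(lo | (hi << 8));
  }
}

// With the first row below, { 0, 1, ..., 8, 9, 16, 17, ..., 20, 21 }, the
// result is a[0..4] followed by b[0..2]: eight consecutive samples of the
// 5-wide window gathered from two adjacent rows.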
+ DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5_highbd[96]) = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, + 6, 7, 8, 9, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 32, 33, + 2, 3, 4, 5, 6, 7, 8, 9, 22, 23, 24, 25, 26, 27, 28, 29, + 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, + 8, 9, 10, 11, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 34, 35, + 4, 5, 6, 7, 8, 9, 10, 11, 24, 25, 26, 27, 28, 29, 30, 31, + }; + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5_highbd + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5_highbd + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5_highbd + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats5_highbd + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats5_highbd + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats5_highbd + 80); + + // We can accumulate up to 65536/4096/256 8/10/12-bit multiplication results + // in 32-bit. We are processing 2 pixels at a time, so the accumulator max can + // be as high as 32768/2048/128 for the compute stats. + const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1; + int acc_cnt = acc_cnt_max; + const int src_next = src_stride - width; + const int dgd_next = dgd_stride - width; + const int16x8_t avg_s16 = vdupq_n_s16(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + int16x8_t dgd_rows[5]; + load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4; + dgd += 2; + + dgd_rows[0] = vsubq_s16(dgd_rows[0], avg_s16); + dgd_rows[1] = vsubq_s16(dgd_rows[1], avg_s16); + dgd_rows[2] = vsubq_s16(dgd_rows[2], avg_s16); + dgd_rows[3] = vsubq_s16(dgd_rows[3], avg_s16); + dgd_rows[4] = vsubq_s16(dgd_rows[4], avg_s16); + + // Re-arrange the combined 6x5 matrix to have the 2 whole 5x5 matrices (1 + // for each of the 2 pixels) separated into distinct int16x8_t[3] arrays. + // These arrays contain 24 elements of the 25 (5x5). Compute `dgd - avg` + // for both buffers. Each DGD_AVG buffer contains 25 consecutive elements. + int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + + dgd_avg0[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg1[0] = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + dgd_avg0[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1); + dgd_avg1[1] = tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut4); + dgd_avg0[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut2); + dgd_avg1[2] = tbl2q(dgd_rows[3], dgd_rows[4], lut5); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
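// Note (illustrative, not part of the change): the acc_cnt /
// accumulate_and_clear() logic used by both window sizes spills the 32-bit
// partial sums into 64-bit storage before they can overflow. A minimal scalar
// sketch of the pattern, assuming 16-bit residuals and the same
// (1 << (32 - 2 * bit_depth)) >> 1 budget as above (the helper name
// widening_acc_example is ad hoc):

#include <stdint.h>

static int64_t widening_acc_example(const int16_t *a, const int16_t *b, int n,
                                    int bit_depth) {
  // Roughly (1 << (32 - 2 * bit_depth)) products of two bit_depth-bit
  // residuals fit in 32 bits; the budget is halved because the kernels above
  // add two products (two pixels) per iteration.
  const int acc_cnt_max = (1 << (32 - 2 * bit_depth)) >> 1;
  int acc_cnt = acc_cnt_max;
  int32_t acc32 = 0;
  int64_t acc64 = 0;
  for (int i = 0; i < n; ++i) {
    acc32 += (int32_t)a[i] * b[i];
    if (--acc_cnt == 0) {  // flush before the 32-bit accumulator can overflow
      acc_cnt = acc_cnt_max;
      acc64 += acc32;
      acc32 = 0;
    }
  }
  return acc64 + acc32;  // final flush
}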
+ int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 25 * 25. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; + + // Accumulate into 64-bit after a bit depth dependent number of iterations + // to prevent overflow. + if (--acc_cnt == 0) { + acc_cnt = acc_cnt_max; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); + + // Last element of the row is computed separately. + lh[24] += lh32[24]; + lh32[24] = 0; + + lh += WIENER_WIN2_REDUCED_ALIGN2; + lh32 += WIENER_WIN2_REDUCED_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + int16x8_t dgd_rows[5]; + load_and_pack_s16_6x5(dgd_rows, (const int16_t *)dgd, dgd_stride); + + const int16_t *dgd_ptr = (const int16_t *)dgd + dgd_stride * 4; + ++dgd; + + // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 + // matrix tightly packed into a int16x8_t[3] array. This array contains + // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 25 consecutive elements. 
+ int16x8_t dgd_avg0[3]; + + dgd_avg0[0] = vsubq_s16(tbl2q(dgd_rows[0], dgd_rows[1], lut0), avg_s16); + dgd_avg0[1] = vsubq_s16( + tbl3q(dgd_rows[1], dgd_rows[2], dgd_rows[3], lut1), avg_s16); + dgd_avg0[2] = vsubq_s16(tbl2q(dgd_rows[3], dgd_rows[4], lut2), avg_s16); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + + // The remaining last (25th) element of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[24] += DGD_AVG0[24] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 25 * 25. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. 
+ H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + int bit_depth_shift = bit_depth - AOM_BITS_8; + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, bit_depth_shift); + + update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2, + bit_depth_shift); +} + +static uint16_t highbd_find_average_neon(const uint16_t *src, int src_stride, + int width, int height) { assert(width > 0); assert(height > 0); - int64x2_t sum_s64 = vdupq_n_s64(0); - int64_t sum = 0; + uint64x2_t sum_u64 = vdupq_n_u64(0); + uint64_t sum = 0; int h = height; do { - int32x4_t sum_s32[2] = { vdupq_n_s32(0), vdupq_n_s32(0) }; + uint32x4_t sum_u32[2] = { vdupq_n_u32(0), vdupq_n_u32(0) }; int w = width; - const int16_t *row = src; + const uint16_t *row = src; while (w >= 32) { - int16x8_t s0 = vld1q_s16(row + 0); - int16x8_t s1 = vld1q_s16(row + 8); - int16x8_t s2 = vld1q_s16(row + 16); - int16x8_t s3 = vld1q_s16(row + 24); - - s0 = vaddq_s16(s0, s1); - s2 = vaddq_s16(s2, s3); - sum_s32[0] = vpadalq_s16(sum_s32[0], s0); - sum_s32[1] = vpadalq_s16(sum_s32[1], s2); + uint16x8_t s0 = vld1q_u16(row + 0); + uint16x8_t s1 = vld1q_u16(row + 8); + uint16x8_t s2 = vld1q_u16(row + 16); + uint16x8_t s3 = vld1q_u16(row + 24); + + s0 = vaddq_u16(s0, s1); + s2 = vaddq_u16(s2, s3); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); + sum_u32[1] = vpadalq_u16(sum_u32[1], s2); row += 32; w -= 32; } if (w >= 16) { - int16x8_t s0 = vld1q_s16(row + 0); - int16x8_t s1 = vld1q_s16(row + 8); + uint16x8_t s0 = vld1q_u16(row + 0); + uint16x8_t s1 = vld1q_u16(row + 8); - s0 = vaddq_s16(s0, s1); - sum_s32[0] = vpadalq_s16(sum_s32[0], s0); + s0 = vaddq_u16(s0, s1); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); row += 16; w -= 16; } if (w >= 8) { - int16x8_t s0 = vld1q_s16(row); - sum_s32[1] = vpadalq_s16(sum_s32[1], s0); + uint16x8_t s0 = vld1q_u16(row); + sum_u32[1] = vpadalq_u16(sum_u32[1], s0); row += 8; w -= 8; } if (w >= 4) { - int16x8_t s0 = vcombine_s16(vld1_s16(row), vdup_n_s16(0)); - sum_s32[0] = vpadalq_s16(sum_s32[0], s0); + uint16x8_t s0 = vcombine_u16(vld1_u16(row), vdup_n_u16(0)); + sum_u32[0] = vpadalq_u16(sum_u32[0], s0); row += 4; w -= 4; @@ -333,409 +999,209 @@ sum += *row++; } - sum_s64 = vpadalq_s32(sum_s64, vaddq_s32(sum_s32[0], sum_s32[1])); + sum_u64 = vpadalq_u32(sum_u64, vaddq_u32(sum_u32[0], sum_u32[1])); src += src_stride; } while (--h != 0); - return (int16_t)((horizontal_add_s64x2(sum_s64) + sum) / (height * width)); -} - -static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, - const int wiener_win, - const int wiener_win2) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]), - vget_high_s16(dgd[row1])); - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } + return (uint16_t)((horizontal_add_u64x2(sum_u64) + sum) / (height * width)); } -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 7 * 7. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. 
However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 49 * 49. It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. -static INLINE void highbd_compute_stats_win7_neon( - const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride, - int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) { - const int wiener_win = 7; - const int wiener_win2 = wiener_win * wiener_win; - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[49]; - memset(M_trn, 0, sizeof(M_trn)); - - int16x8_t vavg = vdupq_n_s16(avg); - do { - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = - vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg); - int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg); - - // Compute all the elements of one row of M. - compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src[j] - avg; - int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg; - int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg; - int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg; - int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg; - int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg; - int16_t d5 = dgd[row * dgd_stride + 5 + j] - avg; - int16_t d6 = dgd[row * dgd_stride + 6 + j] - avg; - - M_trn[row * wiener_win + 0] += d0 * s; - M_trn[row * wiener_win + 1] += d1 * s; - M_trn[row * wiener_win + 2] += d2 * s; - M_trn[row * wiener_win + 3] += d3 * s; - M_trn[row * wiener_win + 4] += d4 * s; - M_trn[row * wiener_win + 5] += d5 * s; - M_trn[row * wiener_win + 6] += d6 * s; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - int16x8_t dgd0[7]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg); - dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. 
- compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. - int16x8_t dgd1[7]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg); - dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } +void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, + const uint8_t *src8, int h_start, int h_end, + int v_start, int v_end, int dgd_stride, + int src_stride, int64_t *M, int64_t *H, + aom_bit_depth_t bit_depth) { + assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); - if (j < width) { - // Process remaining columns using a mask to discard excess elements. - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[7]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - dgd0[4] = vandq_s16(dgd0[4], mask); - dgd0[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col0), vavg); - dgd0[5] = vandq_s16(dgd0[5], mask); - dgd0[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col0), vavg); - dgd0[6] = vandq_s16(dgd0[6], mask); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. - int16x8_t dgd1[7]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - dgd1[5] = vsubq_s16(vld1q_s16(dgd + 5 * dgd_stride + j + col1), vavg); - dgd1[6] = vsubq_s16(vld1q_s16(dgd + 6 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (49 - // elements). 
- compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd += dgd_stride; - src += src_stride; - } while (--height != 0); + const int wiener_halfwin = wiener_win >> 1; + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8); + const int height = v_end - v_start; + const int width = h_end - h_start; - // Transpose M_trn. - transpose_M_win7(M, M_trn, 7); + const uint16_t *dgd_start = dgd + h_start + v_start * dgd_stride; + const uint16_t *src_start = src + h_start + v_start * src_stride; - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); + // The wiener window will slide along the dgd frame, centered on each pixel. + // For the top left pixel and all the pixels on the side of the frame this + // means half of the window will be outside of the frame. As such the actual + // buffer that we need to subtract the avg from will be 2 * wiener_halfwin + // wider and 2 * wiener_halfwin higher than the original dgd buffer. + const int vert_offset = v_start - wiener_halfwin; + const int horiz_offset = h_start - wiener_halfwin; + const uint16_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; - // Scaling the results. - uint8_t bit_depth_divider = 1; - if (bit_depth == AOM_BITS_12) { - bit_depth_divider = 16; - } else if (bit_depth == AOM_BITS_10) { - bit_depth_divider = 4; - } + uint16_t avg = highbd_find_average_neon(dgd_start, dgd_stride, width, height); - for (int i = 0; i < wiener_win2; ++i) { - M[i] /= bit_depth_divider; - for (int j = 0; j < wiener_win2; ++j) { - H[i * wiener_win2 + j] /= bit_depth_divider; - } + if (wiener_win == WIENER_WIN) { + highbd_compute_stats_win7_neon(dgd_win, src_start, avg, width, height, + dgd_stride, src_stride, M, H, bit_depth); + } else { + highbd_compute_stats_win5_neon(dgd_win, src_start, avg, width, height, + dgd_stride, src_stride, M, H, bit_depth); } } -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 5 * 5. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 25 * 25. It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. 
-static INLINE void highbd_compute_stats_win5_neon( - const int16_t *dgd, int dgd_stride, const int16_t *src, int src_stride, - int width, int height, int64_t *M, int64_t *H, int16_t avg, int bit_depth) { - const int wiener_win = 5; - const int wiener_win2 = wiener_win * wiener_win; - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); +int64_t av1_highbd_pixel_proj_error_neon( + const uint8_t *src8, int width, int height, int src_stride, + const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { + const uint16_t *src = CONVERT_TO_SHORTPTR(src8); + const uint16_t *dat = CONVERT_TO_SHORTPTR(dat8); + int64_t sse = 0; + int64x2_t sse_s64 = vdupq_n_s64(0); - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[25]; - memset(M_trn, 0, sizeof(M_trn)); + if (params->r[0] > 0 && params->r[1] > 0) { + int32x2_t xq_v = vld1_s32(xq); + int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), 4); - int16x8_t vavg = vdupq_n_s16(avg); - do { - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vsubq_s16(vld1q_s16(dgd + row * dgd_stride), vavg); + do { int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = - vsubq_s16(vld1q_s16(dgd + row * dgd_stride + j + 8), vavg); - int16x8_t s = vsubq_s16(vld1q_s16(src + j), vavg); + int32x4_t sse_s32 = vdupq_n_s32(0); + + do { + const uint16x8_t d = vld1q_u16(&dat[j]); + const uint16x8_t s = vld1q_u16(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt0[j]); + int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); + int32x4_t flt1_0 = vld1q_s32(&flt1[j]); + int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); + + int32x4_t d_s32_lo = vreinterpretq_s32_u32( + vmull_lane_u16(vget_low_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); + int32x4_t d_s32_hi = vreinterpretq_s32_u32(vmull_lane_u16( + vget_high_u16(d), vreinterpret_u16_s32(xq_sum_v), 0)); + + int32x4_t v0 = vsubq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), + d_s32_lo); + int32x4_t v1 = vsubq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), + d_s32_hi); + + v0 = vmlaq_lane_s32(v0, flt0_0, xq_v, 0); + v1 = vmlaq_lane_s32(v1, flt0_1, xq_v, 0); + v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); + v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), + vreinterpretq_s16_u16(vsubq_u16(d, s))); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); - // Compute all the elements of one row of M. - compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row); + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); - dgd0 = dgd1; j += 8; - } - // Process remaining elements without Neon. 
- while (j < width) { - int16_t s = src[j] - avg; - int16_t d0 = dgd[row * dgd_stride + 0 + j] - avg; - int16_t d1 = dgd[row * dgd_stride + 1 + j] - avg; - int16_t d2 = dgd[row * dgd_stride + 2 + j] - avg; - int16_t d3 = dgd[row * dgd_stride + 3 + j] - avg; - int16_t d4 = dgd[row * dgd_stride + 4 + j] - avg; - - M_trn[row * wiener_win + 0] += d0 * s; - M_trn[row * wiener_win + 1] += d1 * s; - M_trn[row * wiener_win + 2] += d2 * s; - M_trn[row * wiener_win + 3] += d3 * s; - M_trn[row * wiener_win + 4] += d4 * s; + } while (j <= width - 8); - j++; + for (int k = j; k < width; ++k) { + int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + v += xq[0] * (flt0[k]) + xq[1] * (flt1[k]); + v -= (xq[1] + xq[0]) * (int32_t)(dat[k] << 4); + int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += ((int64_t)e * e); } - } - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. - int16x8_t dgd1[5]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } + sse_s64 = vpadalq_s32(sse_s64, sse_s32); - if (j < width) { - // Process remaining columns using a mask to discard excess elements. - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col0), vavg); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col0), vavg); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col0), vavg); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col0), vavg); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col0), vavg); - dgd0[4] = vandq_s16(dgd0[4], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. 
For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column. - int16x8_t dgd1[5]; - dgd1[0] = vsubq_s16(vld1q_s16(dgd + 0 * dgd_stride + j + col1), vavg); - dgd1[1] = vsubq_s16(vld1q_s16(dgd + 1 * dgd_stride + j + col1), vavg); - dgd1[2] = vsubq_s16(vld1q_s16(dgd + 2 * dgd_stride + j + col1), vavg); - dgd1[3] = vsubq_s16(vld1q_s16(dgd + 3 * dgd_stride + j + col1), vavg); - dgd1[4] = vsubq_s16(vld1q_s16(dgd + 4 * dgd_stride + j + col1), vavg); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd += dgd_stride; - src += src_stride; - } while (--height != 0); + dat += dat_stride; + src += src_stride; + flt0 += flt0_stride; + flt1 += flt1_stride; + } while (--height != 0); + } else if (params->r[0] > 0 || params->r[1] > 0) { + int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; + int32x4_t xq_v = vdupq_n_s32(xq_active); - // Transpose M_trn. - transpose_M_win5(M, M_trn, 5); + do { + int j = 0; + int32x4_t sse_s32 = vdupq_n_s32(0); + do { + const uint16x8_t d0 = vld1q_u16(&dat[j]); + const uint16x8_t s0 = vld1q_u16(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt[j]); + int32x4_t flt0_1 = vld1q_s32(&flt[j + 4]); + + uint16x8_t d_u16 = vshlq_n_u16(d0, 4); + int32x4_t sub0 = vreinterpretq_s32_u32( + vsubw_u16(vreinterpretq_u32_s32(flt0_0), vget_low_u16(d_u16))); + int32x4_t sub1 = vreinterpretq_s32_u32( + vsubw_u16(vreinterpretq_u32_s32(flt0_1), vget_high_u16(d_u16))); + + int32x4_t v0 = vmlaq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub0, + xq_v); + int32x4_t v1 = vmlaq_s32( + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)), sub1, + xq_v); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), + vreinterpretq_s16_u16(vsubq_u16(d0, s0))); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); - // Scaling the results. 
- uint8_t bit_depth_divider = 1; - if (bit_depth == AOM_BITS_12) { - bit_depth_divider = 16; - } else if (bit_depth == AOM_BITS_10) { - bit_depth_divider = 4; - } + j += 8; + } while (j <= width - 8); - for (int i = 0; i < wiener_win2; ++i) { - M[i] /= bit_depth_divider; - for (int j = 0; j < wiener_win2; ++j) { - H[i * wiener_win2 + j] /= bit_depth_divider; - } - } -} + for (int k = j; k < width; ++k) { + int32_t v = 1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1); + v += xq_active * (int32_t)((uint32_t)flt[j] - (uint16_t)(dat[k] << 4)); + const int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += ((int64_t)e * e); + } -void av1_compute_stats_highbd_neon(int wiener_win, const uint8_t *dgd8, - const uint8_t *src8, int h_start, int h_end, - int v_start, int v_end, int dgd_stride, - int src_stride, int64_t *M, int64_t *H, - aom_bit_depth_t bit_depth) { - assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_REDUCED); + sse_s64 = vpadalq_s32(sse_s64, sse_s32); - const int wiener_halfwin = wiener_win >> 1; - const int wiener_win2 = wiener_win * wiener_win; - memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); + dat += dat_stride; + flt += flt_stride; + src += src_stride; + } while (--height != 0); + } else { + do { + int j = 0; - const int16_t *src = (const int16_t *)CONVERT_TO_SHORTPTR(src8); - const int16_t *dgd = (const int16_t *)CONVERT_TO_SHORTPTR(dgd8); - const int height = v_end - v_start; - const int width = h_end - h_start; - const int vert_offset = v_start - wiener_halfwin; - const int horiz_offset = h_start - wiener_halfwin; + do { + const uint16x8_t d = vld1q_u16(&dat[j]); + const uint16x8_t s = vld1q_u16(&src[j]); + + uint16x8_t diff = vabdq_u16(d, s); + uint16x4_t diff_lo = vget_low_u16(diff); + uint16x4_t diff_hi = vget_high_u16(diff); - int16_t avg = highbd_find_average_neon(dgd + v_start * dgd_stride + h_start, - dgd_stride, width, height); + uint32x4_t sqr_lo = vmull_u16(diff_lo, diff_lo); + uint32x4_t sqr_hi = vmull_u16(diff_hi, diff_hi); - src += v_start * src_stride + h_start; - dgd += vert_offset * dgd_stride + horiz_offset; + sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_lo)); + sse_s64 = vpadalq_s32(sse_s64, vreinterpretq_s32_u32(sqr_hi)); - if (wiener_win == WIENER_WIN) { - highbd_compute_stats_win7_neon(dgd, dgd_stride, src, src_stride, width, - height, M, H, avg, bit_depth); - } else { - highbd_compute_stats_win5_neon(dgd, dgd_stride, src, src_stride, width, - height, M, H, avg, bit_depth); + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t e = dat[k] - src[k]; + sse += e * e; + } + + dat += dat_stride; + src += src_stride; + } while (--height != 0); } + + sse += horizontal_add_s64x2(sse_s64); + return sse; } diff -Nru aom-3.8.2/av1/encoder/arm/neon/pickrst_neon.c aom-3.9.0/av1/encoder/arm/neon/pickrst_neon.c --- aom-3.8.2/av1/encoder/arm/neon/pickrst_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/pickrst_neon.c 2024-05-07 19:57:03.002000000 +0000 @@ -15,139 +15,815 @@ #include "config/av1_rtcd.h" #include "aom_dsp/arm/sum_neon.h" -#include "aom_dsp/arm/transpose_neon.h" #include "av1/common/restoration.h" #include "av1/encoder/arm/neon/pickrst_neon.h" #include "av1/encoder/pickrst.h" int64_t av1_lowbd_pixel_proj_error_neon( - const uint8_t *src8, int width, int height, int src_stride, - const uint8_t *dat8, int dat_stride, int32_t *flt0, int flt0_stride, + const uint8_t *src, int width, int height, int src_stride, + const uint8_t *dat, int 
dat_stride, int32_t *flt0, int flt0_stride, int32_t *flt1, int flt1_stride, int xq[2], const sgr_params_type *params) { - int i, j, k; - const int32_t shift = SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS; - const int32x4_t zero = vdupq_n_s32(0); - uint64x2_t sum64 = vreinterpretq_u64_s32(zero); - const uint8_t *src = src8; - const uint8_t *dat = dat8; + int64_t sse = 0; + int64x2_t sse_s64 = vdupq_n_s64(0); - int64_t err = 0; if (params->r[0] > 0 && params->r[1] > 0) { - for (i = 0; i < height; ++i) { - int32x4_t err0 = zero; - for (j = 0; j <= width - 8; j += 8) { - const uint8x8_t d0 = vld1_u8(&dat[j]); - const uint8x8_t s0 = vld1_u8(&src[j]); - const int16x8_t flt0_16b = - vcombine_s16(vqmovn_s32(vld1q_s32(&flt0[j])), - vqmovn_s32(vld1q_s32(&flt0[j + 4]))); - const int16x8_t flt1_16b = - vcombine_s16(vqmovn_s32(vld1q_s32(&flt1[j])), - vqmovn_s32(vld1q_s32(&flt1[j + 4]))); - const int16x8_t u0 = - vreinterpretq_s16_u16(vshll_n_u8(d0, SGRPROJ_RST_BITS)); - const int16x8_t flt0_0_sub_u = vsubq_s16(flt0_16b, u0); - const int16x8_t flt1_0_sub_u = vsubq_s16(flt1_16b, u0); - const int16x4_t flt0_16b_sub_u_lo = vget_low_s16(flt0_0_sub_u); - const int16x4_t flt0_16b_sub_u_hi = vget_high_s16(flt0_0_sub_u); - const int16x4_t flt1_16b_sub_u_lo = vget_low_s16(flt1_0_sub_u); - const int16x4_t flt1_16b_sub_u_hi = vget_high_s16(flt1_0_sub_u); - - int32x4_t v0 = vmull_n_s16(flt0_16b_sub_u_lo, (int16_t)xq[0]); - v0 = vmlal_n_s16(v0, flt1_16b_sub_u_lo, (int16_t)xq[1]); - int32x4_t v1 = vmull_n_s16(flt0_16b_sub_u_hi, (int16_t)xq[0]); - v1 = vmlal_n_s16(v1, flt1_16b_sub_u_hi, (int16_t)xq[1]); - const int16x4_t vr0 = vqrshrn_n_s32(v0, 11); - const int16x4_t vr1 = vqrshrn_n_s32(v1, 11); - const int16x8_t e0 = vaddq_s16(vcombine_s16(vr0, vr1), - vreinterpretq_s16_u16(vsubl_u8(d0, s0))); - const int16x4_t e0_lo = vget_low_s16(e0); - const int16x4_t e0_hi = vget_high_s16(e0); - err0 = vmlal_s16(err0, e0_lo, e0_lo); - err0 = vmlal_s16(err0, e0_hi, e0_hi); - } - for (k = j; k < width; ++k) { - const int32_t u = dat[k] << SGRPROJ_RST_BITS; - int32_t v = xq[0] * (flt0[k] - u) + xq[1] * (flt1[k] - u); - const int32_t e = ROUND_POWER_OF_TWO(v, 11) + dat[k] - src[k]; - err += e * e; + int32x2_t xq_v = vld1_s32(xq); + int32x2_t xq_sum_v = vshl_n_s32(vpadd_s32(xq_v, xq_v), SGRPROJ_RST_BITS); + + do { + int j = 0; + int32x4_t sse_s32 = vdupq_n_s32(0); + + do { + const uint8x8_t d = vld1_u8(&dat[j]); + const uint8x8_t s = vld1_u8(&src[j]); + int32x4_t flt0_0 = vld1q_s32(&flt0[j]); + int32x4_t flt0_1 = vld1q_s32(&flt0[j + 4]); + int32x4_t flt1_0 = vld1q_s32(&flt1[j]); + int32x4_t flt1_1 = vld1q_s32(&flt1[j + 4]); + + int32x4_t offset = + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); + int32x4_t v0 = vmlaq_lane_s32(offset, flt0_0, xq_v, 0); + int32x4_t v1 = vmlaq_lane_s32(offset, flt0_1, xq_v, 0); + + v0 = vmlaq_lane_s32(v0, flt1_0, xq_v, 1); + v1 = vmlaq_lane_s32(v1, flt1_1, xq_v, 1); + + int16x8_t d_s16 = vreinterpretq_s16_u16(vmovl_u8(d)); + v0 = vmlsl_lane_s16(v0, vget_low_s16(d_s16), + vreinterpret_s16_s32(xq_sum_v), 0); + v1 = vmlsl_lane_s16(v1, vget_high_s16(d_s16), + vreinterpret_s16_s32(xq_sum_v), 0); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, 
e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t u = (dat[k] << SGRPROJ_RST_BITS); + int32_t v = (1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)) + + xq[0] * flt0[k] + xq[1] * flt1[k] - u * (xq[0] + xq[1]); + int32_t e = + (v >> (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS)) + dat[k] - src[k]; + sse += e * e; } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + dat += dat_stride; src += src_stride; flt0 += flt0_stride; flt1 += flt1_stride; - sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0)); - } - + } while (--height != 0); } else if (params->r[0] > 0 || params->r[1] > 0) { - const int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; - const int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; - const int flt_stride = (params->r[0] > 0) ? flt0_stride : flt1_stride; - for (i = 0; i < height; ++i) { - int32x4_t err0 = zero; - for (j = 0; j <= width - 8; j += 8) { - const uint8x8_t d0 = vld1_u8(&dat[j]); - const uint8x8_t s0 = vld1_u8(&src[j]); - const uint16x8_t d0s0 = vsubl_u8(d0, s0); - const uint16x8x2_t d0w = - vzipq_u16(vmovl_u8(d0), vreinterpretq_u16_s32(zero)); - - const int32x4_t flt_16b_lo = vld1q_s32(&flt[j]); - const int32x4_t flt_16b_hi = vld1q_s32(&flt[j + 4]); - - int32x4_t v0 = vmulq_n_s32(flt_16b_lo, xq_active); - v0 = vmlsq_n_s32(v0, vreinterpretq_s32_u16(d0w.val[0]), - xq_active * (1 << SGRPROJ_RST_BITS)); - int32x4_t v1 = vmulq_n_s32(flt_16b_hi, xq_active); - v1 = vmlsq_n_s32(v1, vreinterpretq_s32_u16(d0w.val[1]), - xq_active * (1 << SGRPROJ_RST_BITS)); - const int16x4_t vr0 = vqrshrn_n_s32(v0, 11); - const int16x4_t vr1 = vqrshrn_n_s32(v1, 11); - const int16x8_t e0 = - vaddq_s16(vcombine_s16(vr0, vr1), vreinterpretq_s16_u16(d0s0)); - const int16x4_t e0_lo = vget_low_s16(e0); - const int16x4_t e0_hi = vget_high_s16(e0); - err0 = vmlal_s16(err0, e0_lo, e0_lo); - err0 = vmlal_s16(err0, e0_hi, e0_hi); - } - for (k = j; k < width; ++k) { - const int32_t u = dat[k] << SGRPROJ_RST_BITS; + int xq_active = (params->r[0] > 0) ? xq[0] : xq[1]; + int32_t *flt = (params->r[0] > 0) ? flt0 : flt1; + int flt_stride = (params->r[0] > 0) ? 
flt0_stride : flt1_stride; + int32x2_t xq_v = vdup_n_s32(xq_active); + + do { + int32x4_t sse_s32 = vdupq_n_s32(0); + int j = 0; + + do { + const uint8x8_t d = vld1_u8(&dat[j]); + const uint8x8_t s = vld1_u8(&src[j]); + int32x4_t flt_0 = vld1q_s32(&flt[j]); + int32x4_t flt_1 = vld1q_s32(&flt[j + 4]); + int16x8_t d_s16 = + vreinterpretq_s16_u16(vshll_n_u8(d, SGRPROJ_RST_BITS)); + + int32x4_t sub_0 = vsubw_s16(flt_0, vget_low_s16(d_s16)); + int32x4_t sub_1 = vsubw_s16(flt_1, vget_high_s16(d_s16)); + + int32x4_t offset = + vdupq_n_s32(1 << (SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS - 1)); + int32x4_t v0 = vmlaq_lane_s32(offset, sub_0, xq_v, 0); + int32x4_t v1 = vmlaq_lane_s32(offset, sub_1, xq_v, 0); + + int16x4_t vr0 = vshrn_n_s32(v0, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + int16x4_t vr1 = vshrn_n_s32(v1, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS); + + int16x8_t diff = vreinterpretq_s16_u16(vsubl_u8(d, s)); + int16x8_t e = vaddq_s16(vcombine_s16(vr0, vr1), diff); + int16x4_t e_lo = vget_low_s16(e); + int16x4_t e_hi = vget_high_s16(e); + + sse_s32 = vmlal_s16(sse_s32, e_lo, e_lo); + sse_s32 = vmlal_s16(sse_s32, e_hi, e_hi); + + j += 8; + } while (j <= width - 8); + + for (int k = j; k < width; ++k) { + int32_t u = dat[k] << SGRPROJ_RST_BITS; int32_t v = xq_active * (flt[k] - u); - const int32_t e = ROUND_POWER_OF_TWO(v, shift) + dat[k] - src[k]; - err += e * e; + int32_t e = ROUND_POWER_OF_TWO(v, SGRPROJ_RST_BITS + SGRPROJ_PRJ_BITS) + + dat[k] - src[k]; + sse += e * e; } + + sse_s64 = vpadalq_s32(sse_s64, sse_s32); + dat += dat_stride; src += src_stride; flt += flt_stride; - sum64 = vpadalq_u32(sum64, vreinterpretq_u32_s32(err0)); - } + } while (--height != 0); } else { - uint32x4_t err0 = vreinterpretq_u32_s32(zero); - for (i = 0; i < height; ++i) { - for (j = 0; j <= width - 16; j += 16) { + uint32x4_t sse_s32 = vdupq_n_u32(0); + + do { + int j = 0; + + do { const uint8x16_t d = vld1q_u8(&dat[j]); const uint8x16_t s = vld1q_u8(&src[j]); - const uint8x16_t diff = vabdq_u8(d, s); - const uint8x8_t diff0 = vget_low_u8(diff); - const uint8x8_t diff1 = vget_high_u8(diff); - err0 = vpadalq_u16(err0, vmull_u8(diff0, diff0)); - err0 = vpadalq_u16(err0, vmull_u8(diff1, diff1)); - } - for (k = j; k < width; ++k) { - const int32_t e = dat[k] - src[k]; - err += e * e; + + uint8x16_t diff = vabdq_u8(d, s); + uint8x8_t diff_lo = vget_low_u8(diff); + uint8x8_t diff_hi = vget_high_u8(diff); + + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_lo, diff_lo)); + sse_s32 = vpadalq_u16(sse_s32, vmull_u8(diff_hi, diff_hi)); + + j += 16; + } while (j <= width - 16); + + for (int k = j; k < width; ++k) { + int32_t e = dat[k] - src[k]; + sse += e * e; } + dat += dat_stride; src += src_stride; - } - sum64 = vpaddlq_u32(err0); + } while (--height != 0); + + sse_s64 = vreinterpretq_s64_u64(vpaddlq_u32(sse_s32)); } + + sse += horizontal_add_s64x2(sse_s64); + return sse; +} + +// We can accumulate up to 65536 8-bit multiplication results in 32-bit. We are +// processing 2 pixels at a time, so the accumulator max can be as high as 32768 +// for the compute stats. 
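
The constant defined next relies on a common overflow-avoidance pattern: keep a cheap 32-bit running sum of small products and fold it into a 64-bit total before the 32-bit sum can wrap. The plain-C sketch below is illustrative only (it is not part of the patch; the function name is hypothetical and it assumes unsigned 8-bit inputs for simplicity). With 8-bit inputs each product is at most 255 * 255 = 65025, so up to 65536 products fit in a 32-bit accumulator, matching the reasoning in the comment above; the kernels here fold every 32768 iterations because they accumulate two products per iteration.

  #include <stdint.h>
  #include <stddef.h>

  /* Illustrative sketch, not part of the patch: sum a[i] * b[i] for 8-bit
   * inputs, folding the narrow partial sum into a wide total before it can
   * overflow (65536 * 65025 < 2^32). */
  int64_t sum_of_products_u8(const uint8_t *a, const uint8_t *b, size_t n) {
    int64_t total = 0;    /* wide accumulator, touched only occasionally */
    uint32_t partial = 0; /* narrow accumulator, updated every element */
    uint32_t count = 0;
    for (size_t i = 0; i < n; ++i) {
      partial += (uint32_t)a[i] * b[i];
      if (++count == 65536) { /* fold before the 32-bit sum can wrap */
        total += partial;
        partial = 0;
        count = 0;
      }
    }
    return total + partial; /* fold in the remainder */
  }
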
+#define STAT_ACCUMULATOR_MAX 32768 + +static INLINE uint8x8_t tbl2(uint8x16_t a, uint8x16_t b, uint8x8_t idx) { +#if AOM_ARCH_AARCH64 + uint8x16x2_t table = { { a, b } }; + return vqtbl2_u8(table, idx); +#else + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vtbl4_u8(table, idx); +#endif +} + +static INLINE uint8x16_t tbl2q(uint8x16_t a, uint8x16_t b, uint8x16_t idx) { #if AOM_ARCH_AARCH64 - err += vaddvq_u64(sum64); + uint8x16x2_t table = { { a, b } }; + return vqtbl2q_u8(table, idx); #else - err += vget_lane_u64(vadd_u64(vget_low_u64(sum64), vget_high_u64(sum64)), 0); -#endif // AOM_ARCH_AARCH64 - return err; + uint8x8x4_t table = { { vget_low_u8(a), vget_high_u8(a), vget_low_u8(b), + vget_high_u8(b) } }; + return vcombine_u8(vtbl4_u8(table, vget_low_u8(idx)), + vtbl4_u8(table, vget_high_u8(idx))); +#endif +} + +// The M matrix is accumulated in STAT_ACCUMULATOR_MAX steps to speed-up the +// computation. This function computes the final M from the accumulated +// (src_s64) and the residual parts (src_s32). It also transposes the result as +// the output needs to be column-major. +static INLINE void acc_transpose_M(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, + int scale) { + for (int i = 0; i < wiener_win; ++i) { + for (int j = 0; j < wiener_win; ++j) { + int tr_idx = j * wiener_win + i; + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } +} + +// The resulting H is a column-major matrix accumulated from the transposed +// (column-major) samples of the filter kernel (5x5 or 7x7) viewed as a single +// vector. For the 7x7 filter case: H(49x49) = [49 x 1] x [1 x 49]. This +// function transforms back to the originally expected format (double +// transpose). The H matrix is accumulated in STAT_ACCUMULATOR_MAX steps to +// speed-up the computation. This function computes the final H from the +// accumulated (src_s64) and the residual parts (src_s32). The computed H is +// only an upper triangle matrix, this function also fills the lower triangle of +// the resulting matrix. +static void update_H(int64_t *dst, const int64_t *src_s64, + const int32_t *src_s32, const int wiener_win, int stride, + int scale) { + // For a simplified theoretical 3x3 case where `wiener_win` is 3 and + // `wiener_win2` is 9, the M matrix is 3x3: + // 0, 3, 6 + // 1, 4, 7 + // 2, 5, 8 + // + // This is viewed as a vector to compute H (9x9) by vector outer product: + // 0, 3, 6, 1, 4, 7, 2, 5, 8 + // + // Double transpose and upper triangle remapping for 3x3 -> 9x9 case: + // 0, 3, 6, 1, 4, 7, 2, 5, 8, + // 3, 30, 33, 12, 31, 34, 21, 32, 35, + // 6, 33, 60, 15, 42, 61, 24, 51, 62, + // 1, 12, 15, 10, 13, 16, 11, 14, 17, + // 4, 31, 42, 13, 40, 43, 22, 41, 44, + // 7, 34, 61, 16, 43, 70, 25, 52, 71, + // 2, 21, 24, 11, 22, 25, 20, 23, 26, + // 5, 32, 51, 14, 41, 52, 23, 50, 53, + // 8, 35, 62, 17, 44, 71, 26, 53, 80, + const int wiener_win2 = wiener_win * wiener_win; + + // Loop through the indices according to the remapping above, along the + // columns: + // 0, wiener_win, 2 * wiener_win, ..., 1, 1 + 2 * wiener_win, ..., + // wiener_win - 1, wiener_win - 1 + wiener_win, ... + // For the 3x3 case `j` will be: 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int i = 0; i < wiener_win; ++i) { + for (int j = i; j < wiener_win2; j += wiener_win) { + // These two inner loops are the same as the two outer loops, but running + // along rows instead of columns. 
For the 3x3 case `l` will be: + // 0, 3, 6, 1, 4, 7, 2, 5, 8. + for (int k = 0; k < wiener_win; ++k) { + for (int l = k; l < wiener_win2; l += wiener_win) { + // The nominal double transpose indexing would be: + // int idx = stride * j + l; + // However we need the upper-triangle indices, it is easy with some + // min/max operations. + int tr_idx = stride * AOMMIN(j, l) + AOMMAX(j, l); + + // Resulting matrix is filled by combining the 64-bit and the residual + // 32-bit matrices together with scaling. + *dst++ += (int64_t)(src_s64[tr_idx] + src_s32[tr_idx]) * scale; + } + } + } + } +} + +// Load 7x7 matrix into 3 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_u8_8x7(uint8x16_t dst[4], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[3] = vcombine_u8(vld1_u8(src - 1), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win7_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, H_s32[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, H_s64[WIENER_WIN2 * WIENER_WIN2_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x6 matrix with consecutive elements from two 7x7 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats7[96]) = { + 0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, + 2, 3, 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 16, 17, 18, 19, + 4, 5, 6, 8, 9, 10, 11, 12, 13, 14, 17, 18, 19, 20, 21, 22, + 1, 2, 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, + 3, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 17, 18, 19, 20, + 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 18, 19, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats7 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats7 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats7 + 32); + const uint8x16_t lut3 = vld1q_u8(shuffle_stats7 + 48); + const uint8x16_t lut4 = vld1q_u8(shuffle_stats7 + 64); + const uint8x16_t lut5 = vld1q_u8(shuffle_stats7 + 80); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + dgd += 2; + + // Re-arrange (and widen) the combined 8x7 matrix to have the 2 whole 7x7 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[6] arrays. 
These arrays contain 48 elements of the 49 (7x7). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 49 + // consecutive elements. + int16x8_t dgd_avg0[6]; + int16x8_t dgd_avg1[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf3 = tbl2q(dgd_rows[0], dgd_rows[1], lut3); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf3), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf3), avg_u8)); + + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG1, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + uint8x16_t dgd_shuf4 = tbl2q(dgd_rows[1], dgd_rows[2], lut4); + + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf4), avg_u8)); + dgd_avg1[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf4), avg_u8)); + + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + vst1q_s16(DGD_AVG1 + 24, dgd_avg1[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + uint8x16_t dgd_shuf5 = tbl2q(dgd_rows[2], dgd_rows[3], lut5); + + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + dgd_avg1[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf5), avg_u8)); + dgd_avg1[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf5), avg_u8)); + + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + vst1q_s16(DGD_AVG1 + 32, dgd_avg1[4]); + vst1q_s16(DGD_AVG1 + 40, dgd_avg1[5]); + + // The remaining last (49th) elements of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + DGD_AVG1[48] = dgd_ptr[7] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 7 * 7. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. 
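
As a plain-C reference for the cross-correlation step that follows: for two output pixels, each tap of the row-major M accumulator receives src0 * dgd0[i] + src1 * dgd1[i], exactly as the scalar handling of the 49th element below does. This is an illustrative sketch inferred from the surrounding code and comments, not the actual update_M_2pixels() Neon helper (which processes the taps 8 at a time); the name is hypothetical.

  #include <stdint.h>

  /* Illustrative sketch, not part of the patch: accumulate one row-major
   * slice of M for two output pixels at once. */
  void update_M_2pixels_scalar(int32_t *m_row, int src_avg0, int src_avg1,
                               const int16_t *dgd_avg0, const int16_t *dgd_avg1,
                               int num_taps) {
    for (int i = 0; i < num_taps; ++i) {
      m_row[i] += src_avg0 * dgd_avg0[i] + src_avg1 * dgd_avg1[i];
    }
  }
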
+ int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + update_M_2pixels(M_s32 + 24, src_avg0_s16, src_avg1_s16, dgd_avg0[3], + dgd_avg1[3]); + update_M_2pixels(M_s32 + 32, src_avg0_s16, src_avg1_s16, dgd_avg0[4], + dgd_avg1[4]); + update_M_2pixels(M_s32 + 40, src_avg0_s16, src_avg1_s16, dgd_avg0[5], + dgd_avg1[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[48] += DGD_AVG0[48] * src_avg0 + DGD_AVG1[48] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 49 * 49. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_7x7_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += + DGD_AVG0[48] * DGD_AVG0[48] + DGD_AVG1[48] * DGD_AVG1[48]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. + if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 48 - k4); + + // Last element of the row is computed separately. + lh[48] += lh32[48]; + lh32[48] = 0; + + lh += WIENER_WIN2_ALIGN2; + lh32 += WIENER_WIN2_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 7x7 matrices: a 8x7 matrix with the + // middle 6x7 elements being shared. + uint8x16_t dgd_rows[4]; + load_and_pack_u8_8x7(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 6; + ++dgd; + + // Re-arrange (and widen) the combined 8x7 matrix to have a whole 7x7 + // matrix tightly packed into a int16x8_t[6] array. This array contains + // 48 elements of the 49 (7x7). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 49 consecutive elements. 
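
The auto-covariance update described above can likewise be written as a scalar reference: an outer product of the flattened (dgd - avg) window with itself, accumulated into the upper triangle only, since H is symmetric and the lower triangle is filled in afterwards. This sketch is illustrative only and is not the real update_H_1pixel()/update_H_7x7_2pixels() helpers; the name and signature are assumptions.

  #include <stdint.h>

  /* Illustrative sketch, not part of the patch: accumulate the upper triangle
   * of the row-major H for one pixel's flattened window of num_taps values;
   * stride is the padded row stride of the H accumulator. */
  void update_H_1pixel_scalar(int32_t *h, const int16_t *dgd_avg, int stride,
                              int num_taps) {
    for (int i = 0; i < num_taps; ++i) {
      for (int j = i; j < num_taps; ++j) {
        h[i * stride + j] += dgd_avg[i] * dgd_avg[j];
      }
    }
  }
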
+ int16x8_t dgd_avg0[6]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + vst1q_s16(DGD_AVG0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[1], dgd_rows[2], lut1); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg0[3] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG0 + 24, dgd_avg0[3]); + + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[2], dgd_rows[3], lut2); + dgd_avg0[4] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg0[5] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + vst1q_s16(DGD_AVG0 + 32, dgd_avg0[4]); + vst1q_s16(DGD_AVG0 + 40, dgd_avg0[5]); + + // The remaining last (49th) element of `dgd - avg`. + DGD_AVG0[48] = dgd_ptr[6] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 7 * 7. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + update_M_1pixel(M_s32 + 24, src_avg0_s16, dgd_avg0[3]); + update_M_1pixel(M_s32 + 32, src_avg0_s16, dgd_avg0[4]); + update_M_1pixel(M_s32 + 40, src_avg0_s16, dgd_avg0[5]); + + // Last (49th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[48] += DGD_AVG0[48] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 49 * 49. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_ALIGN2, 48); + + // The last element of the triangle of H_s32 matrix can be computed as + // scalar more efficiently. 
+ H_s32[48 * WIENER_WIN2_ALIGN2 + 48] += DGD_AVG0[48] * DGD_AVG0[48]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN, downsample_factor); + + update_H(H, H_s64, H_s32, WIENER_WIN, WIENER_WIN2_ALIGN2, downsample_factor); +} + +// Load 5x5 matrix into 2 and a half 128-bit vectors from consecutive rows, the +// last load address is offset to prevent out-of-bounds access. +static INLINE void load_and_pack_u8_6x5(uint8x16_t dst[3], const uint8_t *src, + ptrdiff_t stride) { + dst[0] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[1] = vcombine_u8(vld1_u8(src), vld1_u8(src + stride)); + src += 2 * stride; + dst[2] = vcombine_u8(vld1_u8(src - 3), vdup_n_u8(0)); +} + +static INLINE void compute_stats_win5_neon(const uint8_t *dgd, + const uint8_t *src, int width, + int height, int dgd_stride, + int src_stride, int avg, int64_t *M, + int64_t *H, int downsample_factor) { + // Matrix names are capitalized to help readability. + DECLARE_ALIGNED(64, int16_t, DGD_AVG0[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int16_t, DGD_AVG1[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, M_s32[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int64_t, M_s64[WIENER_WIN2_REDUCED_ALIGN3]); + DECLARE_ALIGNED(64, int32_t, + H_s32[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + DECLARE_ALIGNED(64, int64_t, + H_s64[WIENER_WIN2_REDUCED * WIENER_WIN2_REDUCED_ALIGN2]); + + memset(M_s32, 0, sizeof(M_s32)); + memset(M_s64, 0, sizeof(M_s64)); + memset(H_s32, 0, sizeof(H_s32)); + memset(H_s64, 0, sizeof(H_s64)); + + // Look-up tables to create 8x3 matrix with consecutive elements from two 5x5 + // matrices. + // clang-format off + DECLARE_ALIGNED(16, static const uint8_t, shuffle_stats5[48]) = { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, + 1, 2, 3, 4, 5, 9, 10, 11, 12, 13, 17, 18, 19, 20, 21, 25, + 9, 10, 11, 12, 19, 20, 21, 22, 10, 11, 12, 13, 20, 21, 22, 23, + }; + // clang-format on + + const uint8x16_t lut0 = vld1q_u8(shuffle_stats5 + 0); + const uint8x16_t lut1 = vld1q_u8(shuffle_stats5 + 16); + const uint8x16_t lut2 = vld1q_u8(shuffle_stats5 + 32); + + int acc_cnt = STAT_ACCUMULATOR_MAX; + const int src_next = downsample_factor * src_stride - width; + const int dgd_next = downsample_factor * dgd_stride - width; + const uint8x8_t avg_u8 = vdup_n_u8(avg); + + do { + int j = width; + while (j >= 2) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + dgd += 2; + + // Re-arrange (and widen) the combined 6x5 matrix to have the 2 whole 5x5 + // matrices (1 for each of the 2 pixels) separated into distinct + // int16x8_t[3] arrays. These arrays contain 24 elements of the 25 (5x5). + // Compute `dgd - avg` for both buffers. Each DGD_AVG buffer contains 25 + // consecutive elements. 
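
The gather performed by the table lookups in this 5x5 path has a simple scalar equivalent, shown below as an illustrative sketch (not part of the change; the name is hypothetical): flatten the window around one pixel into consecutive (dgd - avg) values, which is what the DGD_AVG buffers hold (25 values for the 5x5 case, 49 for the 7x7 case).

  #include <stdint.h>

  /* Illustrative sketch, not part of the patch: flatten a win x win window of
   * dgd around one pixel into win * win consecutive (dgd - avg) values. */
  void flatten_window_scalar(int16_t *out, const uint8_t *dgd, int dgd_stride,
                             int avg, int win) {
    for (int r = 0; r < win; ++r) {
      for (int c = 0; c < win; ++c) {
        out[r * win + c] = (int16_t)(dgd[r * dgd_stride + c] - avg);
      }
    }
  }
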
+ int16x8_t dgd_avg0[3]; + int16x8_t dgd_avg1[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x16_t dgd_shuf1 = tbl2q(dgd_rows[0], dgd_rows[1], lut1); + uint8x16_t dgd_shuf2 = tbl2q(dgd_rows[1], dgd_rows[2], lut2); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf2), avg_u8)); + dgd_avg1[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf1), avg_u8)); + dgd_avg1[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf1), avg_u8)); + dgd_avg1[2] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf2), avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + vst1q_s16(DGD_AVG1 + 0, dgd_avg1[0]); + vst1q_s16(DGD_AVG1 + 8, dgd_avg1[1]); + vst1q_s16(DGD_AVG1 + 16, dgd_avg1[2]); + + // The remaining last (25th) elements of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + DGD_AVG1[24] = dgd_ptr[5] - avg; + + // Accumulate into row-major variant of matrix M (cross-correlation) for 2 + // output pixels at a time. M is of size 5 * 5. It needs to be filled such + // that multiplying one element from src with each element of a row of the + // wiener window will fill one column of M. However this is not very + // convenient in terms of memory access, as it means we do contiguous + // loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int src_avg1 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + int16x4_t src_avg1_s16 = vdup_n_s16(src_avg1); + update_M_2pixels(M_s32 + 0, src_avg0_s16, src_avg1_s16, dgd_avg0[0], + dgd_avg1[0]); + update_M_2pixels(M_s32 + 8, src_avg0_s16, src_avg1_s16, dgd_avg0[1], + dgd_avg1[1]); + update_M_2pixels(M_s32 + 16, src_avg0_s16, src_avg1_s16, dgd_avg0[2], + dgd_avg1[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 2 output pixels. + M_s32[24] += DGD_AVG0[24] * src_avg0 + DGD_AVG1[24] * src_avg1; + + // Start accumulating into row-major version of matrix H + // (auto-covariance), it expects the DGD_AVG[01] matrices to also be + // row-major. H is of size 25 * 25. It is filled by multiplying every pair + // of elements of the wiener window together (vector outer product). Since + // it is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. It is not efficient to work with column-major matrices, + // so we accumulate into a row-major matrix H_s32. At the end of the + // algorithm a double transpose transformation will convert H_s32 back to + // the expected output layout. + update_H_5x5_2pixels(H_s32, DGD_AVG0, DGD_AVG1); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24] + DGD_AVG1[24] * DGD_AVG1[24]; + + // Accumulate into 64-bit after STAT_ACCUMULATOR_MAX iterations to prevent + // overflow. 
+ if (--acc_cnt == 0) { + acc_cnt = STAT_ACCUMULATOR_MAX; + + accumulate_and_clear(M_s64, M_s32, WIENER_WIN2_REDUCED_ALIGN2); + + // The widening accumulation is only needed for the upper triangle part + // of the matrix. + int64_t *lh = H_s64; + int32_t *lh32 = H_s32; + for (int k = 0; k < WIENER_WIN2_REDUCED; ++k) { + // The widening accumulation is only run for the relevant parts + // (upper-right triangle) in a row 4-element aligned. + int k4 = k / 4 * 4; + accumulate_and_clear(lh + k4, lh32 + k4, 24 - k4); + + // Last element of the row is computed separately. + lh[24] += lh32[24]; + lh32[24] = 0; + + lh += WIENER_WIN2_REDUCED_ALIGN2; + lh32 += WIENER_WIN2_REDUCED_ALIGN2; + } + } + + j -= 2; + } + + // Computations for odd pixel in the row. + if (width & 1) { + // Load two adjacent, overlapping 5x5 matrices: a 6x5 matrix with the + // middle 4x5 elements being shared. + uint8x16_t dgd_rows[3]; + load_and_pack_u8_6x5(dgd_rows, dgd, dgd_stride); + + const uint8_t *dgd_ptr = dgd + dgd_stride * 4; + ++dgd; + + // Re-arrange (and widen) the combined 6x5 matrix to have a whole 5x5 + // matrix tightly packed into a int16x8_t[3] array. This array contains + // 24 elements of the 25 (5x5). Compute `dgd - avg` for the whole buffer. + // The DGD_AVG buffer contains 25 consecutive elements. + int16x8_t dgd_avg0[3]; + uint8x16_t dgd_shuf0 = tbl2q(dgd_rows[0], dgd_rows[1], lut0); + uint8x8_t dgd_shuf1 = tbl2(dgd_rows[1], dgd_rows[2], vget_low_u8(lut2)); + + dgd_avg0[0] = + vreinterpretq_s16_u16(vsubl_u8(vget_low_u8(dgd_shuf0), avg_u8)); + dgd_avg0[1] = + vreinterpretq_s16_u16(vsubl_u8(vget_high_u8(dgd_shuf0), avg_u8)); + dgd_avg0[2] = vreinterpretq_s16_u16(vsubl_u8(dgd_shuf1, avg_u8)); + + vst1q_s16(DGD_AVG0 + 0, dgd_avg0[0]); + vst1q_s16(DGD_AVG0 + 8, dgd_avg0[1]); + vst1q_s16(DGD_AVG0 + 16, dgd_avg0[2]); + + // The remaining last (25th) element of `dgd - avg`. + DGD_AVG0[24] = dgd_ptr[4] - avg; + + // Accumulate into row-major order variant of matrix M (cross-correlation) + // for 1 output pixel at a time. M is of size 5 * 5. It needs to be filled + // such that multiplying one element from src with each element of a row + // of the wiener window will fill one column of M. However this is not + // very convenient in terms of memory access, as it means we do + // contiguous loads of dgd but strided stores to M. As a result, we use an + // intermediate matrix M_s32 which is instead filled such that one row of + // the wiener window gives one row of M_s32. Once fully computed, M_s32 is + // then transposed to return M. + int src_avg0 = *src++ - avg; + int16x4_t src_avg0_s16 = vdup_n_s16(src_avg0); + update_M_1pixel(M_s32 + 0, src_avg0_s16, dgd_avg0[0]); + update_M_1pixel(M_s32 + 8, src_avg0_s16, dgd_avg0[1]); + update_M_1pixel(M_s32 + 16, src_avg0_s16, dgd_avg0[2]); + + // Last (25th) element of M_s32 can be computed as scalar more efficiently + // for 1 output pixel. + M_s32[24] += DGD_AVG0[24] * src_avg0; + + // Start accumulating into row-major order version of matrix H + // (auto-covariance), it expects the DGD_AVG0 matrix to also be row-major. + // H is of size 25 * 25. It is filled by multiplying every pair of + // elements of the wiener window together (vector outer product). Since it + // is a symmetric matrix, we only compute the upper-right triangle, and + // then copy it down to the lower-left later. The upper triangle is + // covered by 4x4 tiles. The original algorithm assumes the M matrix is + // column-major and the resulting H matrix is also expected to be + // column-major. 
It is not efficient to work column-major matrices, so we + // accumulate into a row-major matrix H_s32. At the end of the algorithm a + // double transpose transformation will convert H_s32 back to the expected + // output layout. + update_H_1pixel(H_s32, DGD_AVG0, WIENER_WIN2_REDUCED_ALIGN2, 24); + + // The last element of the triangle of H_s32 matrix can be computed as a + // scalar more efficiently. + H_s32[24 * WIENER_WIN2_REDUCED_ALIGN2 + 24] += + DGD_AVG0[24] * DGD_AVG0[24]; + } + + src += src_next; + dgd += dgd_next; + } while (--height != 0); + + acc_transpose_M(M, M_s64, M_s32, WIENER_WIN_REDUCED, downsample_factor); + + update_H(H, H_s64, H_s32, WIENER_WIN_REDUCED, WIENER_WIN2_REDUCED_ALIGN2, + downsample_factor); } static INLINE uint8_t find_average_neon(const uint8_t *src, int src_stride, @@ -239,741 +915,6 @@ return (uint8_t)(sum / (width * height)); } -static INLINE void compute_sub_avg(const uint8_t *buf, int buf_stride, int avg, - int16_t *buf_avg, int buf_avg_stride, - int width, int height, - int downsample_factor) { - uint8x8_t avg_u8 = vdup_n_u8(avg); - - if (width > 8) { - int i = 0; - do { - int j = width; - const uint8_t *buf_ptr = buf; - int16_t *buf_avg_ptr = buf_avg; - do { - uint8x8_t d = vld1_u8(buf_ptr); - vst1q_s16(buf_avg_ptr, vreinterpretq_s16_u16(vsubl_u8(d, avg_u8))); - - j -= 8; - buf_ptr += 8; - buf_avg_ptr += 8; - } while (j >= 8); - while (j > 0) { - *buf_avg_ptr = (int16_t)buf[width - j] - (int16_t)avg; - buf_avg_ptr++; - j--; - } - buf += buf_stride; - buf_avg += buf_avg_stride; - i += downsample_factor; - } while (i < height); - } else { - // For width < 8, don't use Neon. - for (int i = 0; i < height; i = i + downsample_factor) { - for (int j = 0; j < width; j++) { - buf_avg[j] = (int16_t)buf[j] - (int16_t)avg; - } - buf += buf_stride; - buf_avg += buf_avg_stride; - } - } -} - -static INLINE void compute_H_one_col(int16x8_t *dgd, int col, int64_t *H, - const int wiener_win, - const int wiener_win2, int32x4_t df_s32) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]), - vget_high_s16(dgd[row1])); - auto_cov = vshlq_s32(auto_cov, df_s32); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } -} - -static INLINE void compute_H_one_col_last_row(int16x8_t *dgd, int col, - int64_t *H, const int wiener_win, - const int wiener_win2, - int last_row_df) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = row0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col * wiener_win + row0) * wiener_win2 + (col * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd[row0]), vget_low_s16(dgd[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd[row0]), - vget_high_s16(dgd[row1])); - auto_cov = vmulq_n_s32(auto_cov, last_row_df); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); - } - } -} - -// When we load 8 values of int16_t type and need less than 8 values for -// processing, the below mask is used to make the extra values zero. -const int16_t av1_neon_mask_16bit[16] = { - -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, -}; - -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). 
-// -// M is of size 7 * 7. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 49 * 49. It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. -static INLINE void compute_stats_win7_neon(int16_t *dgd_avg, int dgd_avg_stride, - int16_t *src_avg, int src_avg_stride, - int width, int v_start, int v_end, - int64_t *M, int64_t *H, - int downsample_factor, - int last_row_downsample_factor) { - const int wiener_win = 7; - const int wiener_win2 = wiener_win * wiener_win; - // The downsample factor can be either 1 or 4, so instead of multiplying the - // values by 1 or 4, we can left shift by 0 or 2 respectively, which is - // faster. (This doesn't apply to the last row where we can scale the values - // by 1, 2 or 3, so we keep the multiplication). - const int downsample_shift = downsample_factor >> 1; - const int16x8_t df_s16 = vdupq_n_s16(downsample_shift); - const int32x4_t df_s32 = vdupq_n_s32(downsample_shift); - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[49]; - memset(M_trn, 0, sizeof(M_trn)); - - int h = v_start; - do { - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src and scale based on downsampling factor. - int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16); - - // Compute all the elements of one row of M. - compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src_avg[j] * downsample_factor; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j]; - int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j]; - - M_trn[row * wiener_win + 0] += d0 * s; - M_trn[row * wiener_win + 1] += d1 * s; - M_trn[row * wiener_win + 2] += d2 * s; - M_trn[row * wiener_win + 3] += d3 * s; - M_trn[row * wiener_win + 4] += d4 * s; - M_trn[row * wiener_win + 5] += d5 * s; - M_trn[row * wiener_win + 6] += d6 * s; - - j++; - } - } - - // Auto-covariance (H). 
- int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vshlq_s16(dgd1[5], df_s16); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vshlq_s16(dgd1[6], df_s16); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - if (j < width) { - // Process remaining columns using a mask to discard excess elements. - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[5] = vandq_s16(dgd0[5], mask); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - dgd0[6] = vandq_s16(dgd0[6], mask); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vshlq_s16(dgd1[5], df_s16); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vshlq_s16(dgd1[6], df_s16); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd_avg += downsample_factor * dgd_avg_stride; - src_avg += src_avg_stride; - h += downsample_factor; - } while (h <= v_end - downsample_factor); - - if (h < v_end) { - // The last row is scaled by a different downsample factor, so process - // separately. - - // Cross-correlation (M). - for (int row = 0; row < 7; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src vector and scale based on downsampling factor. - int16x8_t s = - vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor); - - // Compute all the elements of one row of M. - compute_M_one_row_win7(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src_avg[j]; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - int16_t d5 = dgd_avg[row * dgd_avg_stride + 5 + j]; - int16_t d6 = dgd_avg[row * dgd_avg_stride + 6 + j]; - - M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 5] += d5 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 6] += d6 * s * last_row_downsample_factor; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - int col0 = 0; - do { - // Load first column. - int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (28 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. 
For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } while (++col0 < wiener_win); - j += 8; - } - - // Process remaining columns using a mask to discard excess elements. - if (j < width) { - int col0 = 0; - do { - // Load first column. - int16x8_t dgd0[7]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - dgd0[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col0); - dgd0[5] = vandq_s16(dgd0[5], mask); - dgd0[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col0); - dgd0[6] = vandq_s16(dgd0[6], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 7x7 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 7x7 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
- int16x8_t dgd1[7]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - dgd1[5] = vld1q_s16(dgd_avg + 5 * dgd_avg_stride + j + col1); - dgd1[5] = vmulq_n_s16(dgd1[5], last_row_downsample_factor); - dgd1[6] = vld1q_s16(dgd_avg + 6 * dgd_avg_stride + j + col1); - dgd1[6] = vmulq_n_s16(dgd1[6], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (49 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } while (++col0 < wiener_win); - } - } - - // Transpose M_trn. - transpose_M_win7(M, M_trn, 7); - - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); -} - -// This function computes two matrices: the cross-correlation between the src -// buffer and dgd buffer (M), and the auto-covariance of the dgd buffer (H). -// -// M is of size 5 * 5. It needs to be filled such that multiplying one element -// from src with each element of a row of the wiener window will fill one -// column of M. However this is not very convenient in terms of memory -// accesses, as it means we do contiguous loads of dgd but strided stores to M. -// As a result, we use an intermediate matrix M_trn which is instead filled -// such that one row of the wiener window gives one row of M_trn. Once fully -// computed, M_trn is then transposed to return M. -// -// H is of size 25 * 25. It is filled by multiplying every pair of elements of -// the wiener window together. Since it is a symmetric matrix, we only compute -// the upper triangle, and then copy it down to the lower one. Here we fill it -// by taking each different pair of columns, and multiplying all the elements of -// the first one with all the elements of the second one, with a special case -// when multiplying a column by itself. -static INLINE void compute_stats_win5_neon(int16_t *dgd_avg, int dgd_avg_stride, - int16_t *src_avg, int src_avg_stride, - int width, int v_start, int v_end, - int64_t *M, int64_t *H, - int downsample_factor, - int last_row_downsample_factor) { - const int wiener_win = 5; - const int wiener_win2 = wiener_win * wiener_win; - // The downsample factor can be either 1 or 4, so instead of multiplying the - // values by 1 or 4, we can left shift by 0 or 2 respectively, which is - // faster. (This doesn't apply to the last row where we can scale the values - // by 1, 2 or 3, so we keep the multiplication). - const int downsample_shift = downsample_factor >> 1; - const int16x8_t df_s16 = vdupq_n_s16(downsample_shift); - const int32x4_t df_s32 = vdupq_n_s32(downsample_shift); - const int16x8_t mask = vld1q_s16(&av1_neon_mask_16bit[8] - (width % 8)); - - // We use an intermediate matrix that will be transposed to get M. - int64_t M_trn[25]; - memset(M_trn, 0, sizeof(M_trn)); - - int h = v_start; - do { - // Cross-correlation (M). 
- for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src vector and scale based on downsampling factor. - int16x8_t s = vshlq_s16(vld1q_s16(src_avg + j), df_s16); - - // Compute all the elements of one row of M. - compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src_avg[j]; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - - M_trn[row * wiener_win + 0] += d0 * s * downsample_factor; - M_trn[row * wiener_win + 1] += d1 * s * downsample_factor; - M_trn[row * wiener_win + 2] += d2 * s * downsample_factor; - M_trn[row * wiener_win + 3] += d3 * s * downsample_factor; - M_trn[row * wiener_win + 4] += d4 * s * downsample_factor; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - // Process remaining columns using a mask to discard excess elements. - if (j < width) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. 
- int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col(dgd0, col0, H, wiener_win, wiener_win2, df_s32); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vshlq_s16(dgd1[0], df_s16); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vshlq_s16(dgd1[1], df_s16); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vshlq_s16(dgd1[2], df_s16); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vshlq_s16(dgd1[3], df_s16); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vshlq_s16(dgd1[4], df_s16); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - dgd_avg += downsample_factor * dgd_avg_stride; - src_avg += src_avg_stride; - h += downsample_factor; - } while (h <= v_end - downsample_factor); - - if (h < v_end) { - // The last row is scaled by a different downsample factor, so process - // separately. - - // Cross-correlation (M). - for (int row = 0; row < wiener_win; row++) { - int16x8_t dgd0 = vld1q_s16(dgd_avg + row * dgd_avg_stride); - int j = 0; - while (j <= width - 8) { - int16x8_t dgd1 = vld1q_s16(dgd_avg + row * dgd_avg_stride + j + 8); - // Load src vector and scale based on downsampling factor. - int16x8_t s = - vmulq_n_s16(vld1q_s16(src_avg + j), last_row_downsample_factor); - - // Compute all the elements of one row of M. - compute_M_one_row_win5(s, dgd0, dgd1, M_trn, wiener_win, row); - - dgd0 = dgd1; - j += 8; - } - - // Process remaining elements without Neon. - while (j < width) { - int16_t s = src_avg[j]; - int16_t d0 = dgd_avg[row * dgd_avg_stride + 0 + j]; - int16_t d1 = dgd_avg[row * dgd_avg_stride + 1 + j]; - int16_t d2 = dgd_avg[row * dgd_avg_stride + 2 + j]; - int16_t d3 = dgd_avg[row * dgd_avg_stride + 3 + j]; - int16_t d4 = dgd_avg[row * dgd_avg_stride + 4 + j]; - - M_trn[row * wiener_win + 0] += d0 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 1] += d1 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 2] += d2 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 3] += d3 * s * last_row_downsample_factor; - M_trn[row * wiener_win + 4] += d4 * s * last_row_downsample_factor; - - j++; - } - } - - // Auto-covariance (H). - int j = 0; - while (j <= width - 8) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. 
- int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. - int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - j += 8; - } - - // Process remaining columns using a mask to discard excess elements. - if (j < width) { - for (int col0 = 0; col0 < wiener_win; col0++) { - // Load first column. - int16x8_t dgd0[5]; - dgd0[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col0); - dgd0[0] = vandq_s16(dgd0[0], mask); - dgd0[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col0); - dgd0[1] = vandq_s16(dgd0[1], mask); - dgd0[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col0); - dgd0[2] = vandq_s16(dgd0[2], mask); - dgd0[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col0); - dgd0[3] = vandq_s16(dgd0[3], mask); - dgd0[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col0); - dgd0[4] = vandq_s16(dgd0[4], mask); - - // Perform computation of the first column with itself (15 elements). - // For the first column this will fill the upper triangle of the 5x5 - // matrix at the top left of the H matrix. For the next columns this - // will fill the upper triangle of the other 5x5 matrices around H's - // diagonal. - compute_H_one_col_last_row(dgd0, col0, H, wiener_win, wiener_win2, - last_row_downsample_factor); - - // All computation next to the matrix diagonal has already been done. - for (int col1 = col0 + 1; col1 < wiener_win; col1++) { - // Load second column and scale based on downsampling factor. 
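The win5 kernel removed above (and its win7 counterpart) accumulates the Wiener-filter statistics: M is the cross-correlation of each (dgd - avg) window tap with (src - avg), and H is the auto-covariance of the window taps, with rows visited every downsample_factor rows and weighted accordingly. The plain-C sketch below shows what is being accumulated, including the smaller weight given to the leftover rows at the bottom; the M/H layout is simplified and the function name is illustrative, not libaom's reference implementation.

#include <stdint.h>
#include <string.h>

// Scalar sketch: accumulate Wiener statistics over a width x height region.
// dgd_avg/src_avg hold (sample - average) values; dgd_avg points at the
// top-left of the padded window region. Rows are visited every
// downsample_factor rows and weighted by the factor; the leftover rows at
// the bottom are visited once and weighted by how many rows they stand in
// for.
static void wiener_stats_sketch(const int16_t *dgd_avg, int dgd_stride,
                                const int16_t *src_avg, int src_stride,
                                int width, int height, int wiener_win,
                                int downsample_factor, int64_t *M,
                                int64_t *H) {
  const int wiener_win2 = wiener_win * wiener_win;
  memset(M, 0, sizeof(*M) * wiener_win2);
  memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2);

  for (int row = 0; row < height; row += downsample_factor) {
    const int rows_left = height - row;
    const int weight =
        rows_left < downsample_factor ? rows_left : downsample_factor;

    for (int j = 0; j < width; j++) {
      const int32_t s = src_avg[j] * weight;
      for (int k = 0; k < wiener_win2; k++) {
        const int16_t dk =
            dgd_avg[(k / wiener_win) * dgd_stride + j + (k % wiener_win)];
        M[k] += (int64_t)dk * s;
        for (int l = k; l < wiener_win2; l++) {
          const int16_t dl =
              dgd_avg[(l / wiener_win) * dgd_stride + j + (l % wiener_win)];
          const int64_t v = (int64_t)dk * dl * weight;
          H[k * wiener_win2 + l] += v;
          if (l != k) H[l * wiener_win2 + k] += v;  // H is symmetric.
        }
      }
    }
    dgd_avg += downsample_factor * dgd_stride;
    src_avg += downsample_factor * src_stride;
  }
}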
- int16x8_t dgd1[5]; - dgd1[0] = vld1q_s16(dgd_avg + 0 * dgd_avg_stride + j + col1); - dgd1[0] = vmulq_n_s16(dgd1[0], last_row_downsample_factor); - dgd1[1] = vld1q_s16(dgd_avg + 1 * dgd_avg_stride + j + col1); - dgd1[1] = vmulq_n_s16(dgd1[1], last_row_downsample_factor); - dgd1[2] = vld1q_s16(dgd_avg + 2 * dgd_avg_stride + j + col1); - dgd1[2] = vmulq_n_s16(dgd1[2], last_row_downsample_factor); - dgd1[3] = vld1q_s16(dgd_avg + 3 * dgd_avg_stride + j + col1); - dgd1[3] = vmulq_n_s16(dgd1[3], last_row_downsample_factor); - dgd1[4] = vld1q_s16(dgd_avg + 4 * dgd_avg_stride + j + col1); - dgd1[4] = vmulq_n_s16(dgd1[4], last_row_downsample_factor); - - // Compute all elements from the combination of both columns (25 - // elements). - compute_H_two_cols(dgd0, dgd1, col0, col1, H, wiener_win, - wiener_win2); - } - } - } - } - - // Transpose M_trn. - transpose_M_win5(M, M_trn, 5); - - // Copy upper triangle of H in the lower one. - copy_upper_triangle(H, wiener_win2); -} - void av1_compute_stats_neon(int wiener_win, const uint8_t *dgd, const uint8_t *src, int16_t *dgd_avg, int16_t *src_avg, int h_start, int h_end, @@ -981,23 +922,18 @@ int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats) { assert(wiener_win == WIENER_WIN || wiener_win == WIENER_WIN_CHROMA); + assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4); + (void)dgd_avg; + (void)src_avg; const int wiener_win2 = wiener_win * wiener_win; const int wiener_halfwin = wiener_win >> 1; - const int32_t width = h_end - h_start; - const int32_t height = v_end - v_start; - const uint8_t *dgd_start = &dgd[v_start * dgd_stride + h_start]; - memset(H, 0, sizeof(*H) * wiener_win2 * wiener_win2); - - uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); - assert(WIENER_STATS_DOWNSAMPLE_FACTOR == 4); - int downsample_factor = - use_downsampled_wiener_stats ? WIENER_STATS_DOWNSAMPLE_FACTOR : 1; + const int width = h_end - h_start; + const int height = v_end - v_start; - int dgd_avg_stride = width + 2 * wiener_halfwin; - int src_avg_stride = width; + const uint8_t *dgd_start = dgd + h_start + v_start * dgd_stride; + const uint8_t *src_start = src + h_start + v_start * src_stride; - // Compute (dgd - avg) and store it in dgd_avg. // The wiener window will slide along the dgd frame, centered on each pixel. // For the top left pixel and all the pixels on the side of the frame this // means half of the window will be outside of the frame. As such the actual @@ -1006,27 +942,47 @@ const int vert_offset = v_start - wiener_halfwin; const int horiz_offset = h_start - wiener_halfwin; const uint8_t *dgd_win = dgd + horiz_offset + vert_offset * dgd_stride; - compute_sub_avg(dgd_win, dgd_stride, avg, dgd_avg, dgd_avg_stride, - width + 2 * wiener_halfwin, height + 2 * wiener_halfwin, 1); - // Compute (src - avg), downsample if necessary and store in src-avg. - const uint8_t *src_start = src + h_start + v_start * src_stride; - compute_sub_avg(src_start, src_stride * downsample_factor, avg, src_avg, - src_avg_stride, width, height, downsample_factor); + uint8_t avg = find_average_neon(dgd_start, dgd_stride, width, height); // Since the height is not necessarily a multiple of the downsample factor, // the last line of src will be scaled according to how many rows remain. - int last_row_downsample_factor = - use_downsampled_wiener_stats ? height % downsample_factor : 1; + int downsample_factor = + use_downsampled_wiener_stats ? 
WIENER_STATS_DOWNSAMPLE_FACTOR : 1; - if (wiener_win == WIENER_WIN) { - compute_stats_win7_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, - width, v_start, v_end, M, H, downsample_factor, - last_row_downsample_factor); - } else { - compute_stats_win5_neon(dgd_avg, dgd_avg_stride, src_avg, src_avg_stride, - width, v_start, v_end, M, H, downsample_factor, - last_row_downsample_factor); + int downsampled_height = height / downsample_factor; + int downsample_remainder = height % downsample_factor; + + memset(M, 0, wiener_win2 * sizeof(*M)); + memset(H, 0, wiener_win2 * wiener_win2 * sizeof(*H)); + + // Calculate the M and H matrices for the normal and downsampled cases. + if (downsampled_height > 0) { + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } else { + compute_stats_win5_neon(dgd_win, src_start, width, downsampled_height, + dgd_stride, src_stride, avg, M, H, + downsample_factor); + } + } + + // Accumulate the remaining last rows in the downsampled case. + if (downsample_remainder > 0) { + int remainder_offset = height - downsample_remainder; + if (wiener_win == WIENER_WIN) { + compute_stats_win7_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } else { + compute_stats_win5_neon(dgd_win + remainder_offset * dgd_stride, + src_start + remainder_offset * src_stride, width, + 1, dgd_stride, src_stride, avg, M, H, + downsample_remainder); + } } } diff -Nru aom-3.8.2/av1/encoder/arm/neon/pickrst_neon.h aom-3.9.0/av1/encoder/arm/neon/pickrst_neon.h --- aom-3.8.2/av1/encoder/arm/neon/pickrst_neon.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/pickrst_neon.h 2024-05-07 19:57:03.003000000 +0000 @@ -14,268 +14,175 @@ #include -#include "aom_dsp/arm/sum_neon.h" -#include "aom_dsp/arm/transpose_neon.h" +#include "av1/common/restoration.h" -// When we load 8 values of int16_t type and need less than 8 values for -// processing, the below mask is used to make the extra values zero. -extern const int16_t av1_neon_mask_16bit[16]; - -static INLINE void copy_upper_triangle(int64_t *H, const int wiener_win2) { - for (int i = 0; i < wiener_win2 - 2; i = i + 2) { - // Transpose the first 2x2 square. It needs a special case as the element - // of the bottom left is on the diagonal. - int64x2_t row0 = vld1q_s64(H + i * wiener_win2 + i + 1); - int64x2_t row1 = vld1q_s64(H + (i + 1) * wiener_win2 + i + 1); - - int64x2_t tr_row = aom_vtrn2q_s64(row0, row1); - - vst1_s64(H + (i + 1) * wiener_win2 + i, vget_low_s64(row0)); - vst1q_s64(H + (i + 2) * wiener_win2 + i, tr_row); - - // Transpose and store all the remaining 2x2 squares of the line. - for (int j = i + 3; j < wiener_win2; j = j + 2) { - row0 = vld1q_s64(H + i * wiener_win2 + j); - row1 = vld1q_s64(H + (i + 1) * wiener_win2 + j); - - int64x2_t tr_row0 = aom_vtrn1q_s64(row0, row1); - int64x2_t tr_row1 = aom_vtrn2q_s64(row0, row1); - - vst1q_s64(H + j * wiener_win2 + i, tr_row0); - vst1q_s64(H + (j + 1) * wiener_win2 + i, tr_row1); +// Aligned sizes for Wiener filters. 
+#define WIENER_WIN2_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2, 2) +#define WIENER_WIN2_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2, 3) +#define WIENER_WIN2_REDUCED ((WIENER_WIN_REDUCED) * (WIENER_WIN_REDUCED)) +#define WIENER_WIN2_REDUCED_ALIGN2 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 2) +#define WIENER_WIN2_REDUCED_ALIGN3 ALIGN_POWER_OF_TWO(WIENER_WIN2_REDUCED, 3) + +// Compute 8 values of M (cross correlation) for a single source pixel and +// accumulate. +static INLINE void update_M_1pixel(int32_t *M_s32, int16x4_t src_avg, + int16x8_t dgd_avg) { + int32x4_t lo = vld1q_s32(M_s32 + 0); + int32x4_t hi = vld1q_s32(M_s32 + 4); + + lo = vmlal_s16(lo, vget_low_s16(dgd_avg), src_avg); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg), src_avg); + + vst1q_s32(M_s32 + 0, lo); + vst1q_s32(M_s32 + 4, hi); +} + +// Compute 8 values of M (cross correlation) for two source pixels and +// accumulate. +static INLINE void update_M_2pixels(int32_t *M_s32, int16x4_t src_avg0, + int16x4_t src_avg1, int16x8_t dgd_avg0, + int16x8_t dgd_avg1) { + int32x4_t lo = vld1q_s32(M_s32 + 0); + int32x4_t hi = vld1q_s32(M_s32 + 4); + + lo = vmlal_s16(lo, vget_low_s16(dgd_avg0), src_avg0); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg0), src_avg0); + lo = vmlal_s16(lo, vget_low_s16(dgd_avg1), src_avg1); + hi = vmlal_s16(hi, vget_high_s16(dgd_avg1), src_avg1); + + vst1q_s32(M_s32 + 0, lo); + vst1q_s32(M_s32 + 4, hi); +} + +static INLINE void update_H_1pixel(int32_t *H_s32, const int16_t *dgd_avg, + int width, int height) { + for (int i = 0; i < height; i += 4) { + int16x4_t di = vld1_s16(dgd_avg + i); + + for (int j = i; j < width; j += 4) { + int16x4_t dj = vld1_s16(dgd_avg + j); + int32x4_t h0 = vld1q_s32(H_s32 + 0 * width + j); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * width + j); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * width + j); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * width + j); + + h0 = vmlal_lane_s16(h0, dj, di, 0); + h1 = vmlal_lane_s16(h1, dj, di, 1); + h2 = vmlal_lane_s16(h2, dj, di, 2); + h3 = vmlal_lane_s16(h3, dj, di, 3); + + vst1q_s32(H_s32 + 0 * width + j, h0); + vst1q_s32(H_s32 + 1 * width + j, h1); + vst1q_s32(H_s32 + 2 * width + j, h2); + vst1q_s32(H_s32 + 3 * width + j, h3); } + H_s32 += 4 * width; } } -static INLINE void transpose_M_win5(int64_t *M, int64_t *M_trn, - const int wiener_win) { - // 1st and 2nd rows. - int64x2_t row00 = vld1q_s64(M_trn); - int64x2_t row10 = vld1q_s64(M_trn + wiener_win); - vst1q_s64(M, aom_vtrn1q_s64(row00, row10)); - vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10)); - - int64x2_t row02 = vld1q_s64(M_trn + 2); - int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2); - vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12)); - vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12)); - - // Last column only needs trn2. - int64x2_t row03 = vld1q_s64(M_trn + 3); - int64x2_t row13 = vld1q_s64(M_trn + wiener_win + 3); - vst1q_s64(M + 4 * wiener_win, aom_vtrn2q_s64(row03, row13)); - - // 3rd and 4th rows. - int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win); - int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win); - vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30)); - vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30)); - - int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2); - int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2); - vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32)); - vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32)); - - // Last column only needs trn2. 
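The replacement helpers above accumulate H in 32-bit lanes with vmlal_lane_s16, updating a 4x4 tile of the matrix per call; accumulate_and_clear (defined further down in this header) periodically folds those 32-bit partial sums into the 64-bit M/H arrays before they can overflow. A distilled sketch of the per-tile update, with hypothetical names:

#include <arm_neon.h>
#include <stdint.h>

// Update a 4x4 tile of the auto-covariance accumulator: rows i..i+3 and
// columns j..j+3, where di holds window taps i..i+3 and dj holds taps
// j..j+3. Each output row is dj scaled by one lane of di, added to the
// existing 32-bit sums.
static inline void h_tile_update_sketch(int32_t *h, int h_stride,
                                        int16x4_t di, int16x4_t dj) {
  int32x4_t h0 = vld1q_s32(h + 0 * h_stride);
  int32x4_t h1 = vld1q_s32(h + 1 * h_stride);
  int32x4_t h2 = vld1q_s32(h + 2 * h_stride);
  int32x4_t h3 = vld1q_s32(h + 3 * h_stride);

  h0 = vmlal_lane_s16(h0, dj, di, 0);  // h[i+0][j..j+3] += di[0] * dj
  h1 = vmlal_lane_s16(h1, dj, di, 1);  // h[i+1][j..j+3] += di[1] * dj
  h2 = vmlal_lane_s16(h2, dj, di, 2);
  h3 = vmlal_lane_s16(h3, dj, di, 3);

  vst1q_s32(h + 0 * h_stride, h0);
  vst1q_s32(h + 1 * h_stride, h1);
  vst1q_s32(h + 2 * h_stride, h2);
  vst1q_s32(h + 3 * h_stride, h3);
}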
- int64x2_t row23 = vld1q_s64(M_trn + 2 * wiener_win + 3); - int64x2_t row33 = vld1q_s64(M_trn + 3 * wiener_win + 3); - vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn2q_s64(row23, row33)); - - // Last row. - int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win); - vst1_s64(M + 4, vget_low_s64(row40)); - vst1_s64(M + 1 * wiener_win + 4, vget_high_s64(row40)); - - int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2); - vst1_s64(M + 2 * wiener_win + 4, vget_low_s64(row42)); - vst1_s64(M + 3 * wiener_win + 4, vget_high_s64(row42)); - - // Element on the bottom right of M_trn is copied as is. - vst1_s64(M + 4 * wiener_win + 4, vld1_s64(M_trn + 4 * wiener_win + 4)); -} - -static INLINE void transpose_M_win7(int64_t *M, int64_t *M_trn, - const int wiener_win) { - // 1st and 2nd rows. - int64x2_t row00 = vld1q_s64(M_trn); - int64x2_t row10 = vld1q_s64(M_trn + wiener_win); - vst1q_s64(M, aom_vtrn1q_s64(row00, row10)); - vst1q_s64(M + wiener_win, aom_vtrn2q_s64(row00, row10)); - - int64x2_t row02 = vld1q_s64(M_trn + 2); - int64x2_t row12 = vld1q_s64(M_trn + wiener_win + 2); - vst1q_s64(M + 2 * wiener_win, aom_vtrn1q_s64(row02, row12)); - vst1q_s64(M + 3 * wiener_win, aom_vtrn2q_s64(row02, row12)); - - int64x2_t row04 = vld1q_s64(M_trn + 4); - int64x2_t row14 = vld1q_s64(M_trn + wiener_win + 4); - vst1q_s64(M + 4 * wiener_win, aom_vtrn1q_s64(row04, row14)); - vst1q_s64(M + 5 * wiener_win, aom_vtrn2q_s64(row04, row14)); - - // Last column only needs trn2. - int64x2_t row05 = vld1q_s64(M_trn + 5); - int64x2_t row15 = vld1q_s64(M_trn + wiener_win + 5); - vst1q_s64(M + 6 * wiener_win, aom_vtrn2q_s64(row05, row15)); - - // 3rd and 4th rows. - int64x2_t row20 = vld1q_s64(M_trn + 2 * wiener_win); - int64x2_t row30 = vld1q_s64(M_trn + 3 * wiener_win); - vst1q_s64(M + 2, aom_vtrn1q_s64(row20, row30)); - vst1q_s64(M + wiener_win + 2, aom_vtrn2q_s64(row20, row30)); - - int64x2_t row22 = vld1q_s64(M_trn + 2 * wiener_win + 2); - int64x2_t row32 = vld1q_s64(M_trn + 3 * wiener_win + 2); - vst1q_s64(M + 2 * wiener_win + 2, aom_vtrn1q_s64(row22, row32)); - vst1q_s64(M + 3 * wiener_win + 2, aom_vtrn2q_s64(row22, row32)); - - int64x2_t row24 = vld1q_s64(M_trn + 2 * wiener_win + 4); - int64x2_t row34 = vld1q_s64(M_trn + 3 * wiener_win + 4); - vst1q_s64(M + 4 * wiener_win + 2, aom_vtrn1q_s64(row24, row34)); - vst1q_s64(M + 5 * wiener_win + 2, aom_vtrn2q_s64(row24, row34)); - - // Last column only needs trn2. - int64x2_t row25 = vld1q_s64(M_trn + 2 * wiener_win + 5); - int64x2_t row35 = vld1q_s64(M_trn + 3 * wiener_win + 5); - vst1q_s64(M + 6 * wiener_win + 2, aom_vtrn2q_s64(row25, row35)); - - // 5th and 6th rows. - int64x2_t row40 = vld1q_s64(M_trn + 4 * wiener_win); - int64x2_t row50 = vld1q_s64(M_trn + 5 * wiener_win); - vst1q_s64(M + 4, aom_vtrn1q_s64(row40, row50)); - vst1q_s64(M + wiener_win + 4, aom_vtrn2q_s64(row40, row50)); - - int64x2_t row42 = vld1q_s64(M_trn + 4 * wiener_win + 2); - int64x2_t row52 = vld1q_s64(M_trn + 5 * wiener_win + 2); - vst1q_s64(M + 2 * wiener_win + 4, aom_vtrn1q_s64(row42, row52)); - vst1q_s64(M + 3 * wiener_win + 4, aom_vtrn2q_s64(row42, row52)); - - int64x2_t row44 = vld1q_s64(M_trn + 4 * wiener_win + 4); - int64x2_t row54 = vld1q_s64(M_trn + 5 * wiener_win + 4); - vst1q_s64(M + 4 * wiener_win + 4, aom_vtrn1q_s64(row44, row54)); - vst1q_s64(M + 5 * wiener_win + 4, aom_vtrn2q_s64(row44, row54)); - - // Last column only needs trn2. 
- int64x2_t row45 = vld1q_s64(M_trn + 4 * wiener_win + 5); - int64x2_t row55 = vld1q_s64(M_trn + 5 * wiener_win + 5); - vst1q_s64(M + 6 * wiener_win + 4, aom_vtrn2q_s64(row45, row55)); - - // Last row. - int64x2_t row60 = vld1q_s64(M_trn + 6 * wiener_win); - vst1_s64(M + 6, vget_low_s64(row60)); - vst1_s64(M + 1 * wiener_win + 6, vget_high_s64(row60)); - - int64x2_t row62 = vld1q_s64(M_trn + 6 * wiener_win + 2); - vst1_s64(M + 2 * wiener_win + 6, vget_low_s64(row62)); - vst1_s64(M + 3 * wiener_win + 6, vget_high_s64(row62)); - - int64x2_t row64 = vld1q_s64(M_trn + 6 * wiener_win + 4); - vst1_s64(M + 4 * wiener_win + 6, vget_low_s64(row64)); - vst1_s64(M + 5 * wiener_win + 6, vget_high_s64(row64)); - - // Element on the bottom right of M_trn is copied as is. - vst1_s64(M + 6 * wiener_win + 6, vld1_s64(M_trn + 6 * wiener_win + 6)); -} - -static INLINE void compute_M_one_row_win5(int16x8_t src, int16x8_t dgd0, - int16x8_t dgd1, int64_t *M, - const int wiener_win, int row) { - int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0); - int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2); - - int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0)); - m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0)); - - int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1); - int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01)); - m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01)); - - m0 = horizontal_add_2d_s32(m0, m1); - m_01 = vpadalq_s32(m_01, m0); - vst1q_s64(M + row * wiener_win + 0, m_01); - - int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2); - int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02)); - m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02)); - - int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3); - int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03)); - m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03)); - - m2 = horizontal_add_2d_s32(m2, m3); - m_23 = vpadalq_s32(m_23, m2); - vst1q_s64(M + row * wiener_win + 2, m_23); - - int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4); - int32x4_t m4 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd04)); - m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04)); - M[row * wiener_win + 4] += horizontal_long_add_s32x4(m4); -} - -static INLINE void compute_M_one_row_win7(int16x8_t src, int16x8_t dgd0, - int16x8_t dgd1, int64_t *M, - const int wiener_win, int row) { - int64x2_t m_01 = vld1q_s64(M + row * wiener_win + 0); - int64x2_t m_23 = vld1q_s64(M + row * wiener_win + 2); - int64x2_t m_45 = vld1q_s64(M + row * wiener_win + 4); - - int32x4_t m0 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd0)); - m0 = vmlal_s16(m0, vget_high_s16(src), vget_high_s16(dgd0)); - - int16x8_t dgd01 = vextq_s16(dgd0, dgd1, 1); - int32x4_t m1 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd01)); - m1 = vmlal_s16(m1, vget_high_s16(src), vget_high_s16(dgd01)); - - m0 = horizontal_add_2d_s32(m0, m1); - m_01 = vpadalq_s32(m_01, m0); - vst1q_s64(M + row * wiener_win + 0, m_01); - - int16x8_t dgd02 = vextq_s16(dgd0, dgd1, 2); - int32x4_t m2 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd02)); - m2 = vmlal_s16(m2, vget_high_s16(src), vget_high_s16(dgd02)); - - int16x8_t dgd03 = vextq_s16(dgd0, dgd1, 3); - int32x4_t m3 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd03)); - m3 = vmlal_s16(m3, vget_high_s16(src), vget_high_s16(dgd03)); - - m2 = horizontal_add_2d_s32(m2, m3); - m_23 = vpadalq_s32(m_23, m2); - vst1q_s64(M + row * wiener_win + 2, m_23); - - int16x8_t dgd04 = vextq_s16(dgd0, dgd1, 4); - int32x4_t m4 = 
vmull_s16(vget_low_s16(src), vget_low_s16(dgd04)); - m4 = vmlal_s16(m4, vget_high_s16(src), vget_high_s16(dgd04)); - - int16x8_t dgd05 = vextq_s16(dgd0, dgd1, 5); - int32x4_t m5 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd05)); - m5 = vmlal_s16(m5, vget_high_s16(src), vget_high_s16(dgd05)); - - m4 = horizontal_add_2d_s32(m4, m5); - m_45 = vpadalq_s32(m_45, m4); - vst1q_s64(M + row * wiener_win + 4, m_45); - - int16x8_t dgd06 = vextq_s16(dgd0, dgd1, 6); - int32x4_t m6 = vmull_s16(vget_low_s16(src), vget_low_s16(dgd06)); - m6 = vmlal_s16(m6, vget_high_s16(src), vget_high_s16(dgd06)); - M[row * wiener_win + 6] += horizontal_long_add_s32x4(m6); +static INLINE void update_H_5x5_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 24; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + for (int j = i + 0; j < WIENER_WIN2_REDUCED_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j); + + h0 = vmlal_lane_s16(h0, dj0, di0, 0); + h0 = vmlal_lane_s16(h0, dj1, di1, 0); + h1 = vmlal_lane_s16(h1, dj0, di0, 1); + h1 = vmlal_lane_s16(h1, dj1, di1, 1); + h2 = vmlal_lane_s16(h2, dj0, di0, 2); + h2 = vmlal_lane_s16(h2, dj1, di1, 2); + h3 = vmlal_lane_s16(h3, dj0, di0, 3); + h3 = vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_REDUCED_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_REDUCED_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_REDUCED_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_REDUCED_ALIGN2 + j, h3); + } + H_s32 += 4 * WIENER_WIN2_REDUCED_ALIGN2; + } } -static INLINE void compute_H_two_cols(int16x8_t *dgd0, int16x8_t *dgd1, - int col0, int col1, int64_t *H, - const int wiener_win, - const int wiener_win2) { - for (int row0 = 0; row0 < wiener_win; row0++) { - for (int row1 = 0; row1 < wiener_win; row1++) { - int auto_cov_idx = - (col0 * wiener_win + row0) * wiener_win2 + (col1 * wiener_win) + row1; - - int32x4_t auto_cov = - vmull_s16(vget_low_s16(dgd0[row0]), vget_low_s16(dgd1[row1])); - auto_cov = vmlal_s16(auto_cov, vget_high_s16(dgd0[row0]), - vget_high_s16(dgd1[row1])); - - H[auto_cov_idx] += horizontal_long_add_s32x4(auto_cov); +static INLINE void update_H_7x7_2pixels(int32_t *H_s32, const int16_t *dgd_avg0, + const int16_t *dgd_avg1) { + for (int i = 0; i < 48; i += 4) { + int16x4_t di0 = vld1_s16(dgd_avg0 + i); + int16x4_t di1 = vld1_s16(dgd_avg1 + i); + + int32x4_t h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i); + int32x4_t h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i); + + h0 = vmlal_lane_s16(h0, di0, di0, 0); + h0 = vmlal_lane_s16(h0, di1, di1, 0); + h1 = vmlal_lane_s16(h1, di0, di0, 1); + h1 = vmlal_lane_s16(h1, di1, di1, 1); + h2 = vmlal_lane_s16(h2, di0, di0, 2); + h2 = vmlal_lane_s16(h2, di1, di1, 2); + h3 = vmlal_lane_s16(h3, di0, di0, 3); + h3 = vmlal_lane_s16(h3, di1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + i, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + i, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + i, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + i, 
h3); + + for (int j = i + 4; j < WIENER_WIN2_ALIGN2; j += 4) { + int16x4_t dj0 = vld1_s16(dgd_avg0 + j); + int16x4_t dj1 = vld1_s16(dgd_avg1 + j); + h0 = vld1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j); + h1 = vld1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j); + h2 = vld1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j); + h3 = vld1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j); + + h0 = vmlal_lane_s16(h0, dj0, di0, 0); + h0 = vmlal_lane_s16(h0, dj1, di1, 0); + h1 = vmlal_lane_s16(h1, dj0, di0, 1); + h1 = vmlal_lane_s16(h1, dj1, di1, 1); + h2 = vmlal_lane_s16(h2, dj0, di0, 2); + h2 = vmlal_lane_s16(h2, dj1, di1, 2); + h3 = vmlal_lane_s16(h3, dj0, di0, 3); + h3 = vmlal_lane_s16(h3, dj1, di1, 3); + + vst1q_s32(H_s32 + 0 * WIENER_WIN2_ALIGN2 + j, h0); + vst1q_s32(H_s32 + 1 * WIENER_WIN2_ALIGN2 + j, h1); + vst1q_s32(H_s32 + 2 * WIENER_WIN2_ALIGN2 + j, h2); + vst1q_s32(H_s32 + 3 * WIENER_WIN2_ALIGN2 + j, h3); } + H_s32 += 4 * WIENER_WIN2_ALIGN2; } } +// Widen 32-bit src data and accumulate into 64-bit dst. Clear src data. +static INLINE void accumulate_and_clear(int64_t *dst, int32_t *src, + int length) { + do { + int32x4_t s32 = vld1q_s32(src); + vst1q_s32(src, vdupq_n_s32(0)); + src += 4; + + int64x2_t d_lo = vld1q_s64(dst + 0); + int64x2_t d_hi = vld1q_s64(dst + 2); + + d_lo = vaddw_s32(d_lo, vget_low_s32(s32)); + d_hi = vaddw_s32(d_hi, vget_high_s32(s32)); + + vst1q_s64(dst + 0, d_lo); + vst1q_s64(dst + 2, d_hi); + + dst += 4; + length -= 4; + } while (length > 0); +} + #endif // AOM_AV1_ENCODER_ARM_NEON_PICKRST_NEON_H_ diff -Nru aom-3.8.2/av1/encoder/arm/neon/reconinter_enc_neon.c aom-3.9.0/av1/encoder/arm/neon/reconinter_enc_neon.c --- aom-3.8.2/av1/encoder/arm/neon/reconinter_enc_neon.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/reconinter_enc_neon.c 2024-05-07 19:57:03.006000000 +0000 @@ -222,8 +222,7 @@ int i = height / 2; do { uint16x4_t r = load_u16_2x2(ref, ref_stride); - store_u16_2x1(comp_pred + 0 * width, r, 0); - store_u16_2x1(comp_pred + 1 * width, r, 1); + store_u16x2_strided_x2(comp_pred, width, r); ref += 2 * ref_stride; comp_pred += 2 * width; } while (--i != 0); diff -Nru aom-3.8.2/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c aom-3.9.0/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c --- aom-3.8.2/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/temporal_filter_neon_dotprod.c 2024-05-07 19:57:03.008000000 +0000 @@ -23,7 +23,15 @@ #define SSE_STRIDE (BW + 4) // clang-format off +// Table used to pad the first and last columns and apply the sliding window. +DECLARE_ALIGNED(16, static const uint8_t, kLoadPad[4][16]) = { + { 2, 2, 2, 3, 4, 255, 255, 255, 255, 2, 2, 3, 4, 5, 255, 255 }, + { 255, 255, 2, 3, 4, 5, 6, 255, 255, 255, 255, 3, 4, 5, 6, 7 }, + { 0, 1, 2, 3, 4, 255, 255, 255, 255, 1, 2, 3, 4, 5, 255, 255 }, + { 255, 255, 2, 3, 4, 5, 5, 255, 255, 255, 255, 3, 4, 5, 5, 5 } +}; +// For columns that don't need to be padded it's just a simple mask. 
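The kLoadPad tables above exploit the TBL semantics behind vqtbl1q_u8: in-range index bytes gather from the source vector, while out-of-range indices (255 here) produce zero, so a single lookup both replicates the edge pixels for the first/last columns and clears the unused sliding-window lanes that previously needed per-lane vset_lane_u8 fixups plus a separate mask. A minimal sketch of that building block (illustrative name; AArch64 only):

#include <arm_neon.h>
#include <stdint.h>

// Pad-and-mask one 8-pixel row load with a single table lookup. Index bytes
// < 16 select from the duplicated source; index 255 yields zero.
static inline uint8x16_t pad_row_with_table(const uint8_t *src,
                                            uint8x16_t pad_tbl) {
  const uint8x8_t s = vld1_u8(src);
  const uint8x16_t s_dup = vcombine_u8(s, s);
  return vqtbl1q_u8(s_dup, pad_tbl);
}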
DECLARE_ALIGNED(16, static const uint8_t, kSlidingWindowMask[]) = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, @@ -56,22 +64,6 @@ } while (++i < block_height); } -static INLINE uint8x16_t load_and_pad(const uint8_t *src, const uint32_t col, - const uint32_t block_width) { - uint8x8_t s = vld1_u8(src); - - if (col == 0) { - const uint8_t lane2 = vget_lane_u8(s, 2); - s = vset_lane_u8(lane2, s, 0); - s = vset_lane_u8(lane2, s, 1); - } else if (col >= block_width - 4) { - const uint8_t lane5 = vget_lane_u8(s, 5); - s = vset_lane_u8(lane5, s, 6); - s = vset_lane_u8(lane5, s, 7); - } - return vcombine_u8(s, s); -} - static void apply_temporal_filter( const uint8_t *frame, const unsigned int stride, const uint32_t block_width, const uint32_t block_height, const int *subblock_mses, @@ -84,6 +76,10 @@ uint32_t acc_5x5_neon[BH][BW]; const uint8x16x2_t vmask = vld1q_u8_x2(kSlidingWindowMask); + const uint8x16_t pad_tbl0 = vld1q_u8(kLoadPad[0]); + const uint8x16_t pad_tbl1 = vld1q_u8(kLoadPad[1]); + const uint8x16_t pad_tbl2 = vld1q_u8(kLoadPad[2]); + const uint8x16_t pad_tbl3 = vld1q_u8(kLoadPad[3]); // Traverse 4 columns at a time - first and last two columns need padding. for (uint32_t col = 0; col < block_width; col += 4) { @@ -92,9 +88,18 @@ // Load, pad (for first and last two columns) and mask 3 rows from the top. for (int i = 2; i < 5; i++) { - const uint8x16_t s = load_and_pad(src, col, block_width); - vsrc[i][0] = vandq_u8(s, vmask.val[0]); - vsrc[i][1] = vandq_u8(s, vmask.val[1]); + uint8x8_t s = vld1_u8(src); + uint8x16_t s_dup = vcombine_u8(s, s); + if (col == 0) { + vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl0); + vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl1); + } else if (col >= block_width - 4) { + vsrc[i][0] = vqtbl1q_u8(s_dup, pad_tbl2); + vsrc[i][1] = vqtbl1q_u8(s_dup, pad_tbl3); + } else { + vsrc[i][0] = vandq_u8(s_dup, vmask.val[0]); + vsrc[i][1] = vandq_u8(s_dup, vmask.val[1]); + } src += SSE_STRIDE; } @@ -130,9 +135,18 @@ if (row <= block_height - 4) { // Load next row into the bottom of the sliding window. - uint8x16_t s = load_and_pad(src, col, block_width); - vsrc[4][0] = vandq_u8(s, vmask.val[0]); - vsrc[4][1] = vandq_u8(s, vmask.val[1]); + uint8x8_t s = vld1_u8(src); + uint8x16_t s_dup = vcombine_u8(s, s); + if (col == 0) { + vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl0); + vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl1); + } else if (col >= block_width - 4) { + vsrc[4][0] = vqtbl1q_u8(s_dup, pad_tbl2); + vsrc[4][1] = vqtbl1q_u8(s_dup, pad_tbl3); + } else { + vsrc[4][0] = vandq_u8(s_dup, vmask.val[0]); + vsrc[4][1] = vandq_u8(s_dup, vmask.val[1]); + } src += SSE_STRIDE; } else { // Pad the bottom 2 rows. diff -Nru aom-3.8.2/av1/encoder/arm/neon/wedge_utils_sve.c aom-3.9.0/av1/encoder/arm/neon/wedge_utils_sve.c --- aom-3.8.2/av1/encoder/arm/neon/wedge_utils_sve.c 1970-01-01 00:00:00.000000000 +0000 +++ aom-3.9.0/av1/encoder/arm/neon/wedge_utils_sve.c 2024-05-07 19:57:03.009000000 +0000 @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2024, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
+ */ + +#include +#include + +#include "aom_dsp/arm/aom_neon_sve_bridge.h" +#include "aom_dsp/arm/sum_neon.h" +#include "av1/common/reconinter.h" + +uint64_t av1_wedge_sse_from_residuals_sve(const int16_t *r1, const int16_t *d, + const uint8_t *m, int N) { + assert(N % 64 == 0); + + // Predicate pattern with first 8 elements true. + const svbool_t pattern = svptrue_pat_b16(SV_VL8); + int64x2_t sse[2] = { vdupq_n_s64(0), vdupq_n_s64(0) }; + + int i = 0; + do { + int32x4_t sum[4]; + int16x8_t sum_s16[2]; + + const int16x8_t r1_l = vld1q_s16(r1 + i); + const int16x8_t r1_h = vld1q_s16(r1 + i + 8); + const int16x8_t d_l = vld1q_s16(d + i); + const int16x8_t d_h = vld1q_s16(d + i + 8); + + // Use a zero-extending load to widen the vector elements. + const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m + i)); + const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + i + 8)); + + sum[0] = vshll_n_s16(vget_low_s16(r1_l), WEDGE_WEIGHT_BITS); + sum[1] = vshll_n_s16(vget_high_s16(r1_l), WEDGE_WEIGHT_BITS); + sum[2] = vshll_n_s16(vget_low_s16(r1_h), WEDGE_WEIGHT_BITS); + sum[3] = vshll_n_s16(vget_high_s16(r1_h), WEDGE_WEIGHT_BITS); + + sum[0] = vmlal_s16(sum[0], vget_low_s16(m_l), vget_low_s16(d_l)); + sum[1] = vmlal_s16(sum[1], vget_high_s16(m_l), vget_high_s16(d_l)); + sum[2] = vmlal_s16(sum[2], vget_low_s16(m_h), vget_low_s16(d_h)); + sum[3] = vmlal_s16(sum[3], vget_high_s16(m_h), vget_high_s16(d_h)); + + sum_s16[0] = vcombine_s16(vqmovn_s32(sum[0]), vqmovn_s32(sum[1])); + sum_s16[1] = vcombine_s16(vqmovn_s32(sum[2]), vqmovn_s32(sum[3])); + + sse[0] = aom_sdotq_s16(sse[0], sum_s16[0], sum_s16[0]); + sse[1] = aom_sdotq_s16(sse[1], sum_s16[1], sum_s16[1]); + + i += 16; + } while (i < N); + + const uint64_t csse = + (uint64_t)horizontal_add_s64x2(vaddq_s64(sse[0], sse[1])); + return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); +} + +int8_t av1_wedge_sign_from_residuals_sve(const int16_t *ds, const uint8_t *m, + int N, int64_t limit) { + assert(N % 16 == 0); + + // Predicate pattern with first 8 elements true. + svbool_t pattern = svptrue_pat_b16(SV_VL8); + int64x2_t acc_l = vdupq_n_s64(0); + int64x2_t acc_h = vdupq_n_s64(0); + + do { + const int16x8_t ds_l = vld1q_s16(ds); + const int16x8_t ds_h = vld1q_s16(ds + 8); + + // Use a zero-extending load to widen the vector elements. 
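For reference, av1_wedge_sse_from_residuals_sve in this new file computes the rounded sum of squared weighted residuals, with each combined term saturated to int16 range — that is what the vqmovn_s32 narrowing implements before the dot product. A scalar sketch of the math, assuming WEDGE_WEIGHT_BITS is 6 (its value in av1/common/reconinter.h):

#include <stdint.h>

#define WEDGE_WEIGHT_BITS 6  // Assumed value from av1/common/reconinter.h.

// Scalar sketch of the weighted-wedge SSE: combine r1 (shifted up by the
// weight precision) with the mask-weighted difference d, saturate to int16,
// square, accumulate, then round the total back down.
static uint64_t wedge_sse_sketch(const int16_t *r1, const int16_t *d,
                                 const uint8_t *m, int N) {
  uint64_t csse = 0;
  for (int i = 0; i < N; i++) {
    int32_t t = (r1[i] * (1 << WEDGE_WEIGHT_BITS)) + m[i] * d[i];
    if (t > INT16_MAX) t = INT16_MAX;
    if (t < INT16_MIN) t = INT16_MIN;
    csse += (uint64_t)((int64_t)t * t);
  }
  const uint64_t round = UINT64_C(1) << (2 * WEDGE_WEIGHT_BITS - 1);
  return (csse + round) >> (2 * WEDGE_WEIGHT_BITS);
}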
+ const int16x8_t m_l = svget_neonq_s16(svld1ub_s16(pattern, m)); + const int16x8_t m_h = svget_neonq_s16(svld1ub_s16(pattern, m + 8)); + + acc_l = aom_sdotq_s16(acc_l, ds_l, m_l); + acc_h = aom_sdotq_s16(acc_h, ds_h, m_h); + + ds += 16; + m += 16; + N -= 16; + } while (N != 0); + + const int64x2_t sum = vaddq_s64(acc_l, acc_h); + return horizontal_add_s64x2(sum) > limit; +} diff -Nru aom-3.8.2/av1/encoder/av1_quantize.c aom-3.9.0/av1/encoder/av1_quantize.c --- aom-3.8.2/av1/encoder/av1_quantize.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/av1_quantize.c 2024-05-07 19:57:03.014000000 +0000 @@ -15,6 +15,7 @@ #include "aom_dsp/quantize.h" #include "aom_mem/aom_mem.h" +#include "aom_ports/bitops.h" #include "aom_ports/mem.h" #include "av1/common/idct.h" @@ -581,7 +582,7 @@ uint32_t t; int l, m; t = d; - for (l = 0; t > 1; l++) t >>= 1; + l = get_msb(t); m = 1 + (1 << (16 + l)) / d; *quant = (int16_t)(m - (1 << 16)); *shift = 1 << (16 - l); diff -Nru aom-3.8.2/av1/encoder/av1_temporal_denoiser.c aom-3.9.0/av1/encoder/av1_temporal_denoiser.c --- aom-3.8.2/av1/encoder/av1_temporal_denoiser.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/av1_temporal_denoiser.c 2024-05-07 19:57:03.016000000 +0000 @@ -489,7 +489,7 @@ &denoiser->running_avg_y[fb_idx], cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; @@ -577,7 +577,7 @@ fail = aom_alloc_frame_buffer( &denoiser->running_avg_y[i + denoiser->num_ref_frames * layer], denoise_width, denoise_height, ssx, ssy, use_highbitdepth, border, - legacy_byte_alignment, 0, 0); + legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; @@ -589,7 +589,7 @@ fail = aom_alloc_frame_buffer( &denoiser->mc_running_avg_y[layer], denoise_width, denoise_height, ssx, - ssy, use_highbitdepth, border, legacy_byte_alignment, 0, 0); + ssy, use_highbitdepth, border, legacy_byte_alignment, false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; @@ -600,7 +600,7 @@ // layer. 
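In the av1_quantize.c hunk above, the linear shift loop is replaced by get_msb() from aom_ports/bitops.h, which returns the index of the most significant set bit, i.e. floor(log2(t)) for t >= 1, so the reciprocal-quantizer setup is unchanged. A small sketch of the equivalence (the builtin below is a GCC/Clang illustration, not necessarily what libaom uses internally):

#include <assert.h>
#include <stdint.h>

// floor(log2(t)) two ways: the removed shift loop vs. a count-leading-zeros
// formulation equivalent to get_msb(). Both require t >= 1.
static int msb_by_loop(uint32_t t) {
  int l = 0;
  while (t > 1) {
    t >>= 1;
    l++;
  }
  return l;
}

static int msb_by_clz(uint32_t t) {
  assert(t >= 1);
  return 31 - __builtin_clz(t);  // Single instruction on x86 and Arm.
}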
fail = aom_alloc_frame_buffer(&denoiser->last_source, width, height, ssx, ssy, use_highbitdepth, border, legacy_byte_alignment, - 0, 0); + false, 0); if (fail) { av1_denoiser_free(denoiser); return 1; diff -Nru aom-3.8.2/av1/encoder/bitstream.c aom-3.9.0/av1/encoder/bitstream.c --- aom-3.8.2/av1/encoder/bitstream.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/bitstream.c 2024-05-07 19:57:03.024000000 +0000 @@ -3642,7 +3642,9 @@ mode_bc.allow_update_cdf && !cm->features.disable_cdf_update; aom_start_encode(&mode_bc, buf->data + data_offset); write_modes(cpi, &cpi->td, &tile_info, &mode_bc, tile_row, tile_col); - aom_stop_encode(&mode_bc); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(cm->error, AOM_CODEC_ERROR, "Error writing modes"); + } tile_size = mode_bc.pos; buf->size = tile_size; @@ -3778,7 +3780,10 @@ // Pack tile data aom_start_encode(&mode_bc, pack_bs_params->dst + *total_size); write_modes(cpi, td, &tile_info, &mode_bc, tile_row, tile_col); - aom_stop_encode(&mode_bc); + if (aom_stop_encode(&mode_bc) < 0) { + aom_internal_error(td->mb.e_mbd.error_info, AOM_CODEC_ERROR, + "Error writing modes"); + } tile_size = mode_bc.pos; assert(tile_size >= AV1_MIN_TILE_SIZE_BYTES); diff -Nru aom-3.8.2/av1/encoder/bitstream.h aom-3.9.0/av1/encoder/bitstream.h --- aom-3.8.2/av1/encoder/bitstream.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/bitstream.h 2024-05-07 19:57:03.031000000 +0000 @@ -21,6 +21,7 @@ #include "av1/common/enums.h" #include "av1/encoder/level.h" #include "aom_dsp/bitwriter.h" +#include "aom_util/aom_pthread.h" struct aom_write_bit_buffer; struct AV1_COMP; diff -Nru aom-3.8.2/av1/encoder/block.h aom-3.9.0/av1/encoder/block.h --- aom-3.8.2/av1/encoder/block.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/block.h 2024-05-07 19:57:03.032000000 +0000 @@ -1348,6 +1348,9 @@ //! Motion vector from superblock MV derived from int_pro_motion() in // the variance_partitioning. int_mv sb_me_mv; + //! Flag to indicate if a fixed partition should be used, only if the + // speed feature rt_sf->use_fast_fixed_part is enabled. + int sb_force_fixed_part; //! SSE of the current predictor. unsigned int pred_sse[REF_FRAMES]; //! Prediction for ML based partition. diff -Nru aom-3.8.2/av1/encoder/cnn.c aom-3.9.0/av1/encoder/cnn.c --- aom-3.8.2/av1/encoder/cnn.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/cnn.c 2024-05-07 19:57:03.035000000 +0000 @@ -31,13 +31,9 @@ int th_step; } CONVOLVE_OPS; -typedef float (*activation_fn)(float); +static INLINE float softsign(float x) { return x / (fabsf(x) + 1.0f); } -static float softsign(float x) { return x / (float)(fabsf(x) + 1.0); } - -static float relu(float x) { return (x < 0) ? 0 : x; } - -static float identity(float x) { return x; } +static INLINE float relu(float x) { return (x < 0) ? 
0 : x; } typedef struct { int allocsize; @@ -291,18 +287,6 @@ } } -activation_fn get_activation(ACTIVATION layer_activation) { - switch (layer_activation) { - case NONE: return identity; - case RELU: return relu; - case SOFTSIGN: return softsign; - case SIGMOID: - assert(0 && "Sigmoid has not been supported in CNN."); // TO DO - return NULL; - default: assert(0 && "Unknown activation type"); return NULL; - } -} - static INLINE int get_start_shift_convolve(int width, int filt_width, int stride) { const int mod = (width % stride); @@ -322,11 +306,22 @@ void av1_cnn_activate_c(float **output, int channels, int width, int height, int stride, ACTIVATION layer_activation) { - activation_fn activation = get_activation(layer_activation); - for (int c = 0; c < channels; ++c) { - for (int i = 0; i < height; ++i) - for (int j = 0; j < width; ++j) - output[c][i * stride + j] = activation(output[c][i * stride + j]); + if (layer_activation == RELU) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = relu(output[c][i * stride + j]); + } + } else if (layer_activation == SOFTSIGN) { + for (int c = 0; c < channels; ++c) { + for (int i = 0; i < height; ++i) + for (int j = 0; j < width; ++j) + output[c][i * stride + j] = softsign(output[c][i * stride + j]); + } + } else if (layer_activation == SIGMOID) { + assert(0 && "Sigmoid has not been supported in CNN."); // TO DO + } else if (layer_activation != NONE) { + assert(0 && "Unknown activation type"); } } @@ -1013,10 +1008,9 @@ } // Non-linearity - if (layer_config->activation != IDENTITY) - av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, - tensor2[branch].width, tensor2[branch].height, - tensor2[branch].stride, layer_config->activation); + av1_cnn_activate(tensor2[branch].buf, tensor2[branch].channels, + tensor2[branch].width, tensor2[branch].height, + tensor2[branch].stride, layer_config->activation); if (layer_config->bn_params.bn_gamma) { av1_cnn_batchnorm( diff -Nru aom-3.8.2/av1/encoder/context_tree.c aom-3.9.0/av1/encoder/context_tree.c --- aom-3.8.2/av1/encoder/context_tree.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/context_tree.c 2024-05-07 19:57:03.043000000 +0000 @@ -248,11 +248,11 @@ if (!keep_best && !keep_none) aom_free(pc_tree); } -void av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { +int av1_setup_sms_tree(AV1_COMP *const cpi, ThreadData *td) { // The structure 'sms_tree' is used to store the simple motion search data for // partition pruning in inter frames. Hence, the memory allocations and // initializations related to it are avoided for allintra encoding mode. 
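av1_setup_sms_tree now reports allocation failure through its return value rather than aborting inside CHECK_MEM_ERROR, leaving the caller to surface the error. A generic sketch of that pattern (names hypothetical, not libaom code):

#include <stdlib.h>

// Helper returns 0 on success, -1 on allocation failure; the caller decides
// how to report the error instead of the helper aborting internally.
static int setup_nodes(int **nodes, size_t count) {
  *nodes = (int *)calloc(count, sizeof(**nodes));
  return *nodes ? 0 : -1;
}

int main(void) {
  int *nodes = NULL;
  if (setup_nodes(&nodes, 1024) != 0) return EXIT_FAILURE;
  free(nodes);
  return EXIT_SUCCESS;
}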
- if (cpi->oxcf.kf_cfg.key_freq_max == 0) return; + if (cpi->oxcf.kf_cfg.key_freq_max == 0) return 0; AV1_COMMON *const cm = &cpi->common; const int stat_generation_stage = is_stat_generation_stage(cpi); @@ -265,8 +265,9 @@ int nodes; aom_free(td->sms_tree); - CHECK_MEM_ERROR(cm, td->sms_tree, - aom_calloc(tree_nodes, sizeof(*td->sms_tree))); + td->sms_tree = + (SIMPLE_MOTION_DATA_TREE *)aom_calloc(tree_nodes, sizeof(*td->sms_tree)); + if (!td->sms_tree) return -1; this_sms = &td->sms_tree[0]; if (!stat_generation_stage) { @@ -301,6 +302,7 @@ // Set up the root node for the largest superblock size td->sms_root = &td->sms_tree[tree_nodes - 1]; + return 0; } void av1_free_sms_tree(ThreadData *td) { diff -Nru aom-3.8.2/av1/encoder/context_tree.h aom-3.9.0/av1/encoder/context_tree.h --- aom-3.8.2/av1/encoder/context_tree.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/context_tree.h 2024-05-07 19:57:03.043000000 +0000 @@ -131,7 +131,8 @@ return tree_nodes; } -void av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td); +// Returns 0 on success, -1 on memory allocation failure. +int av1_setup_sms_tree(struct AV1_COMP *const cpi, struct ThreadData *td); void av1_free_sms_tree(struct ThreadData *td); #ifdef __cplusplus diff -Nru aom-3.8.2/av1/encoder/encode_strategy.c aom-3.9.0/av1/encoder/encode_strategy.c --- aom-3.8.2/av1/encoder/encode_strategy.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encode_strategy.c 2024-05-07 19:57:03.118000000 +0000 @@ -237,10 +237,24 @@ // Clear down mmx registers - if (cpi->ppi->use_svc && cpi->svc.spatial_layer_id > 0) { - cpi->framerate = cpi->svc.base_framerate; - av1_rc_update_framerate(cpi, cpi->common.width, cpi->common.height); - return; + if (cpi->ppi->use_svc && cpi->ppi->rtc_ref.set_ref_frame_config && + cpi->svc.number_spatial_layers > 1) { + // ts_start is the timestamp for the current frame and ts_end is the + // expected next timestamp given the duration passed into codec_encode(). + // See the setting in encoder_encode() in av1_cx_iface.c: + // ts_start = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol), + // ts_end = timebase_units_to_ticks(cpi_data.timestamp_ratio, ptsvol + + // duration). So the difference ts_end - ts_start is the duration passed + // in by the user. For spatial layers SVC set the framerate based directly + // on the duration, and bypass the adjustments below. 
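The new SVC branch described above derives the frame rate directly from the duration supplied by the application; the 10000000.0 constant corresponds to the encoder's 10 MHz timestamp ticks. A small worked example:

#include <stdio.h>

int main(void) {
  // Timestamps are in 1/10,000,000-second ticks, so a per-frame duration of
  // ~33.3 ms maps straight to ~30 fps.
  const long long ts_start = 0;
  const long long ts_end = 333333;
  const double framerate = 10000000.0 / (double)(ts_end - ts_start);
  printf("%.3f fps\n", framerate);  // Prints 30.000
  return 0;
}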
+ this_duration = ts_end - ts_start; + if (this_duration > 0) { + cpi->new_framerate = 10000000.0 / this_duration; + av1_new_framerate(cpi, cpi->new_framerate); + time_stamps->prev_ts_start = ts_start; + time_stamps->prev_ts_end = ts_end; + return; + } } if (ts_start == time_stamps->first_ts_start) { @@ -805,7 +819,7 @@ oxcf->frm_dim_cfg.height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, - NULL, cpi->image_pyramid_levels, 0); + NULL, cpi->alloc_pyramid, 0); if (ret) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate tf_buf_second_arf"); @@ -909,7 +923,7 @@ if (apply_filtering && is_psnr_calc_enabled(cpi)) { cpi->source = av1_realloc_and_scale_if_required( cm, source_buffer, &cpi->scaled_source, cm->features.interp_filter, 0, - false, true, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + false, true, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); cpi->unscaled_source = source_buffer; } #if CONFIG_COLLECT_COMPONENT_TIMING @@ -1688,8 +1702,7 @@ // This is used in rtc temporal filter case. Use true source in the PSNR // calculation. - if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && - cpi->common.current_frame.frame_type != KEY_FRAME) { + if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf) { assert(cpi->orig_source.buffer_alloc_sz > 0); cpi->source = &cpi->orig_source; } @@ -1744,9 +1757,9 @@ cpi->svc.temporal_layer_id == 0 && cpi->unscaled_source->y_width == cpi->svc.source_last_TL0.y_width && cpi->unscaled_source->y_height == cpi->svc.source_last_TL0.y_height) { - aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0); - aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0); - aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0); + aom_yv12_copy_y(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); + aom_yv12_copy_u(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); + aom_yv12_copy_v(cpi->unscaled_source, &cpi->svc.source_last_TL0, 1); } return AOM_CODEC_OK; diff -Nru aom-3.8.2/av1/encoder/encodeframe.c aom-3.9.0/av1/encoder/encodeframe.c --- aom-3.8.2/av1/encoder/encodeframe.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encodeframe.c 2024-05-07 19:57:03.127000000 +0000 @@ -23,7 +23,7 @@ #include "aom_dsp/binary_codes_writer.h" #include "aom_ports/mem.h" #include "aom_ports/aom_timer.h" - +#include "aom_util/aom_pthread.h" #if CONFIG_MISMATCH_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_MISMATCH_DEBUG @@ -535,11 +535,19 @@ } #endif // Set the partition - if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip) { + if (sf->part_sf.partition_search_type == FIXED_PARTITION || seg_skip || + (sf->rt_sf.use_fast_fixed_part && x->sb_force_fixed_part == 1 && + (!frame_is_intra_only(cm) && + (!cpi->ppi->use_svc || + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame)))) { // set a fixed-size partition av1_set_offsets(cpi, tile_info, x, mi_row, mi_col, sb_size); - const BLOCK_SIZE bsize = - seg_skip ? sb_size : sf->part_sf.fixed_partition_size; + BLOCK_SIZE bsize_select = sf->part_sf.fixed_partition_size; + if (sf->rt_sf.use_fast_fixed_part && + x->content_state_sb.source_sad_nonrd < kLowSad) { + bsize_select = BLOCK_64X64; + } + const BLOCK_SIZE bsize = seg_skip ? 
sb_size : bsize_select; av1_set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize); } else if (sf->part_sf.partition_search_type == VAR_BASED_PARTITION) { // set a variance-based partition @@ -1048,8 +1056,13 @@ // The threshold is determined based on kLowSad and kHighSad threshold and // test results. - const uint64_t thresh_low = 15000; - const uint64_t thresh_high = 40000; + uint64_t thresh_low = 15000; + uint64_t thresh_high = 40000; + + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + thresh_low = thresh_low << 1; + thresh_high = thresh_high << 1; + } if (avg_64x64_blk_sad > thresh_low && avg_64x64_blk_sad < thresh_high) { do_calc_src_content = false; @@ -1197,6 +1210,7 @@ x->sb_me_block = 0; x->sb_me_partition = 0; x->sb_me_mv.as_int = 0; + x->sb_force_fixed_part = 1; if (cpi->oxcf.mode == ALLINTRA) { x->intra_sb_rdmult_modifier = 128; @@ -1225,7 +1239,7 @@ // Grade the temporal variation of the sb, the grade will be used to decide // fast mode search strategy for coding blocks - grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); + if (!seg_skip) grade_source_content_sb(cpi, x, tile_data, mi_row, mi_col); // encode the superblock if (use_nonrd_mode) { @@ -1267,17 +1281,32 @@ void av1_alloc_tile_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; av1_row_mt_mem_dealloc(cpi); aom_free(cpi->tile_data); + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; + CHECK_MEM_ERROR( cm, cpi->tile_data, aom_memalign(32, tile_cols * tile_rows * sizeof(*cpi->tile_data))); cpi->allocated_tiles = tile_cols * tile_rows; + enc_row_mt->allocated_tile_cols = tile_cols; + enc_row_mt->allocated_tile_rows = tile_rows; + for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { + for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { + const int tile_index = tile_row * tile_cols + tile_col; + TileDataEnc *const this_tile = &cpi->tile_data[tile_index]; + av1_zero(this_tile->row_mt_sync); + this_tile->row_ctx = NULL; + } + } } void av1_init_tile_data(AV1_COMP *cpi) { @@ -1568,20 +1597,12 @@ // High Latency: Turn off skip mode if all refs are fwd. if (cpi->all_one_sided_refs && cpi->oxcf.gf_cfg.lag_in_frames > 0) return 0; - static const int flag_list[REF_FRAMES] = { 0, - AOM_LAST_FLAG, - AOM_LAST2_FLAG, - AOM_LAST3_FLAG, - AOM_GOLD_FLAG, - AOM_BWD_FLAG, - AOM_ALT2_FLAG, - AOM_ALT_FLAG }; const int ref_frame[2] = { cm->current_frame.skip_mode_info.ref_frame_idx_0 + LAST_FRAME, cm->current_frame.skip_mode_info.ref_frame_idx_1 + LAST_FRAME }; - if (!(cpi->ref_frame_flags & flag_list[ref_frame[0]]) || - !(cpi->ref_frame_flags & flag_list[ref_frame[1]])) + if (!(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[0]]) || + !(cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame[1]])) return 0; return 1; @@ -2324,7 +2345,7 @@ // a source or a ref frame should have an image pyramid allocated. 
// Check here so that issues can be caught early in debug mode #if !defined(NDEBUG) && !CONFIG_REALTIME_ONLY - if (cpi->image_pyramid_levels > 0) { + if (cpi->alloc_pyramid) { assert(cpi->source->y_pyramid); for (int ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { const RefCntBuffer *const buf = get_ref_frame_buf(cm, ref_frame); diff -Nru aom-3.8.2/av1/encoder/encodeframe_utils.c aom-3.9.0/av1/encoder/encodeframe_utils.c --- aom-3.8.2/av1/encoder/encodeframe_utils.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encodeframe_utils.c 2024-05-07 19:57:03.132000000 +0000 @@ -15,6 +15,7 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/encodeframe_utils.h" +#include "av1/encoder/encoder_utils.h" #include "av1/encoder/rdopt.h" void av1_set_ssim_rdmult(const AV1_COMP *const cpi, int *errorperbit, @@ -306,6 +307,7 @@ // Else for cyclic refresh mode update the segment map, set the segment id // and then update the quantizer. if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && + mi_addr->segment_id != AM_SEGMENT_ID_INACTIVE && !cpi->rc.rtc_external_ratectrl) { av1_cyclic_refresh_update_segment(cpi, x, mi_row, mi_col, bsize, ctx->rd_stats.rate, ctx->rd_stats.dist, @@ -1398,6 +1400,11 @@ 36000 }; // ~3*3*(64*64) uint64_t avg_source_sse_threshold_high = 1000000; // ~15*15*(64*64) + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + avg_source_sse_threshold_high = avg_source_sse_threshold_high << 1; + avg_source_sse_threshold_low[0] = avg_source_sse_threshold_low[0] << 1; + avg_source_sse_threshold_verylow = avg_source_sse_threshold_verylow << 1; + } uint64_t sum_sq_thresh = 10000; // sum = sqrt(thresh / 64*64)) ~1.5 src_y += src_offset; last_src_y += last_src_offset; @@ -1426,6 +1433,10 @@ if ((tmp_sse - tmp_variance) < (sum_sq_thresh >> 1)) x->content_state_sb.low_sumdiff = 1; + if (tmp_sse > ((avg_source_sse_threshold_high * 7) >> 3) && + !x->content_state_sb.lighting_change && !x->content_state_sb.low_sumdiff) + x->sb_force_fixed_part = 0; + if (!cpi->sf.rt_sf.use_rtc_tf || cpi->rc.high_source_sad || cpi->rc.frame_source_sad > 20000 || cpi->svc.number_spatial_layers > 1) return; @@ -1753,6 +1764,11 @@ void av1_alloc_src_diff_buf(const struct AV1Common *cm, struct macroblock *mb) { const int num_planes = av1_num_planes(cm); +#ifndef NDEBUG + for (int plane = 0; plane < num_planes; ++plane) { + assert(!mb->plane[plane].src_diff); + } +#endif for (int plane = 0; plane < num_planes; ++plane) { const int subsampling_xy = plane ? 
cm->seq_params->subsampling_x + cm->seq_params->subsampling_y diff -Nru aom-3.8.2/av1/encoder/encoder.c aom-3.9.0/av1/encoder/encoder.c --- aom-3.8.2/av1/encoder/encoder.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encoder.c 2024-05-07 19:57:03.144000000 +0000 @@ -35,6 +35,7 @@ #include "aom_ports/aom_timer.h" #include "aom_ports/mem.h" #include "aom_scale/aom_scale.h" +#include "aom_util/aom_pthread.h" #if CONFIG_BITSTREAM_DEBUG #include "aom_util/debug_util.h" #endif // CONFIG_BITSTREAM_DEBUG @@ -152,24 +153,33 @@ unsigned char *const active_map_4x4 = cpi->active_map.map; const int mi_rows = mi_params->mi_rows; const int mi_cols = mi_params->mi_cols; - const int row_scale = mi_size_high_log2[BLOCK_16X16]; - const int col_scale = mi_size_wide_log2[BLOCK_16X16]; cpi->active_map.update = 0; + cpi->rc.percent_blocks_inactive = 0; assert(mi_rows % 2 == 0); assert(mi_cols % 2 == 0); if (new_map_16x16) { - for (int r = 0; r < (mi_rows >> row_scale); ++r) { - for (int c = 0; c < (mi_cols >> col_scale); ++c) { - const uint8_t val = new_map_16x16[r * cols + c] + int num_samples = 0; + int num_blocks_inactive = 0; + for (int r = 0; r < mi_rows; r += 4) { + for (int c = 0; c < mi_cols; c += 4) { + const uint8_t val = new_map_16x16[(r >> 2) * cols + (c >> 2)] ? AM_SEGMENT_ID_ACTIVE : AM_SEGMENT_ID_INACTIVE; - active_map_4x4[(2 * r + 0) * mi_cols + (c + 0)] = val; - active_map_4x4[(2 * r + 0) * mi_cols + (c + 1)] = val; - active_map_4x4[(2 * r + 1) * mi_cols + (c + 0)] = val; - active_map_4x4[(2 * r + 1) * mi_cols + (c + 1)] = val; + num_samples++; + if (val == AM_SEGMENT_ID_INACTIVE) num_blocks_inactive++; + const int row_max = AOMMIN(4, mi_rows - r); + const int col_max = AOMMIN(4, mi_cols - c); + for (int x = 0; x < row_max; ++x) { + for (int y = 0; y < col_max; ++y) { + active_map_4x4[(r + x) * mi_cols + (c + y)] = val; + } + } } } cpi->active_map.enabled = 1; + cpi->active_map.update = 1; + cpi->rc.percent_blocks_inactive = + (num_blocks_inactive * 100) / num_samples; } return 0; } @@ -642,14 +652,12 @@ cm->height = oxcf->frm_dim_cfg.height; cpi->is_dropped_frame = false; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - initial_dimensions->width = cm->width; - initial_dimensions->height = cm->height; + alloc_compressor_data(cpi); + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; - alloc_compressor_data(cpi); - // Single thread case: use counts in common. 
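The active-map hunk above expands the caller's map of 16x16 units onto 4x4 mi units, clamping the fan-out at the right and bottom frame edges, and records the share of inactive blocks so that later stages can compare it against rt_sf.thresh_active_maps_skip_lf_cdef and skip loop filtering / CDEF. A self-contained sketch of that mapping (segment-ID values are stand-ins, not libaom's):

#include <stdint.h>

// Stand-in segment IDs for the sketch.
enum { SEG_ACTIVE = 0, SEG_INACTIVE = 1 };

// Expand a rows/cols map of 16x16 units onto a 4x4-unit (mi) grid, clamping
// the 4x4 fan-out at the right/bottom edges, and return the percentage of
// inactive 16x16 samples.
static int expand_active_map_sketch(const uint8_t *map_16x16, int cols,
                                    int mi_rows, int mi_cols,
                                    uint8_t *map_4x4) {
  int num_samples = 0, num_inactive = 0;
  for (int r = 0; r < mi_rows; r += 4) {
    for (int c = 0; c < mi_cols; c += 4) {
      const uint8_t val =
          map_16x16[(r >> 2) * cols + (c >> 2)] ? SEG_ACTIVE : SEG_INACTIVE;
      num_samples++;
      if (val == SEG_INACTIVE) num_inactive++;
      const int row_max = (mi_rows - r < 4) ? mi_rows - r : 4;
      const int col_max = (mi_cols - c < 4) ? mi_cols - c : 4;
      for (int x = 0; x < row_max; ++x) {
        for (int y = 0; y < col_max; ++y) {
          map_4x4[(r + x) * mi_cols + (c + y)] = val;
        }
      }
    }
  }
  return (num_inactive * 100) / num_samples;  // percent of blocks inactive
}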
cpi->td.counts = &cpi->counts; @@ -773,7 +781,6 @@ PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; MACROBLOCK *const x = &cpi->td.mb; AV1LevelParams *const level_params = &cpi->ppi->level_params; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; const FrameDimensionCfg *const frm_dim_cfg = &cpi->oxcf.frm_dim_cfg; const RateControlCfg *const rc_cfg = &oxcf->rc_cfg; @@ -913,8 +920,8 @@ cm->width = frm_dim_cfg->width; cm->height = frm_dim_cfg->height; - if (cm->width > initial_dimensions->width || - cm->height > initial_dimensions->height || is_sb_size_changed) { + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height || is_sb_size_changed) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); @@ -922,8 +929,8 @@ cpi->td.firstpass_ctx = NULL; alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); - initial_dimensions->width = cm->width; - initial_dimensions->height = cm->height; + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; } av1_update_frame_size(cpi); @@ -946,14 +953,9 @@ #if CONFIG_REALTIME_ONLY assert(!oxcf->tool_cfg.enable_global_motion); - cpi->image_pyramid_levels = 0; + cpi->alloc_pyramid = false; #else - if (oxcf->tool_cfg.enable_global_motion) { - cpi->image_pyramid_levels = - global_motion_pyr_levels[default_global_motion_method]; - } else { - cpi->image_pyramid_levels = 0; - } + cpi->alloc_pyramid = oxcf->tool_cfg.enable_global_motion; #endif // CONFIG_REALTIME_ONLY } @@ -1501,6 +1503,7 @@ cpi->mb_weber_stats = NULL; cpi->mb_delta_q = NULL; cpi->palette_pixel_num = 0; + cpi->scaled_last_source_available = 0; { const BLOCK_SIZE bsize = BLOCK_16X16; @@ -2072,8 +2075,8 @@ // TODO(chengchen): consider renaming this function as it is necessary // for the encoder to setup critical parameters, and it does not // deal with initial width any longer. -void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, - int subsampling_x, int subsampling_y) { +aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y) { AV1_COMMON *const cm = &cpi->common; SequenceHeader *const seq_params = cm->seq_params; @@ -2090,7 +2093,8 @@ if (!is_stat_generation_stage(cpi)) { #if !CONFIG_REALTIME_ONLY - av1_tf_info_alloc(&cpi->ppi->tf_info, cpi); + if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) + return AOM_CODEC_MEM_ERROR; #endif // !CONFIG_REALTIME_ONLY } init_ref_frame_bufs(cpi); @@ -2100,6 +2104,7 @@ cpi->initial_mbs = cm->mi_params.MBs; cpi->frame_size_related_setup_done = true; } + return AOM_CODEC_OK; } #if CONFIG_AV1_TEMPORAL_DENOISING @@ -2119,12 +2124,14 @@ #endif // Returns 1 if the assigned width or height was <= 0. 
-int av1_set_size_literal(AV1_COMP *cpi, int width, int height) { +static int set_size_literal(AV1_COMP *cpi, int width, int height) { AV1_COMMON *cm = &cpi->common; - InitialDimensions *const initial_dimensions = &cpi->initial_dimensions; - av1_check_initial_width(cpi, cm->seq_params->use_highbitdepth, - cm->seq_params->subsampling_x, - cm->seq_params->subsampling_y); + aom_codec_err_t err = av1_check_initial_width( + cpi, cm->seq_params->use_highbitdepth, cm->seq_params->subsampling_x, + cm->seq_params->subsampling_y); + if (err != AOM_CODEC_OK) { + aom_internal_error(cm->error, err, "av1_check_initial_width() failed"); + } if (width <= 0 || height <= 0) return 1; @@ -2135,8 +2142,8 @@ setup_denoiser_buffer(cpi); #endif - if (cm->width > initial_dimensions->width || - cm->height > initial_dimensions->height) { + if (cm->width > cpi->data_alloc_width || + cm->height > cpi->data_alloc_height) { av1_free_context_buffers(cm); av1_free_shared_coeff_buffer(&cpi->td.shared_coeff_buf); av1_free_sms_tree(&cpi->td); @@ -2144,8 +2151,8 @@ cpi->td.firstpass_ctx = NULL; alloc_compressor_data(cpi); realloc_segmentation_maps(cpi); - initial_dimensions->width = cm->width; - initial_dimensions->height = cm->height; + cpi->data_alloc_width = cm->width; + cpi->data_alloc_height = cm->height; cpi->frame_size_related_setup_done = false; } alloc_mb_mode_info_buffers(cpi); @@ -2163,7 +2170,7 @@ if (width != cm->width || height != cm->height) { // There has been a change in the encoded frame size - av1_set_size_literal(cpi, width, height); + set_size_literal(cpi, width, height); // Recalculate 'all_lossless' in case super-resolution was (un)selected. cm->features.all_lossless = cm->features.coded_lossless && !av1_superres_scaled(cm); @@ -2206,7 +2213,7 @@ &cm->cur_frame->buf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, - NULL, cpi->image_pyramid_levels, 0)) + NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -2387,7 +2394,10 @@ const int use_loopfilter = is_loopfilter_used(cm) && !cpi->mt_info.pipeline_lpf_mt_with_enc; - const int use_cdef = is_cdef_used(cm); + const int use_cdef = + is_cdef_used(cm) && (!cpi->active_map.enabled || + cpi->rc.percent_blocks_inactive <= + cpi->sf.rt_sf.thresh_active_maps_skip_lf_cdef); const int use_superres = av1_superres_scaled(cm); const int use_restoration = is_restoration_used(cm); @@ -2473,7 +2483,6 @@ const QuantizationCfg *const q_cfg = &cpi->oxcf.q_cfg; SVC *const svc = &cpi->svc; const int resize_pending = is_frame_resize_pending(cpi); - int top_index = 0, bottom_index = 0, q = 0; YV12_BUFFER_CONFIG *unscaled = cpi->unscaled_source; InterpFilter filter_scaler = @@ -2497,7 +2506,8 @@ &cpi->svc.source_last_TL0, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) { + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, + 0)) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate buffer for source_last_TL0"); } @@ -2546,7 +2556,7 @@ cpi->source = av1_realloc_and_scale_if_required( cm, unscaled, &cpi->scaled_source, filter_scaler, phase_scaler, true, - false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if 
(frame_is_intra_only(cm) || resize_pending != 0) { const int current_size = (cm->mi_params.mi_rows * cm->mi_params.mi_cols) >> 2; @@ -2562,11 +2572,14 @@ memset(cpi->consec_zero_mv, 0, current_size * sizeof(*cpi->consec_zero_mv)); } - if (cpi->unscaled_last_source != NULL) { + if (cpi->scaled_last_source_available) { + cpi->last_source = &cpi->scaled_last_source; + cpi->scaled_last_source_available = 0; + } else if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, filter_scaler, phase_scaler, true, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } if (cpi->sf.rt_sf.use_temporal_noise_estimate) { @@ -2584,13 +2597,16 @@ // av1_scale_references. Note GOLDEN is forced to update on the (first/tigger) // resized frame and ALTREF will be refreshed ~4 frames later, so both // references become available again after few frames. + // For superres: don't disable golden reference. if (svc->number_spatial_layers == 1) { - if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { - const YV12_BUFFER_CONFIG *const ref = - get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); - if (ref == NULL || ref->y_crop_width != cm->width || - ref->y_crop_height != cm->height) { - cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + if (!cpi->oxcf.superres_cfg.enable_superres) { + if (cpi->ref_frame_flags & av1_ref_frame_flag_list[GOLDEN_FRAME]) { + const YV12_BUFFER_CONFIG *const ref = + get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (ref == NULL || ref->y_crop_width != cm->width || + ref->y_crop_height != cm->height) { + cpi->ref_frame_flags ^= AOM_GOLD_FLAG; + } } } if (cpi->ref_frame_flags & av1_ref_frame_flag_list[ALTREF_FRAME]) { @@ -2640,12 +2656,8 @@ av1_setup_frame(cpi); } } - - if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) { - suppress_active_map(cpi); - av1_cyclic_refresh_setup(cpi); - } av1_apply_active_map(cpi); + if (q_cfg->aq_mode == CYCLIC_REFRESH_AQ) av1_cyclic_refresh_setup(cpi); if (cm->seg.enabled) { if (!cm->seg.update_data && cm->prev_frame) { segfeatures_copy(&cm->seg, &cm->prev_frame->seg); @@ -2660,26 +2672,26 @@ cm->cur_frame->seg.enabled = cm->seg.enabled; // This is for rtc temporal filtering case. - if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf && - cm->current_frame.frame_type != KEY_FRAME) { + if (is_psnr_calc_enabled(cpi) && cpi->sf.rt_sf.use_rtc_tf) { const SequenceHeader *seq_params = cm->seq_params; if (cpi->orig_source.buffer_alloc_sz == 0 || - cpi->last_source->y_width != cpi->source->y_width || - cpi->last_source->y_height != cpi->source->y_height) { + cpi->rc.prev_coded_width != cpi->oxcf.frm_dim_cfg.width || + cpi->rc.prev_coded_height != cpi->oxcf.frm_dim_cfg.height) { // Allocate a source buffer to store the true source for psnr calculation. 
if (aom_alloc_frame_buffer( &cpi->orig_source, cpi->oxcf.frm_dim_cfg.width, cpi->oxcf.frm_dim_cfg.height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0)) + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, + 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate scaled buffer"); } - aom_yv12_copy_y(cpi->source, &cpi->orig_source); - aom_yv12_copy_u(cpi->source, &cpi->orig_source); - aom_yv12_copy_v(cpi->source, &cpi->orig_source); + aom_yv12_copy_y(cpi->source, &cpi->orig_source, 1); + aom_yv12_copy_u(cpi->source, &cpi->orig_source, 1); + aom_yv12_copy_v(cpi->source, &cpi->orig_source, 1); } #if CONFIG_COLLECT_COMPONENT_TIMING @@ -2697,13 +2709,32 @@ update_motion_stat(cpi); // Adjust the refresh of the golden (longer-term) reference based on QP - // selected for this frame. This is for CBR with 1 layer/non-svc RTC mode. + // selected for this frame. This is for CBR real-time mode, and only + // for single layer without usage of the set_ref_frame_config (so + // reference structure for 1 layer is set internally). if (!frame_is_intra_only(cm) && cpi->oxcf.rc_cfg.mode == AOM_CBR && cpi->oxcf.mode == REALTIME && svc->number_spatial_layers == 1 && svc->number_temporal_layers == 1 && !cpi->rc.rtc_external_ratectrl && + !cpi->ppi->rtc_ref.set_ref_frame_config && sf->rt_sf.gf_refresh_based_on_qp) av1_adjust_gf_refresh_qp_one_pass_rt(cpi); + // For non-svc: if scaling is required, copy scaled_source + // into scaled_last_source. + if (cm->current_frame.frame_number > 1 && !cpi->ppi->use_svc && + cpi->scaled_source.y_buffer != NULL && + cpi->scaled_last_source.y_buffer != NULL && + cpi->scaled_source.y_crop_width == cpi->scaled_last_source.y_crop_width && + cpi->scaled_source.y_crop_height == + cpi->scaled_last_source.y_crop_height && + (cm->width != cpi->unscaled_source->y_crop_width || + cm->height != cpi->unscaled_source->y_crop_height)) { + cpi->scaled_last_source_available = 1; + aom_yv12_copy_y(&cpi->scaled_source, &cpi->scaled_last_source, 1); + aom_yv12_copy_u(&cpi->scaled_source, &cpi->scaled_last_source, 1); + aom_yv12_copy_v(&cpi->scaled_source, &cpi->scaled_last_source, 1); + } + #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, av1_encode_frame_time); #endif @@ -2820,7 +2851,7 @@ } cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, EIGHTTAP_REGULAR, 0, - false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); #if CONFIG_TUNE_BUTTERAUGLI if (oxcf->tune_cfg.tuning == AOM_TUNE_BUTTERAUGLI) { @@ -2840,7 +2871,7 @@ cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, EIGHTTAP_REGULAR, 0, false, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } int scale_references = 0; @@ -3809,6 +3840,8 @@ if (cpi->sf.rt_sf.disable_cdf_update_non_reference_frame && cpi->ppi->rtc_ref.non_reference_frame && cpi->rc.frames_since_key > 2) features->disable_cdf_update = 1; + else if (cpi->sf.rt_sf.selective_cdf_update) + features->disable_cdf_update = selective_disable_cdf_rtc(cpi); else features->disable_cdf_update = 0; break; @@ -4014,7 +4047,7 @@ } #if CONFIG_DENOISE -static int apply_denoise_2d(AV1_COMP *cpi, YV12_BUFFER_CONFIG *sd, +static int apply_denoise_2d(AV1_COMP *cpi, const YV12_BUFFER_CONFIG *sd, int block_size, float noise_level, int64_t 
time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; @@ -4022,16 +4055,16 @@ cpi->denoise_and_model = aom_denoise_and_model_alloc( cm->seq_params->bit_depth, block_size, noise_level); if (!cpi->denoise_and_model) { - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, - "Error allocating denoise and model"); + aom_set_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating denoise and model"); return -1; } } if (!cpi->film_grain_table) { cpi->film_grain_table = aom_malloc(sizeof(*cpi->film_grain_table)); if (!cpi->film_grain_table) { - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, - "Error allocating grain table"); + aom_set_error(cm->error, AOM_CODEC_MEM_ERROR, + "Error allocating grain table"); return -1; } memset(cpi->film_grain_table, 0, sizeof(*cpi->film_grain_table)); @@ -4049,7 +4082,7 @@ #endif int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + const YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time) { AV1_COMMON *const cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; @@ -4111,10 +4144,8 @@ #endif // CONFIG_DENOISE if (av1_lookahead_push(cpi->ppi->lookahead, sd, time_stamp, end_time, - use_highbitdepth, cpi->image_pyramid_levels, - frame_flags)) { - aom_internal_error(cm->error, AOM_CODEC_ERROR, - "av1_lookahead_push() failed"); + use_highbitdepth, cpi->alloc_pyramid, frame_flags)) { + aom_set_error(cm->error, AOM_CODEC_ERROR, "av1_lookahead_push() failed"); res = -1; } #if CONFIG_INTERNAL_STATS @@ -4131,21 +4162,21 @@ // header. if ((seq_params->profile == PROFILE_0) && !seq_params->monochrome && (subsampling_x != 1 || subsampling_y != 1)) { - aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, - "Non-4:2:0 color format requires profile 1 or 2"); + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Non-4:2:0 color format requires profile 1 or 2"); res = -1; } if ((seq_params->profile == PROFILE_1) && !(subsampling_x == 0 && subsampling_y == 0)) { - aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, - "Profile 1 requires 4:4:4 color format"); + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 1 requires 4:4:4 color format"); res = -1; } if ((seq_params->profile == PROFILE_2) && (seq_params->bit_depth <= AOM_BITS_10) && !(subsampling_x == 1 && subsampling_y == 0)) { - aom_internal_error(cm->error, AOM_CODEC_INVALID_PARAM, - "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); + aom_set_error(cm->error, AOM_CODEC_INVALID_PARAM, + "Profile 2 bit-depth <= 10 requires 4:2:2 color format"); res = -1; } @@ -4696,7 +4727,7 @@ aom_bitstream_queue_set_frame_write(cm->current_frame.frame_number); } #endif - if (cpi->ppi->use_svc && cpi->ppi->number_spatial_layers > 1) { + if (cpi->ppi->use_svc) { av1_one_pass_cbr_svc_start_layer(cpi); } diff -Nru aom-3.8.2/av1/encoder/encoder.h aom-3.9.0/av1/encoder/encoder.h --- aom-3.8.2/av1/encoder/encoder.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encoder.h 2024-05-07 19:57:03.156000000 +0000 @@ -21,6 +21,7 @@ #include "config/aom_config.h" #include "aom/aomcx.h" +#include "aom_util/aom_pthread.h" #include "av1/common/alloccommon.h" #include "av1/common/av1_common_int.h" @@ -1544,6 +1545,13 @@ */ bool firstpass_mt_exit; + /*! + * Initialized to false, set to true in cal_mb_wiener_var_hook() by the worker + * thread that encounters an error in order to abort the processing of other + * worker threads. + */ + bool mb_wiener_mt_exit; + #if CONFIG_MULTITHREAD /*! 
* Mutex lock used while dispatching jobs. @@ -2081,20 +2089,6 @@ } GlobalMotionInfo; /*! - * \brief Initial frame dimensions - * - * Tracks the frame dimensions using which: - * - Frame buffers (like altref and util frame buffers) were allocated - * - Motion estimation related initializations were done - * This structure is helpful to reallocate / reinitialize the above when there - * is a change in frame dimensions. - */ -typedef struct { - int width; /*!< initial width */ - int height; /*!< initial height */ -} InitialDimensions; - -/*! * \brief Flags related to interpolation filter search */ typedef struct { @@ -3163,11 +3157,18 @@ FRAME_INDEX_SET frame_index_set; /*! - * Structure to store the cm->width and cm->height in the last call - * of alloc_compressor_data(). - * TODO(chengchen): rename this variable or delete it. + * Stores the cm->width in the last call of alloc_compressor_data(). Helps + * determine whether compressor data should be reallocated when cm->width + * changes. + */ + int data_alloc_width; + + /*! + * Stores the cm->height in the last call of alloc_compressor_data(). Helps + * determine whether compressor data should be reallocated when cm->height + * changes. */ - InitialDimensions initial_dimensions; + int data_alloc_height; /*! * Number of MBs in the full-size frame; to be used to @@ -3631,10 +3632,10 @@ unsigned int zeromv_skip_thresh_exit_part[BLOCK_SIZES_ALL]; /*! - * Number of downsampling pyramid levels to allocate for each frame + * Should we allocate a downsampling pyramid for each frame buffer? * This is currently only used for global motion */ - int image_pyramid_levels; + bool alloc_pyramid; #if CONFIG_SALIENCY_MAP /*! @@ -3653,6 +3654,12 @@ * fast encoding pass in av1_determine_sc_tools_with_encoding(). */ int palette_pixel_num; + + /*! + * Flag to indicate scaled_last_source is available, + * so scaling is not needed for last_source. + */ + int scaled_last_source_available; } AV1_COMP; /*! @@ -3756,8 +3763,8 @@ void av1_change_config(AV1_COMP *cpi, const AV1EncoderConfig *oxcf, bool sb_size_changed); -void av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, - int subsampling_x, int subsampling_y); +aom_codec_err_t av1_check_initial_width(AV1_COMP *cpi, int use_highbitdepth, + int subsampling_x, int subsampling_y); void av1_init_seq_coding_tools(AV1_PRIMARY *const ppi, const AV1EncoderConfig *oxcf, int use_svc); @@ -3802,7 +3809,7 @@ * copy of the pointer. */ int av1_receive_raw_frame(AV1_COMP *cpi, aom_enc_frame_flags_t frame_flags, - YV12_BUFFER_CONFIG *sd, int64_t time_stamp, + const YV12_BUFFER_CONFIG *sd, int64_t time_stamp, int64_t end_time_stamp); /*!\brief Encode a frame @@ -3822,7 +3829,9 @@ * \retval #AOM_CODEC_OK * \retval -1 * No frame encoded; more input is required. - * \retval #AOM_CODEC_ERROR + * \retval "A nonzero (positive) aom_codec_err_t code" + * The encoding failed with the error. Sets the error code and error message + * in \c cpi->common.error. 
*/ int av1_get_compressed_data(AV1_COMP *cpi, AV1_COMP_DATA *const cpi_data); @@ -3852,8 +3861,6 @@ int av1_set_reference_enc(AV1_COMP *cpi, int idx, YV12_BUFFER_CONFIG *sd); -int av1_set_size_literal(AV1_COMP *cpi, int width, int height); - void av1_set_frame_size(AV1_COMP *cpi, int width, int height); void av1_set_mv_search_params(AV1_COMP *cpi); @@ -4304,7 +4311,7 @@ const AV1_COMMON *const cm = &cpi->common; return cpi->ppi->b_calculate_psnr && !is_stat_generation_stage(cpi) && - cm->show_frame; + cm->show_frame && !cpi->is_dropped_frame; } static INLINE int is_frame_resize_pending(const AV1_COMP *const cpi) { diff -Nru aom-3.8.2/av1/encoder/encoder_alloc.h aom-3.9.0/av1/encoder/encoder_alloc.h --- aom-3.8.2/av1/encoder/encoder_alloc.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encoder_alloc.h 2024-05-07 19:57:03.164000000 +0000 @@ -77,7 +77,10 @@ av1_setup_shared_coeff_buffer(cm->seq_params, &cpi->td.shared_coeff_buf, cm->error); - av1_setup_sms_tree(cpi, &cpi->td); + if (av1_setup_sms_tree(cpi, &cpi->td)) { + aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } cpi->td.firstpass_ctx = av1_alloc_pmc(cpi, BLOCK_16X16, &cpi->td.shared_coeff_buf); if (!cpi->td.firstpass_ctx) @@ -182,11 +185,15 @@ static AOM_INLINE void dealloc_compressor_data(AV1_COMP *cpi) { AV1_COMMON *const cm = &cpi->common; TokenInfo *token_info = &cpi->token_info; + AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; const int num_planes = av1_num_planes(cm); dealloc_context_buffers_ext(&cpi->mbmi_ext_info); aom_free(cpi->tile_data); cpi->tile_data = NULL; + cpi->allocated_tiles = 0; + enc_row_mt->allocated_tile_cols = 0; + enc_row_mt->allocated_tile_rows = 0; // Delete sementation map aom_free(cpi->enc_seg.map); @@ -432,8 +439,7 @@ &cpi->scaled_source, scaled_width, scaled_height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, NULL, NULL, NULL, - cpi->image_pyramid_levels, 0)) + cm->features.byte_alignment, NULL, NULL, NULL, cpi->alloc_pyramid, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); assert(cpi->scaled_source.y_crop_width == scaled_width); @@ -458,61 +464,62 @@ for (int t = 1; t < p_mt_info->num_workers; ++t) { EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[t]; thread_data->td = thread_data->original_td; - aom_free(thread_data->td->tctx); - aom_free(thread_data->td->palette_buffer); - aom_free(thread_data->td->tmp_conv_dst); - release_compound_type_rd_buffers(&thread_data->td->comp_rd_buffer); + ThreadData *const td = thread_data->td; + if (!td) continue; + aom_free(td->tctx); + aom_free(td->palette_buffer); + aom_free(td->tmp_conv_dst); + release_compound_type_rd_buffers(&td->comp_rd_buffer); for (int j = 0; j < 2; ++j) { - aom_free(thread_data->td->tmp_pred_bufs[j]); + aom_free(td->tmp_pred_bufs[j]); } - aom_free(thread_data->td->pixel_gradient_info); - aom_free(thread_data->td->src_var_info_of_4x4_sub_blocks); - release_obmc_buffers(&thread_data->td->obmc_buffer); - aom_free(thread_data->td->vt64x64); + aom_free(td->pixel_gradient_info); + aom_free(td->src_var_info_of_4x4_sub_blocks); + release_obmc_buffers(&td->obmc_buffer); + aom_free(td->vt64x64); for (int x = 0; x < 2; x++) { for (int y = 0; y < 2; y++) { - aom_free(thread_data->td->hash_value_buffer[x][y]); - thread_data->td->hash_value_buffer[x][y] = NULL; + aom_free(td->hash_value_buffer[x][y]); + 
td->hash_value_buffer[x][y] = NULL; } } - aom_free(thread_data->td->mv_costs_alloc); - thread_data->td->mv_costs_alloc = NULL; - aom_free(thread_data->td->dv_costs_alloc); - thread_data->td->dv_costs_alloc = NULL; - aom_free(thread_data->td->counts); - av1_free_pmc(thread_data->td->firstpass_ctx, num_planes); - thread_data->td->firstpass_ctx = NULL; - av1_free_shared_coeff_buffer(&thread_data->td->shared_coeff_buf); - av1_free_sms_tree(thread_data->td); + aom_free(td->mv_costs_alloc); + td->mv_costs_alloc = NULL; + aom_free(td->dv_costs_alloc); + td->dv_costs_alloc = NULL; + aom_free(td->counts); + av1_free_pmc(td->firstpass_ctx, num_planes); + td->firstpass_ctx = NULL; + av1_free_shared_coeff_buffer(&td->shared_coeff_buf); + av1_free_sms_tree(td); // This call ensures that the buffers allocated by tf_alloc_and_reset_data() // in prepare_tf_workers() for MT encode are freed in case an error is // encountered during temporal filtering (due to early termination // tf_dealloc_thread_data() in av1_tf_do_filtering_mt() would not be // invoked). - if (t < num_tf_workers) - tf_dealloc_data(&thread_data->td->tf_data, is_highbitdepth); + if (t < num_tf_workers) tf_dealloc_data(&td->tf_data, is_highbitdepth); // This call ensures that tpl_tmp_buffers for MT encode are freed in case of // an error during tpl. - if (t < num_tpl_workers) - tpl_dealloc_temp_buffers(&thread_data->td->tpl_tmp_buffers); + if (t < num_tpl_workers) tpl_dealloc_temp_buffers(&td->tpl_tmp_buffers); // This call ensures that the buffers in gm_data for MT encode are freed in // case of an error during gm. - gm_dealloc_data(&thread_data->td->gm_data); - av1_dealloc_mb_data(&thread_data->td->mb, num_planes); - aom_free(thread_data->td->mb.sb_stats_cache); - thread_data->td->mb.sb_stats_cache = NULL; - aom_free(thread_data->td->mb.sb_fp_stats); - thread_data->td->mb.sb_fp_stats = NULL; + gm_dealloc_data(&td->gm_data); + av1_dealloc_mb_data(&td->mb, num_planes); + aom_free(td->mb.sb_stats_cache); + td->mb.sb_stats_cache = NULL; + aom_free(td->mb.sb_fp_stats); + td->mb.sb_fp_stats = NULL; #if CONFIG_PARTITION_SEARCH_ORDER - aom_free(thread_data->td->mb.rdcost); - thread_data->td->mb.rdcost = NULL; + aom_free(td->mb.rdcost); + td->mb.rdcost = NULL; #endif - av1_free_pc_tree_recursive(thread_data->td->pc_root, num_planes, 0, 0, - SEARCH_PARTITION); - thread_data->td->pc_root = NULL; - av1_dealloc_mb_wiener_var_pred_buf(thread_data->td); - aom_free(thread_data->td); + av1_free_pc_tree_recursive(td->pc_root, num_planes, 0, 0, SEARCH_PARTITION); + td->pc_root = NULL; + av1_dealloc_mb_wiener_var_pred_buf(td); + aom_free(td); + thread_data->td = NULL; + thread_data->original_td = NULL; } } diff -Nru aom-3.8.2/av1/encoder/encoder_utils.c aom-3.9.0/av1/encoder/encoder_utils.c --- aom-3.8.2/av1/encoder/encoder_utils.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encoder_utils.c 2024-05-07 19:57:03.167000000 +0000 @@ -421,11 +421,13 @@ struct segmentation *const seg = &cpi->common.seg; unsigned char *const seg_map = cpi->enc_seg.map; const unsigned char *const active_map = cpi->active_map.map; - int i; assert(AM_SEGMENT_ID_ACTIVE == CR_SEGMENT_ID_BASE); - if (frame_is_intra_only(&cpi->common)) { + // Disable the active_maps on intra_only frames or if the + // input map for the current frame has no inactive blocks. 
+ if (frame_is_intra_only(&cpi->common) || + cpi->rc.percent_blocks_inactive == 0) { cpi->active_map.enabled = 0; cpi->active_map.update = 1; } @@ -434,8 +436,7 @@ if (cpi->active_map.enabled) { const int num_mis = cpi->common.mi_params.mi_rows * cpi->common.mi_params.mi_cols; - for (i = 0; i < num_mis; ++i) - if (seg_map[i] == AM_SEGMENT_ID_ACTIVE) seg_map[i] = active_map[i]; + memcpy(seg_map, active_map, sizeof(active_map[0]) * num_mis); av1_enable_segmentation(seg); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_SKIP); av1_enable_segfeature(seg, AM_SEGMENT_ID_INACTIVE, SEG_LVL_ALT_LF_Y_H); @@ -706,6 +707,14 @@ if (ref_frame == ALTREF_FRAME && cpi->svc.skip_mvsearch_altref) continue; } + // For RTC with superres on: golden reference only needs to be scaled + // if it was refreshed in previous frame. + if (is_one_pass_rt_params(cpi) && + cpi->oxcf.superres_cfg.enable_superres && ref_frame == GOLDEN_FRAME && + cpi->rc.frame_num_last_gf_refresh < + (int)cm->current_frame.frame_number - 1) { + continue; + } if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) { // Replace the reference buffer with a copy having a thicker border, @@ -717,7 +726,7 @@ RefCntBuffer *ref_fb = get_ref_frame_buf(cm, ref_frame); if (aom_yv12_realloc_with_new_border( &ref_fb->buf, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, cpi->image_pyramid_levels, + cm->features.byte_alignment, cpi->alloc_pyramid, num_planes) != 0) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); @@ -741,7 +750,7 @@ &new_fb->buf, cm->width, cm->height, cm->seq_params->subsampling_x, cm->seq_params->subsampling_y, cm->seq_params->use_highbitdepth, AOM_BORDER_IN_PIXELS, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) { + cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) { if (force_scaling) { // Release the reference acquired in the get_free_fb() call above. --new_fb->ref_count; @@ -1079,12 +1088,12 @@ cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, - 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } av1_setup_frame(cpi); diff -Nru aom-3.8.2/av1/encoder/encoder_utils.h aom-3.9.0/av1/encoder/encoder_utils.h --- aom-3.8.2/av1/encoder/encoder_utils.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encoder_utils.h 2024-05-07 19:57:03.170000000 +0000 @@ -83,9 +83,10 @@ static AOM_INLINE void enc_free_mi(CommonModeInfoParams *mi_params) { aom_free(mi_params->mi_alloc); mi_params->mi_alloc = NULL; + mi_params->mi_alloc_size = 0; aom_free(mi_params->mi_grid_base); mi_params->mi_grid_base = NULL; - mi_params->mi_alloc_size = 0; + mi_params->mi_grid_size = 0; aom_free(mi_params->tx_type_map); mi_params->tx_type_map = NULL; } @@ -1013,10 +1014,23 @@ } static AOM_INLINE void release_scaled_references(AV1_COMP *cpi) { - // TODO(isbs): only refresh the necessary frames, rather than all of them + // Scaled references should only need to be released under certain conditions: + // if the reference will be updated, or if the scaled reference has same + // resolution. 
For now only apply this to Golden for non-svc RTC mode. + AV1_COMMON *const cm = &cpi->common; + const bool refresh_golden = (cpi->refresh_frame.golden_frame) ? 1 : 0; + bool release_golden = true; for (int i = 0; i < INTER_REFS_PER_FRAME; ++i) { RefCntBuffer *const buf = cpi->scaled_ref_buf[i]; - if (buf != NULL) { + const int golden_ref = (i == GOLDEN_FRAME - 1); + if (golden_ref && is_one_pass_rt_params(cpi) && !cpi->ppi->use_svc && + buf != NULL) { + const RefCntBuffer *const ref = get_ref_frame_buf(cm, GOLDEN_FRAME); + const bool same_resoln = buf->buf.y_crop_width == ref->buf.y_crop_width && + buf->buf.y_crop_height == ref->buf.y_crop_height; + release_golden = refresh_golden || same_resoln; + } + if (buf != NULL && (!golden_ref || (golden_ref && release_golden))) { --buf->ref_count; cpi->scaled_ref_buf[i] = NULL; } diff -Nru aom-3.8.2/av1/encoder/encodetxb.c aom-3.9.0/av1/encoder/encodetxb.c --- aom-3.8.2/av1/encoder/encodetxb.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/encodetxb.c 2024-05-07 19:57:03.172000000 +0000 @@ -76,9 +76,13 @@ void av1_free_txb_buf(AV1_COMP *cpi) { CoeffBufferPool *coeff_buf_pool = &cpi->coeff_buffer_pool; aom_free(cpi->coeff_buffer_base); + cpi->coeff_buffer_base = NULL; aom_free(coeff_buf_pool->tcoeff); + coeff_buf_pool->tcoeff = NULL; aom_free(coeff_buf_pool->eobs); + coeff_buf_pool->eobs = NULL; aom_free(coeff_buf_pool->entropy_ctx); + coeff_buf_pool->entropy_ctx = NULL; } static void write_golomb(aom_writer *w, int level) { diff -Nru aom-3.8.2/av1/encoder/ethread.c aom-3.9.0/av1/encoder/ethread.c --- aom-3.8.2/av1/encoder/ethread.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/ethread.c 2024-05-07 19:57:03.175000000 +0000 @@ -10,6 +10,9 @@ */ #include +#include + +#include "aom_util/aom_pthread.h" #include "av1/common/warped_motion.h" #include "av1/common/thread_common.h" @@ -151,7 +154,13 @@ if (sig) { pthread_mutex_lock(&row_mt_sync->mutex_[r]); - row_mt_sync->num_finished_cols[r] = cur; + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
+ row_mt_sync->num_finished_cols[r] = + AOMMAX(row_mt_sync->num_finished_cols[r], cur); pthread_cond_signal(&row_mt_sync->cond_[r]); pthread_mutex_unlock(&row_mt_sync->mutex_[r]); @@ -246,7 +255,6 @@ row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, max_rows); - this_tile->row_ctx = NULL; if (alloc_row_ctx) { assert(max_cols > 0); const int num_row_ctx = AOMMAX(1, (max_cols - 1)); @@ -261,13 +269,9 @@ cm, enc_row_mt->num_tile_cols_done, aom_malloc(sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows)); - enc_row_mt->allocated_tile_cols = tile_cols; - enc_row_mt->allocated_tile_rows = tile_rows; enc_row_mt->allocated_rows = max_rows; enc_row_mt->allocated_cols = max_cols - 1; enc_row_mt->allocated_sb_rows = sb_rows; - enc_row_mt->row_mt_exit = false; - enc_row_mt->firstpass_mt_exit = false; } void av1_row_mt_mem_dealloc(AV1_COMP *cpi) { @@ -284,15 +288,16 @@ av1_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); - if (cpi->oxcf.algo_cfg.cdf_update_mode) aom_free(this_tile->row_ctx); + if (cpi->oxcf.algo_cfg.cdf_update_mode) { + aom_free(this_tile->row_ctx); + this_tile->row_ctx = NULL; + } } } aom_free(enc_row_mt->num_tile_cols_done); enc_row_mt->num_tile_cols_done = NULL; enc_row_mt->allocated_rows = 0; enc_row_mt->allocated_cols = 0; - enc_row_mt->allocated_tile_cols = 0; - enc_row_mt->allocated_tile_rows = 0; enc_row_mt->allocated_sb_rows = 0; } @@ -574,6 +579,11 @@ } } +static bool lpf_mt_with_enc_enabled(int pipeline_lpf_mt_with_enc, + const int filter_level[2]) { + return pipeline_lpf_mt_with_enc && (filter_level[0] || filter_level[1]); +} + static int enc_row_mt_worker_hook(void *arg1, void *unused) { EncWorkerData *const thread_data = (EncWorkerData *)arg1; AV1_COMP *const cpi = thread_data->cpi; @@ -588,6 +598,9 @@ AV1LfSync *const lf_sync = thread_data->lf_sync; MACROBLOCKD *const xd = &thread_data->td->mb.e_mbd; xd->error_info = error_info; + AV1_COMMON *volatile const cm = &cpi->common; + volatile const bool do_pipelined_lpf_mt_with_enc = lpf_mt_with_enc_enabled( + cpi->mt_info.pipeline_lpf_mt_with_enc, cm->lf.filter_level); // The jmp_buf is valid only for the duration of the function that calls // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 @@ -604,7 +617,7 @@ #endif set_encoding_done(cpi); - if (cpi->mt_info.pipeline_lpf_mt_with_enc) { + if (do_pipelined_lpf_mt_with_enc) { #if CONFIG_MULTITHREAD pthread_mutex_lock(lf_sync->job_mutex); lf_sync->lf_mt_exit = true; @@ -617,7 +630,6 @@ } error_info->setjmp = 1; - AV1_COMMON *const cm = &cpi->common; const int mib_size_log2 = cm->seq_params->mib_size_log2; int cur_tile_id = enc_row_mt->thread_id_to_tile_id[thread_id]; @@ -717,9 +729,7 @@ pthread_mutex_unlock(enc_row_mt_mutex_); #endif } - if (cpi->mt_info.pipeline_lpf_mt_with_enc && - (cm->lf.filter_level[PLANE_TYPE_Y] || - cm->lf.filter_level[PLANE_TYPE_UV])) { + if (do_pipelined_lpf_mt_with_enc) { // Loop-filter a superblock row if encoding of the current and next // superblock row is complete. // TODO(deepa.kg @ittiam.com) Evaluate encoder speed by interleaving @@ -831,6 +841,11 @@ AV1_COMMON *const cm = &cpi->common; MultiThreadInfo *const mt_info = &cpi->mt_info; + if (setjmp(cm->error->jmp)) { + cm->error->setjmp = 0; + aom_internal_error_copy(&cpi->ppi->error, cm->error); + } + cm->error->setjmp = 1; // Initialize enc row MT object. 
if (is_first_pass || cpi->oxcf.row_mt == 1) { AV1EncRowMultiThreadInfo *enc_row_mt = &mt_info->enc_row_mt; @@ -892,7 +907,6 @@ aom_malloc(sizeof(*(tpl_row_mt->mutex_)))); if (tpl_row_mt->mutex_) pthread_mutex_init(tpl_row_mt->mutex_, NULL); } - tpl_row_mt->tpl_mt_exit = false; #if !CONFIG_REALTIME_ONLY if (is_restoration_used(cm)) { @@ -919,6 +933,7 @@ if (pack_bs_sync->mutex_) pthread_mutex_init(pack_bs_sync->mutex_, NULL); } } + cm->error->setjmp = 0; } #endif // CONFIG_MULTITHREAD @@ -951,48 +966,48 @@ if (i > 0) { // Allocate thread data. - AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td, - aom_memalign(32, sizeof(*thread_data->td))); - av1_zero(*thread_data->td); - thread_data->original_td = thread_data->td; + ThreadData *td; + AOM_CHECK_MEM_ERROR(&ppi->error, td, aom_memalign(32, sizeof(*td))); + av1_zero(*td); + thread_data->original_td = thread_data->td = td; // Set up shared coeff buffers. - av1_setup_shared_coeff_buffer( - &ppi->seq_params, &thread_data->td->shared_coeff_buf, &ppi->error); - AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->tmp_conv_dst, - aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * - sizeof(*thread_data->td->tmp_conv_dst))); + av1_setup_shared_coeff_buffer(&ppi->seq_params, &td->shared_coeff_buf, + &ppi->error); + AOM_CHECK_MEM_ERROR(&ppi->error, td->tmp_conv_dst, + aom_memalign(32, MAX_SB_SIZE * MAX_SB_SIZE * + sizeof(*td->tmp_conv_dst))); if (i < p_mt_info->num_mod_workers[MOD_FP]) { // Set up firstpass PICK_MODE_CONTEXT. - thread_data->td->firstpass_ctx = av1_alloc_pmc( - ppi->cpi, BLOCK_16X16, &thread_data->td->shared_coeff_buf); - if (!thread_data->td->firstpass_ctx) + td->firstpass_ctx = + av1_alloc_pmc(ppi->cpi, BLOCK_16X16, &td->shared_coeff_buf); + if (!td->firstpass_ctx) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate PICK_MODE_CONTEXT"); } if (!is_first_pass && i < num_enc_workers) { // Set up sms_tree. - av1_setup_sms_tree(ppi->cpi, thread_data->td); + if (av1_setup_sms_tree(ppi->cpi, td)) { + aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, + "Failed to allocate SMS tree"); + } for (int x = 0; x < 2; x++) for (int y = 0; y < 2; y++) AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->hash_value_buffer[x][y], - (uint32_t *)aom_malloc( - AOM_BUFFER_SIZE_FOR_BLOCK_HASH * - sizeof(*thread_data->td->hash_value_buffer[0][0]))); + &ppi->error, td->hash_value_buffer[x][y], + (uint32_t *)aom_malloc(AOM_BUFFER_SIZE_FOR_BLOCK_HASH * + sizeof(*td->hash_value_buffer[0][0]))); // Allocate frame counters in thread data. - AOM_CHECK_MEM_ERROR(&ppi->error, thread_data->td->counts, - aom_calloc(1, sizeof(*thread_data->td->counts))); + AOM_CHECK_MEM_ERROR(&ppi->error, td->counts, + aom_calloc(1, sizeof(*td->counts))); // Allocate buffers used by palette coding mode. - AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->palette_buffer, - aom_memalign(16, sizeof(*thread_data->td->palette_buffer))); + AOM_CHECK_MEM_ERROR(&ppi->error, td->palette_buffer, + aom_memalign(16, sizeof(*td->palette_buffer))); // The buffers 'tmp_pred_bufs[]', 'comp_rd_buffer' and 'obmc_buffer' are // used in inter frames to store intermediate inter mode prediction @@ -1000,26 +1015,23 @@ // memory allocations for these buffers are avoided for allintra // encoding mode. 
if (ppi->cpi->oxcf.kf_cfg.key_freq_max != 0) { - alloc_obmc_buffers(&thread_data->td->obmc_buffer, &ppi->error); + alloc_obmc_buffers(&td->obmc_buffer, &ppi->error); - alloc_compound_type_rd_buffers(&ppi->error, - &thread_data->td->comp_rd_buffer); + alloc_compound_type_rd_buffers(&ppi->error, &td->comp_rd_buffer); for (int j = 0; j < 2; ++j) { AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->tmp_pred_bufs[j], - aom_memalign(32, - 2 * MAX_MB_PLANE * MAX_SB_SQUARE * - sizeof(*thread_data->td->tmp_pred_bufs[j]))); + &ppi->error, td->tmp_pred_bufs[j], + aom_memalign(32, 2 * MAX_MB_PLANE * MAX_SB_SQUARE * + sizeof(*td->tmp_pred_bufs[j]))); } } if (is_gradient_caching_for_hog_enabled(ppi->cpi)) { const int plane_types = PLANE_TYPES >> ppi->seq_params.monochrome; - AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->pixel_gradient_info, - aom_malloc(sizeof(*thread_data->td->pixel_gradient_info) * - plane_types * MAX_SB_SQUARE)); + AOM_CHECK_MEM_ERROR(&ppi->error, td->pixel_gradient_info, + aom_malloc(sizeof(*td->pixel_gradient_info) * + plane_types * MAX_SB_SQUARE)); } if (is_src_var_for_4x4_sub_blocks_caching_enabled(ppi->cpi)) { @@ -1028,18 +1040,17 @@ mi_size_wide[sb_size] * mi_size_high[sb_size]; AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->src_var_info_of_4x4_sub_blocks, - aom_malloc( - sizeof(*thread_data->td->src_var_info_of_4x4_sub_blocks) * - mi_count_in_sb)); + &ppi->error, td->src_var_info_of_4x4_sub_blocks, + aom_malloc(sizeof(*td->src_var_info_of_4x4_sub_blocks) * + mi_count_in_sb)); } if (ppi->cpi->sf.part_sf.partition_search_type == VAR_BASED_PARTITION) { const int num_64x64_blocks = (ppi->seq_params.sb_size == BLOCK_64X64) ? 1 : 4; AOM_CHECK_MEM_ERROR( - &ppi->error, thread_data->td->vt64x64, - aom_malloc(sizeof(*thread_data->td->vt64x64) * num_64x64_blocks)); + &ppi->error, td->vt64x64, + aom_malloc(sizeof(*td->vt64x64) * num_64x64_blocks)); } } } @@ -1076,7 +1087,7 @@ &ppi->error, p_mt_info->tile_thr_data, aom_calloc(num_workers, sizeof(*p_mt_info->tile_thr_data))); - for (int i = num_workers - 1; i >= 0; i--) { + for (int i = 0; i < num_workers; ++i) { AVxWorker *const worker = &p_mt_info->workers[i]; EncWorkerData *const thread_data = &p_mt_info->tile_thr_data[i]; @@ -1247,6 +1258,9 @@ return workers_per_frame; } +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared); + // Prepare level 1 workers. This function is only called for // parallel_frame_count > 1. This function populates the mt_info structure of // frame level contexts appropriately by dividing the total number of available @@ -1262,17 +1276,30 @@ PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; int num_workers = p_mt_info->num_workers; - int frame_idx = 0; - int i = 0; + volatile int frame_idx = 0; + volatile int i = 0; while (i < num_workers) { // Assign level 1 worker AVxWorker *frame_worker = p_mt_info->p_workers[frame_idx] = &p_mt_info->workers[i]; AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; MultiThreadInfo *mt_info = &cur_cpi->mt_info; - AV1_COMMON *const cm = &cur_cpi->common; - const int num_planes = av1_num_planes(cm); + // This 'aom_internal_error_info' pointer is not derived from the local + // pointer ('AV1_COMMON *const cm') to silence the compiler warning + // "variable 'cm' might be clobbered by 'longjmp' or 'vfork' [-Wclobbered]". + struct aom_internal_error_info *const error = cur_cpi->common.error; + + // The jmp_buf is valid only within the scope of the function that calls + // setjmp(). 
Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error->jmp)) { + error->setjmp = 0; + restore_workers_after_fpmt(ppi, parallel_frame_count, i); + aom_internal_error_copy(&ppi->error, error); + } + error->setjmp = 1; + AV1_COMMON *const cm = &cur_cpi->common; // Assign start of level 2 worker pool mt_info->workers = &p_mt_info->workers[i]; mt_info->tile_thr_data = &p_mt_info->tile_thr_data[i]; @@ -1281,13 +1308,14 @@ num_workers - i, parallel_frame_count - frame_idx); for (int j = MOD_FP; j < NUM_MT_MODULES; j++) { mt_info->num_mod_workers[j] = - AOMMIN(mt_info->num_workers, ppi->p_mt_info.num_mod_workers[j]); + AOMMIN(mt_info->num_workers, p_mt_info->num_mod_workers[j]); } - if (ppi->p_mt_info.cdef_worker != NULL) { - mt_info->cdef_worker = &ppi->p_mt_info.cdef_worker[i]; + if (p_mt_info->cdef_worker != NULL) { + mt_info->cdef_worker = &p_mt_info->cdef_worker[i]; // Back up the original cdef_worker pointers. mt_info->restore_state_buf.cdef_srcbuf = mt_info->cdef_worker->srcbuf; + const int num_planes = av1_num_planes(cm); for (int plane = 0; plane < num_planes; plane++) mt_info->restore_state_buf.cdef_colbuf[plane] = mt_info->cdef_worker->colbuf[plane]; @@ -1308,6 +1336,8 @@ } #endif + i += mt_info->num_workers; + // At this stage, the thread specific CDEF buffers for the current frame's // 'common' and 'cdef_sync' only need to be allocated. 'cdef_worker' has // already been allocated across parallel frames. @@ -1320,7 +1350,7 @@ ? first_cpi_data : &ppi->parallel_frames_data[frame_idx - 1]; frame_idx++; - i += mt_info->num_workers; + error->setjmp = 0; } p_mt_info->p_num_workers = parallel_frame_count; } @@ -1340,25 +1370,24 @@ } // Restore worker states after parallel encode. -static AOM_INLINE void restore_workers_after_fpmt(AV1_PRIMARY *ppi, - int parallel_frame_count) { +static AOM_INLINE void restore_workers_after_fpmt( + AV1_PRIMARY *ppi, int parallel_frame_count, int num_fpmt_workers_prepared) { assert(parallel_frame_count <= ppi->num_fp_contexts && parallel_frame_count > 1); (void)parallel_frame_count; PrimaryMultiThreadInfo *const p_mt_info = &ppi->p_mt_info; - int num_workers = p_mt_info->num_workers; int frame_idx = 0; int i = 0; - while (i < num_workers) { + while (i < num_fpmt_workers_prepared) { AV1_COMP *cur_cpi = ppi->parallel_cpi[frame_idx]; MultiThreadInfo *mt_info = &cur_cpi->mt_info; const AV1_COMMON *const cm = &cur_cpi->common; const int num_planes = av1_num_planes(cm); // Restore the original cdef_worker pointers. - if (ppi->p_mt_info.cdef_worker != NULL) { + if (p_mt_info->cdef_worker != NULL) { mt_info->cdef_worker->srcbuf = mt_info->restore_state_buf.cdef_srcbuf; for (int plane = 0; plane < num_planes; plane++) mt_info->cdef_worker->colbuf[plane] = @@ -1388,7 +1417,7 @@ int num_workers = ppi->p_mt_info.p_num_workers; int had_error = 0; // Points to error in the earliest display order frame in the parallel set. - const struct aom_internal_error_info *error; + const struct aom_internal_error_info *error = NULL; // Encoding ends. 
for (int i = num_workers - 1; i >= 0; --i) { @@ -1399,10 +1428,10 @@ } } - restore_workers_after_fpmt(ppi, frames_in_parallel_set); + restore_workers_after_fpmt(ppi, frames_in_parallel_set, + ppi->p_mt_info.num_workers); - if (had_error) - aom_internal_error(&ppi->error, error->error_code, "%s", error->detail); + if (had_error) aom_internal_error_copy(&ppi->error, error); } static int get_compressed_data_hook(void *arg1, void *arg2) { @@ -1416,8 +1445,8 @@ // This function encodes the raw frame data for each frame in parallel encode // set, and outputs the frame bit stream to the designated buffers. -int av1_compress_parallel_frames(AV1_PRIMARY *const ppi, - AV1_COMP_DATA *const first_cpi_data) { +void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data) { // Bitmask for the frame buffers referenced by cpi->scaled_ref_buf // corresponding to frames in the current parallel encode set. int ref_buffers_used_map = 0; @@ -1435,7 +1464,6 @@ } av1_decrement_ref_counts_fpmt(ppi->cpi->common.buffer_pool, ref_buffers_used_map); - return AOM_CODEC_OK; } static AOM_INLINE void launch_workers(MultiThreadInfo *const mt_info, @@ -1472,9 +1500,7 @@ } } - if (had_error) - aom_internal_error(cm->error, error_info.error_code, "%s", - error_info.detail); + if (had_error) aom_internal_error_copy(cm->error, &error_info); // Restore xd->error_info of the main thread back to cm->error so that the // multithreaded code, when executed using a single thread, has a valid @@ -1648,13 +1674,10 @@ thread_data->td = &cpi->td; } else { thread_data->td = thread_data->original_td; - } - - if (thread_data->td != &cpi->td) { // Before encoding a frame, copy the thread data from cpi. thread_data->td->mb = cpi->td.mb; - av1_alloc_src_diff_buf(cm, &thread_data->td->mb); } + av1_alloc_src_diff_buf(cm, &thread_data->td->mb); } } #endif @@ -1856,8 +1879,9 @@ const int plane_start = 0; const int plane_end = av1_num_planes(cm); int planes_to_lf[MAX_MB_PLANE]; - if ((lf->filter_level[PLANE_TYPE_Y] || lf->filter_level[PLANE_TYPE_UV]) && - check_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end)) { + if (lpf_mt_with_enc_enabled(cpi->mt_info.pipeline_lpf_mt_with_enc, + lf->filter_level)) { + set_planes_to_loop_filter(lf, planes_to_lf, plane_start, plane_end); int lpf_opt_level = get_lpf_opt_level(&cpi->sf); assert(lpf_opt_level == 2); @@ -1923,6 +1947,7 @@ sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); memset(enc_row_mt->num_tile_cols_done, 0, sizeof(*enc_row_mt->num_tile_cols_done) * sb_rows_in_frame); + enc_row_mt->row_mt_exit = false; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { @@ -2001,6 +2026,7 @@ memset(thread_id_to_tile_id, -1, sizeof(*thread_id_to_tile_id) * MAX_NUM_THREADS); + enc_row_mt->firstpass_mt_exit = false; for (int tile_row = 0; tile_row < tile_rows; tile_row++) { for (int tile_col = 0; tile_col < tile_cols; tile_col++) { @@ -2082,7 +2108,13 @@ if (sig) { pthread_mutex_lock(&tpl_row_mt_sync->mutex_[r]); - tpl_row_mt_sync->num_finished_cols[r] = cur; + // When a thread encounters an error, num_finished_cols[r] is set to maximum + // column number. In this case, the AOMMAX operation here ensures that + // num_finished_cols[r] is not overwritten with a smaller value thus + // preventing the infinite waiting of threads in the relevant sync_read() + // function. 
+ tpl_row_mt_sync->num_finished_cols[r] = + AOMMAX(tpl_row_mt_sync->num_finished_cols[r], cur); pthread_cond_signal(&tpl_row_mt_sync->cond_[r]); pthread_mutex_unlock(&tpl_row_mt_sync->mutex_[r]); @@ -2299,6 +2331,7 @@ av1_tpl_alloc(tpl_sync, cm, mb_rows); } tpl_sync->num_threads_working = num_workers; + mt_info->tpl_row_mt.tpl_mt_exit = false; // Initialize cur_mb_col to -1 for all MB rows. memset(tpl_sync->num_finished_cols, -1, @@ -2713,6 +2746,28 @@ } } +static void set_mb_wiener_var_calc_done(AV1_COMP *const cpi) { + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const BLOCK_SIZE bsize = cpi->weber_bsize; + const int mb_step = mi_size_wide[bsize]; + assert(MB_WIENER_MT_UNIT_SIZE < BLOCK_SIZES_ALL); + const int mt_unit_step = mi_size_wide[MB_WIENER_MT_UNIT_SIZE]; + const int mt_unit_cols = + (mi_params->mi_cols + (mt_unit_step >> 1)) / mt_unit_step; + const AV1EncAllIntraMultiThreadInfo *const intra_mt = &cpi->mt_info.intra_mt; + AV1EncRowMultiThreadSync *const intra_row_mt_sync = + &cpi->ppi->intra_row_mt_sync; + + // Update the wiener variance computation of every row in the frame to + // indicate that it is complete in order to avoid dependent workers waiting + // indefinitely. + for (int mi_row = 0, mt_thread_id = 0; mi_row < mi_params->mi_rows; + mi_row += mb_step, ++mt_thread_id) { + intra_mt->intra_sync_write_ptr(intra_row_mt_sync, mt_thread_id, + mt_unit_cols - 1, mt_unit_cols); + } +} + static int cal_mb_wiener_var_hook(void *arg1, void *unused) { (void)unused; EncWorkerData *const thread_data = (EncWorkerData *)arg1; @@ -2726,25 +2781,44 @@ AV1EncRowMultiThreadInfo *const enc_row_mt = &cpi->mt_info.enc_row_mt; (void)enc_row_mt; #if CONFIG_MULTITHREAD - pthread_mutex_t *enc_row_mt_mutex_ = enc_row_mt->mutex_; + pthread_mutex_t *enc_row_mt_mutex = enc_row_mt->mutex_; +#endif + + struct aom_internal_error_info *const error_info = &thread_data->error_info; + xd->error_info = error_info; + + // The jmp_buf is valid only for the duration of the function that calls + // setjmp(). Therefore, this function must reset the 'setjmp' field to 0 + // before it returns. + if (setjmp(error_info->jmp)) { + error_info->setjmp = 0; +#if CONFIG_MULTITHREAD + pthread_mutex_lock(enc_row_mt_mutex); + enc_row_mt->mb_wiener_mt_exit = true; + pthread_mutex_unlock(enc_row_mt_mutex); #endif + set_mb_wiener_var_calc_done(cpi); + return 0; + } + error_info->setjmp = 1; DECLARE_ALIGNED(32, int16_t, src_diff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, coeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, qcoeff[32 * 32]); DECLARE_ALIGNED(32, tran_low_t, dqcoeff[32 * 32]); double sum_rec_distortion = 0; double sum_est_rate = 0; - int has_jobs = 1; - while (has_jobs) { + while (1) { int current_mi_row = -1; #if CONFIG_MULTITHREAD - pthread_mutex_lock(enc_row_mt_mutex_); + pthread_mutex_lock(enc_row_mt_mutex); #endif - has_jobs = - get_next_job_allintra(intra_row_mt_sync, cpi->common.mi_params.mi_rows, - ¤t_mi_row, mb_step); + int has_jobs = enc_row_mt->mb_wiener_mt_exit + ? 0 + : get_next_job_allintra(intra_row_mt_sync, + cpi->common.mi_params.mi_rows, + ¤t_mi_row, mb_step); #if CONFIG_MULTITHREAD - pthread_mutex_unlock(enc_row_mt_mutex_); + pthread_mutex_unlock(enc_row_mt_mutex); #endif if (!has_jobs) break; // TODO(chengchen): properly accumulate the distortion and rate. 
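The thread-exit handling in the hunks above follows one pattern: a worker that hits an error sets a shared exit flag under the row-MT mutex and then publishes the last column of every row so that peers blocked in the corresponding sync_read() wake up, while sync_write() clamps with AOMMAX so that sentinel is never lowered again. A minimal stand-alone sketch of the same pattern, using hypothetical names rather than the libaom API:

#include <pthread.h>

typedef struct {
  pthread_mutex_t mutex;
  pthread_cond_t cond;
  int num_finished_cols;  /* highest column finished in this row */
  int error_exit;         /* set when any worker fails */
} row_sync_example;

/* Called by a worker after finishing column `cur` of a row. */
static void sync_write_example(row_sync_example *s, int cur) {
  pthread_mutex_lock(&s->mutex);
  /* Never lower the value: the error path may already have published the
     last column as a sentinel to release waiting threads. */
  if (cur > s->num_finished_cols) s->num_finished_cols = cur;
  pthread_cond_signal(&s->cond);
  pthread_mutex_unlock(&s->mutex);
}

/* Error path (e.g. after longjmp back into a worker hook): record the
   failure, then publish the sentinel so no peer waits forever. */
static void signal_error_and_release_example(row_sync_example *s,
                                             int last_col) {
  pthread_mutex_lock(&s->mutex);
  s->error_exit = 1;
  pthread_mutex_unlock(&s->mutex);
  sync_write_example(s, last_col);
}
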
@@ -2753,13 +2827,14 @@ &sum_est_rate, thread_data->td->wiener_tmp_pred_buf); #if CONFIG_MULTITHREAD - pthread_mutex_lock(enc_row_mt_mutex_); + pthread_mutex_lock(enc_row_mt_mutex); #endif intra_row_mt_sync->num_threads_working--; #if CONFIG_MULTITHREAD - pthread_mutex_unlock(enc_row_mt_mutex_); + pthread_mutex_unlock(enc_row_mt_mutex); #endif } + error_info->setjmp = 0; return 1; } @@ -2799,6 +2874,7 @@ intra_row_mt_sync->next_mi_row = 0; memset(intra_row_mt_sync->num_finished_cols, -1, sizeof(*intra_row_mt_sync->num_finished_cols) * mi_rows); + mt_info->enc_row_mt.mb_wiener_mt_exit = false; prepare_wiener_var_workers(cpi, cal_mb_wiener_var_hook, num_workers); launch_workers(mt_info, num_workers); @@ -3053,6 +3129,7 @@ AV1EncPackBSSync *const pack_bs_sync = &mt_info->pack_bs_sync; const uint16_t num_tiles = cm->tiles.rows * cm->tiles.cols; pack_bs_sync->next_job_idx = 0; + pack_bs_sync->pack_bs_mt_exit = false; PackBSTileOrder *const pack_bs_tile_order = pack_bs_sync->pack_bs_tile_order; // Reset tile order data of pack bitstream @@ -3188,6 +3265,7 @@ cdef_sync->end_of_frame = 0; cdef_sync->fbr = 0; cdef_sync->fbc = 0; + cdef_sync->cdef_mt_exit = false; } // Checks if a job is available. If job is available, diff -Nru aom-3.8.2/av1/encoder/ethread.h aom-3.9.0/av1/encoder/ethread.h --- aom-3.8.2/av1/encoder/ethread.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/ethread.h 2024-05-07 19:57:03.182000000 +0000 @@ -124,8 +124,8 @@ int av1_check_fpmt_config(AV1_PRIMARY *const ppi, AV1EncoderConfig *const oxcf); -int av1_compress_parallel_frames(AV1_PRIMARY *const ppi, - AV1_COMP_DATA *const first_cpi_data); +void av1_compress_parallel_frames(AV1_PRIMARY *const ppi, + AV1_COMP_DATA *const first_cpi_data); #ifdef __cplusplus } // extern "C" #endif diff -Nru aom-3.8.2/av1/encoder/firstpass.c aom-3.9.0/av1/encoder/firstpass.c --- aom-3.8.2/av1/encoder/firstpass.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/firstpass.c 2024-05-07 19:57:03.183000000 +0000 @@ -22,6 +22,7 @@ #include "aom_ports/mem.h" #include "aom_scale/aom_scale.h" #include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "av1/common/entropymv.h" #include "av1/common/quant_common.h" @@ -1106,6 +1107,7 @@ const int tile_cols = cm->tiles.cols; const int tile_rows = cm->tiles.rows; + av1_alloc_src_diff_buf(cm, &cpi->td.mb); for (int tile_row = 0; tile_row < tile_rows; ++tile_row) { for (int tile_col = 0; tile_col < tile_cols; ++tile_col) { TileDataEnc *const tile_data = @@ -1391,7 +1393,6 @@ av1_init_mode_probs(cm->fc); av1_init_mv_probs(cm); av1_initialize_rd_consts(cpi); - av1_alloc_src_diff_buf(cm, &cpi->td.mb); enc_row_mt->sync_read_ptr = av1_row_mt_sync_read_dummy; enc_row_mt->sync_write_ptr = av1_row_mt_sync_write_dummy; diff -Nru aom-3.8.2/av1/encoder/global_motion.c aom-3.9.0/av1/encoder/global_motion.c --- aom-3.8.2/av1/encoder/global_motion.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/global_motion.c 2024-05-07 19:57:03.187000000 +0000 @@ -30,83 +30,6 @@ // Border over which to compute the global motion #define ERRORADV_BORDER 0 -/* clang-format off */ -// Error metric used for global motion evaluation. -// For 8-bit input, the pixel error used to index this table will always -// be between -255 and +255. But for 10- and 12-bit input, we use interpolation -// which means that we need to support indices of -256 and +256 as well. -// Therefore, the table is offset so that logical index 0 corresponds to -// error_measure_lut[256]. 
-const int error_measure_lut[513] = { - // pow 0.7 - 16384, 16384, 16339, 16294, 16249, 16204, 16158, 16113, - 16068, 16022, 15977, 15932, 15886, 15840, 15795, 15749, - 15703, 15657, 15612, 15566, 15520, 15474, 15427, 15381, - 15335, 15289, 15242, 15196, 15149, 15103, 15056, 15010, - 14963, 14916, 14869, 14822, 14775, 14728, 14681, 14634, - 14587, 14539, 14492, 14445, 14397, 14350, 14302, 14254, - 14206, 14159, 14111, 14063, 14015, 13967, 13918, 13870, - 13822, 13773, 13725, 13676, 13628, 13579, 13530, 13481, - 13432, 13383, 13334, 13285, 13236, 13187, 13137, 13088, - 13038, 12988, 12939, 12889, 12839, 12789, 12739, 12689, - 12639, 12588, 12538, 12487, 12437, 12386, 12335, 12285, - 12234, 12183, 12132, 12080, 12029, 11978, 11926, 11875, - 11823, 11771, 11719, 11667, 11615, 11563, 11511, 11458, - 11406, 11353, 11301, 11248, 11195, 11142, 11089, 11036, - 10982, 10929, 10875, 10822, 10768, 10714, 10660, 10606, - 10552, 10497, 10443, 10388, 10333, 10279, 10224, 10168, - 10113, 10058, 10002, 9947, 9891, 9835, 9779, 9723, - 9666, 9610, 9553, 9497, 9440, 9383, 9326, 9268, - 9211, 9153, 9095, 9037, 8979, 8921, 8862, 8804, - 8745, 8686, 8627, 8568, 8508, 8449, 8389, 8329, - 8269, 8208, 8148, 8087, 8026, 7965, 7903, 7842, - 7780, 7718, 7656, 7593, 7531, 7468, 7405, 7341, - 7278, 7214, 7150, 7086, 7021, 6956, 6891, 6826, - 6760, 6695, 6628, 6562, 6495, 6428, 6361, 6293, - 6225, 6157, 6089, 6020, 5950, 5881, 5811, 5741, - 5670, 5599, 5527, 5456, 5383, 5311, 5237, 5164, - 5090, 5015, 4941, 4865, 4789, 4713, 4636, 4558, - 4480, 4401, 4322, 4242, 4162, 4080, 3998, 3916, - 3832, 3748, 3663, 3577, 3490, 3402, 3314, 3224, - 3133, 3041, 2948, 2854, 2758, 2661, 2562, 2461, - 2359, 2255, 2148, 2040, 1929, 1815, 1698, 1577, - 1452, 1323, 1187, 1045, 894, 731, 550, 339, - 0, 339, 550, 731, 894, 1045, 1187, 1323, - 1452, 1577, 1698, 1815, 1929, 2040, 2148, 2255, - 2359, 2461, 2562, 2661, 2758, 2854, 2948, 3041, - 3133, 3224, 3314, 3402, 3490, 3577, 3663, 3748, - 3832, 3916, 3998, 4080, 4162, 4242, 4322, 4401, - 4480, 4558, 4636, 4713, 4789, 4865, 4941, 5015, - 5090, 5164, 5237, 5311, 5383, 5456, 5527, 5599, - 5670, 5741, 5811, 5881, 5950, 6020, 6089, 6157, - 6225, 6293, 6361, 6428, 6495, 6562, 6628, 6695, - 6760, 6826, 6891, 6956, 7021, 7086, 7150, 7214, - 7278, 7341, 7405, 7468, 7531, 7593, 7656, 7718, - 7780, 7842, 7903, 7965, 8026, 8087, 8148, 8208, - 8269, 8329, 8389, 8449, 8508, 8568, 8627, 8686, - 8745, 8804, 8862, 8921, 8979, 9037, 9095, 9153, - 9211, 9268, 9326, 9383, 9440, 9497, 9553, 9610, - 9666, 9723, 9779, 9835, 9891, 9947, 10002, 10058, - 10113, 10168, 10224, 10279, 10333, 10388, 10443, 10497, - 10552, 10606, 10660, 10714, 10768, 10822, 10875, 10929, - 10982, 11036, 11089, 11142, 11195, 11248, 11301, 11353, - 11406, 11458, 11511, 11563, 11615, 11667, 11719, 11771, - 11823, 11875, 11926, 11978, 12029, 12080, 12132, 12183, - 12234, 12285, 12335, 12386, 12437, 12487, 12538, 12588, - 12639, 12689, 12739, 12789, 12839, 12889, 12939, 12988, - 13038, 13088, 13137, 13187, 13236, 13285, 13334, 13383, - 13432, 13481, 13530, 13579, 13628, 13676, 13725, 13773, - 13822, 13870, 13918, 13967, 14015, 14063, 14111, 14159, - 14206, 14254, 14302, 14350, 14397, 14445, 14492, 14539, - 14587, 14634, 14681, 14728, 14775, 14822, 14869, 14916, - 14963, 15010, 15056, 15103, 15149, 15196, 15242, 15289, - 15335, 15381, 15427, 15474, 15520, 15566, 15612, 15657, - 15703, 15749, 15795, 15840, 15886, 15932, 15977, 16022, - 16068, 16113, 16158, 16204, 16249, 16294, 16339, 16384, - 16384, -}; -/* clang-format on */ - int 
av1_is_enough_erroradvantage(double best_erroradvantage, int params_cost) { return best_erroradvantage < erroradv_tr && best_erroradvantage * params_cost < erroradv_prod_tr; @@ -541,6 +464,11 @@ } wm->wmtype = get_wmtype(wm); + // Recompute shear params for the refined model + // This should never fail, because we only ever consider warp-able models + if (!av1_get_shear_params(wm)) { + assert(0); + } return best_error; } diff -Nru aom-3.8.2/av1/encoder/global_motion.h aom-3.9.0/av1/encoder/global_motion.h --- aom-3.8.2/av1/encoder/global_motion.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/global_motion.h 2024-05-07 19:57:03.189000000 +0000 @@ -15,6 +15,7 @@ #include "aom/aom_integer.h" #include "aom_dsp/flow_estimation/flow_estimation.h" #include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "aom_util/aom_thread.h" #ifdef __cplusplus @@ -97,37 +98,6 @@ int height, int *inliers, int num_inliers); -extern const int error_measure_lut[513]; - -static INLINE int error_measure(int err) { - return error_measure_lut[256 + err]; -} - -#if CONFIG_AV1_HIGHBITDEPTH -static INLINE int highbd_error_measure(int err, int bd) { - const int b = bd - 8; - const int bmask = (1 << b) - 1; - const int v = (1 << b); - - // Split error into two parts and do an interpolated table lookup - // To compute the table index and interpolation value, we want to calculate - // the quotient and remainder of err / 2^b. But it is very important that - // the division must round down, and the remainder must be positive, - // ie. in the range [0, 2^b). - // - // In C, the >> and & operators do what we want, but the / and % operators - // give the wrong results for negative inputs. So we must use >> and & here. - // - // For example, if bd == 10 and err == -5, compare the results: - // (-5) >> 2 = -2, (-5) & 3 = 3 - // vs. 
(-5) / 4 = -1, (-5) % 4 = -1 - const int e1 = err >> b; - const int e2 = err & bmask; - return error_measure_lut[256 + e1] * (v - e2) + - error_measure_lut[257 + e1] * e2; -} -#endif // CONFIG_AV1_HIGHBITDEPTH - int64_t av1_segmented_frame_error(int use_hbd, int bd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, int p_width, int p_height, diff -Nru aom-3.8.2/av1/encoder/global_motion_facade.c aom-3.9.0/av1/encoder/global_motion_facade.c --- aom-3.8.2/av1/encoder/global_motion_facade.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/global_motion_facade.c 2024-05-07 19:57:03.189000000 +0000 @@ -89,6 +89,7 @@ assert(ref_buf[frame] != NULL); int bit_depth = cpi->common.seq_params->bit_depth; GlobalMotionMethod global_motion_method = default_global_motion_method; + int downsample_level = cpi->sf.gm_sf.downsample_level; int num_refinements = cpi->sf.gm_sf.num_refinement_steps; bool mem_alloc_failed = false; @@ -99,9 +100,10 @@ double best_erroradv = erroradv_tr; for (TransformationType model = FIRST_GLOBAL_TRANS_TYPE; model <= LAST_GLOBAL_TRANS_TYPE; ++model) { - if (!aom_compute_global_motion( - model, cpi->source, ref_buf[frame], bit_depth, global_motion_method, - motion_models, RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { + if (!aom_compute_global_motion(model, cpi->source, ref_buf[frame], + bit_depth, global_motion_method, + downsample_level, motion_models, + RANSAC_NUM_MOTIONS, &mem_alloc_failed)) { if (mem_alloc_failed) { aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate global motion buffers"); @@ -115,6 +117,9 @@ WarpedMotionParams tmp_wm_params; av1_convert_model_to_params(motion_models[i].params, &tmp_wm_params); + // Check that the generated model is warp-able + if (!av1_get_shear_params(&tmp_wm_params)) continue; + // Skip models that we won't use (IDENTITY or TRANSLATION) // // For IDENTITY type models, we don't need to evaluate anything because @@ -151,6 +156,14 @@ double erroradvantage = (double)warp_error / ref_frame_error; + // Check that the model signaling cost is not too high + if (!av1_is_enough_erroradvantage( + erroradvantage, + gm_get_params_cost(&tmp_wm_params, ref_params, + cm->features.allow_high_precision_mv))) { + continue; + } + if (erroradvantage < best_erroradv) { best_erroradv = erroradvantage; // Save the wm_params modified by @@ -161,34 +174,6 @@ } } } - - if (!av1_get_shear_params(&cm->global_motion[frame])) - cm->global_motion[frame] = default_warp_params; - -#if 0 - // We never choose translational models, so this code is disabled - if (cm->global_motion[frame].wmtype == TRANSLATION) { - cm->global_motion[frame].wmmat[0] = - convert_to_trans_prec(cm->features.allow_high_precision_mv, - cm->global_motion[frame].wmmat[0]) * - GM_TRANS_ONLY_DECODE_FACTOR; - cm->global_motion[frame].wmmat[1] = - convert_to_trans_prec(cm->features.allow_high_precision_mv, - cm->global_motion[frame].wmmat[1]) * - GM_TRANS_ONLY_DECODE_FACTOR; - } -#endif - - if (cm->global_motion[frame].wmtype == IDENTITY) return; - - // If the best error advantage found doesn't meet the threshold for - // this motion type, revert to IDENTITY. - if (!av1_is_enough_erroradvantage( - best_erroradv, - gm_get_params_cost(&cm->global_motion[frame], ref_params, - cm->features.allow_high_precision_mv))) { - cm->global_motion[frame] = default_warp_params; - } } // Computes global motion for the given reference frame. 
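
The global_motion_facade.c hunks above move model validation inside the per-candidate loop: each fitted model must produce valid shear parameters and pass the error-advantage test before it can replace the current best, instead of being checked once after the loop. A minimal stand-alone sketch of that acceptance test follows; the two threshold constants and the function name are illustrative stand-ins, not the library's values.

  #include <stdbool.h>
  #include <stdint.h>

  /* Illustrative thresholds; libaom derives its own erroradv limits. */
  #define ERRORADV_TR 0.65
  #define ERRORADV_PROD_TR 20000.0

  /* A candidate global-motion model is worth signalling only if its warp
   * error is a small enough fraction of the identity-model error, and that
   * advantage survives being weighted by the bit cost of coding the model
   * parameters. */
  static bool enough_error_advantage(int64_t warp_error,
                                     int64_t ref_frame_error,
                                     int params_cost) {
    if (ref_frame_error <= 0) return false;
    const double erroradv = (double)warp_error / (double)ref_frame_error;
    return erroradv < ERRORADV_TR &&
           erroradv * (double)params_cost < ERRORADV_PROD_TR;
  }

  int main(void) {
    /* e.g. warp error 600 vs identity error 1000, 90 bits of parameters */
    return enough_error_advantage(600, 1000, 90) ? 0 : 1;
  }
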
diff -Nru aom-3.8.2/av1/encoder/hash.c aom-3.9.0/av1/encoder/hash.c --- aom-3.8.2/av1/encoder/hash.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/hash.c 2024-05-07 19:57:03.195000000 +0000 @@ -10,6 +10,7 @@ */ #include "av1/encoder/hash.h" +#include "config/av1_rtcd.h" static void crc_calculator_process_data(CRC_CALCULATOR *p_crc_calculator, uint8_t *pData, uint32_t dataLength) { diff -Nru aom-3.8.2/av1/encoder/k_means_template.h aom-3.9.0/av1/encoder/k_means_template.h --- aom-3.8.2/av1/encoder/k_means_template.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/k_means_template.h 2024-05-07 19:57:03.207000000 +0000 @@ -24,6 +24,9 @@ #define RENAME_(x, y) AV1_K_MEANS_RENAME(x, y) #define RENAME(x) RENAME_(x, AV1_K_MEANS_DIM) +#define K_MEANS_RENAME_C(x, y) x##_dim##y##_c +#define RENAME_C_(x, y) K_MEANS_RENAME_C(x, y) +#define RENAME_C(x) RENAME_C_(x, AV1_K_MEANS_DIM) // Though we want to compute the smallest L2 norm, in 1 dimension, // it is equivalent to find the smallest L1 norm and then square it. @@ -41,8 +44,8 @@ #endif } -void RENAME(av1_calc_indices)(const int16_t *data, const int16_t *centroids, - uint8_t *indices, int64_t *dist, int n, int k) { +void RENAME_C(av1_calc_indices)(const int16_t *data, const int16_t *centroids, + uint8_t *indices, int64_t *dist, int n, int k) { if (dist) { *dist = 0; } @@ -149,3 +152,6 @@ } #undef RENAME_ #undef RENAME +#undef K_MEANS_RENAME_C +#undef RENAME_C_ +#undef RENAME_C diff -Nru aom-3.8.2/av1/encoder/lookahead.c aom-3.9.0/av1/encoder/lookahead.c --- aom-3.8.2/av1/encoder/lookahead.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/lookahead.c 2024-05-07 19:57:03.210000000 +0000 @@ -46,7 +46,7 @@ unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, const int border_in_pixels, int byte_alignment, int num_lap_buffers, - bool is_all_intra, int num_pyramid_levels) { + bool is_all_intra, bool alloc_pyramid) { int lag_in_frames = AOMMAX(1, depth); // For all-intra frame encoding, previous source frames are not required. 
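
The K_MEANS_RENAME_C / RENAME_C_ / RENAME_C chain added in k_means_template.h above gives the C reference implementation a distinct `_c`-suffixed name, separate from the dispatched symbol declared via AV1_K_MEANS_RENAME in palette.h. The extra level of macro indirection is what lets AV1_K_MEANS_DIM expand before token pasting. A self-contained illustration of the same trick, with invented names:

  #include <stdio.h>

  #define DIM 2
  /* Pasting happens here; x and y must already be fully expanded. */
  #define PASTE_C(x, y) x##_dim##y##_c
  /* Indirection layer: expands its arguments (DIM -> 2) before pasting.
   * Pasting DIM directly would instead produce ..._dimDIM_c. */
  #define RENAME_C_(x, y) PASTE_C(x, y)
  #define RENAME_C(x) RENAME_C_(x, DIM)

  static void RENAME_C(calc_indices)(void) {  /* defines calc_indices_dim2_c */
    puts("calc_indices_dim2_c called");
  }

  int main(void) {
    calc_indices_dim2_c();  /* the pasted name is an ordinary identifier */
    return 0;
  }
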
@@ -82,7 +82,7 @@ if (aom_realloc_frame_buffer( &ctx->buf[i].img, width, height, subsampling_x, subsampling_y, use_highbitdepth, border_in_pixels, byte_alignment, NULL, NULL, - NULL, num_pyramid_levels, 0)) { + NULL, alloc_pyramid, 0)) { goto fail; } } @@ -100,7 +100,7 @@ int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, - int num_pyramid_levels, aom_enc_frame_flags_t flags) { + bool alloc_pyramid, aom_enc_frame_flags_t flags) { int width = src->y_crop_width; int height = src->y_crop_height; int uv_width = src->uv_crop_width; @@ -124,9 +124,9 @@ height != buf->img.y_crop_height || uv_width != buf->img.uv_crop_width || uv_height != buf->img.uv_crop_height; - larger_dimensions = width > buf->img.y_width || height > buf->img.y_height || - uv_width > buf->img.uv_width || - uv_height > buf->img.uv_height; + larger_dimensions = + width > buf->img.y_crop_width || height > buf->img.y_crop_height || + uv_width > buf->img.uv_crop_width || uv_height > buf->img.uv_crop_height; assert(!larger_dimensions || new_dimensions); if (larger_dimensions) { @@ -134,11 +134,15 @@ memset(&new_img, 0, sizeof(new_img)); if (aom_alloc_frame_buffer(&new_img, width, height, subsampling_x, subsampling_y, use_highbitdepth, - AOM_BORDER_IN_PIXELS, 0, num_pyramid_levels, 0)) + AOM_BORDER_IN_PIXELS, 0, alloc_pyramid, 0)) return 1; aom_free_frame_buffer(&buf->img); buf->img = new_img; } else if (new_dimensions) { + buf->img.y_width = src->y_width; + buf->img.y_height = src->y_height; + buf->img.uv_width = src->uv_width; + buf->img.uv_height = src->uv_height; buf->img.y_crop_width = src->y_crop_width; buf->img.y_crop_height = src->y_crop_height; buf->img.uv_crop_width = src->uv_crop_width; @@ -146,7 +150,6 @@ buf->img.subsampling_x = src->subsampling_x; buf->img.subsampling_y = src->subsampling_y; } - // Partial copy not implemented yet av1_copy_and_extend_frame(src, &buf->img); buf->ts_start = ts_start; diff -Nru aom-3.8.2/av1/encoder/lookahead.h aom-3.9.0/av1/encoder/lookahead.h --- aom-3.8.2/av1/encoder/lookahead.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/lookahead.h 2024-05-07 19:57:03.210000000 +0000 @@ -70,7 +70,7 @@ unsigned int width, unsigned int height, unsigned int subsampling_x, unsigned int subsampling_y, int use_highbitdepth, unsigned int depth, const int border_in_pixels, int byte_alignment, int num_lap_buffers, - bool is_all_intra, int num_pyramid_levels); + bool is_all_intra, bool alloc_pyramid); /**\brief Destroys the lookahead stage */ @@ -85,18 +85,18 @@ * This function will copy the source image into a new framebuffer with * the expected stride/border. 
* - * \param[in] ctx Pointer to the lookahead context - * \param[in] src Pointer to the image to enqueue - * \param[in] ts_start Timestamp for the start of this frame - * \param[in] ts_end Timestamp for the end of this frame - * \param[in] use_highbitdepth Tell if HBD is used - * \param[in] num_pyramid_levels Number of pyramid levels to allocate - for each frame buffer - * \param[in] flags Flags set on this frame + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] use_highbitdepth Tell if HBD is used + * \param[in] alloc_pyramid Whether to allocate a downsampling pyramid + * for each frame buffer + * \param[in] flags Flags set on this frame */ int av1_lookahead_push(struct lookahead_ctx *ctx, const YV12_BUFFER_CONFIG *src, int64_t ts_start, int64_t ts_end, int use_highbitdepth, - int num_pyramid_levels, aom_enc_frame_flags_t flags); + bool alloc_pyramid, aom_enc_frame_flags_t flags); /**\brief Get the next source buffer to encode * diff -Nru aom-3.8.2/av1/encoder/mcomp.c aom-3.9.0/av1/encoder/mcomp.c --- aom-3.8.2/av1/encoder/mcomp.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/mcomp.c 2024-05-07 19:57:03.211000000 +0000 @@ -2153,7 +2153,7 @@ aom_free(vbuf); aom_free(src_hbuf); aom_free(src_vbuf); - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, + aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, "Failed to allocate hbuf, vbuf, src_hbuf, or src_vbuf"); } diff -Nru aom-3.8.2/av1/encoder/nonrd_pickmode.c aom-3.9.0/av1/encoder/nonrd_pickmode.c --- aom-3.8.2/av1/encoder/nonrd_pickmode.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/nonrd_pickmode.c 2024-05-07 19:57:03.233000000 +0000 @@ -577,7 +577,7 @@ struct macroblock_plane *const p = &x->plane[AOM_PLANE_Y]; const uint32_t dc_quant = p->dequant_QTX[0]; const uint32_t ac_quant = p->dequant_QTX[1]; - const int64_t dc_thr = dc_quant * dc_quant >> 6; + int64_t dc_thr = dc_quant * dc_quant >> 6; int64_t ac_thr = ac_quant * ac_quant >> 6; const int bw = b_width_log2_lookup[bsize]; const int bh = b_height_log2_lookup[bsize]; @@ -597,6 +597,11 @@ #endif + if (cpi->sf.rt_sf.increase_source_sad_thresh) { + dc_thr = dc_thr << 1; + ac_thr = ac_thr << 2; + } + for (int k = 0; k < num_blk; k++) { // Check if all ac coefficients can be quantized to zero. if (!(var_tx[k] < ac_thr || var == 0)) { @@ -626,10 +631,12 @@ const BLOCK_SIZE uv_bsize = get_plane_block_size( bsize, puvd->subsampling_x, puvd->subsampling_y); // Adjust these thresholds for UV. + const int shift_ac = cpi->sf.rt_sf.increase_source_sad_thresh ? 5 : 3; + const int shift_dc = cpi->sf.rt_sf.increase_source_sad_thresh ? 4 : 3; const int64_t uv_dc_thr = - (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> 3; + (puv->dequant_QTX[0] * puv->dequant_QTX[0]) >> shift_dc; const int64_t uv_ac_thr = - (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> 3; + (puv->dequant_QTX[1] * puv->dequant_QTX[1]) >> shift_ac; av1_enc_build_inter_predictor(cm, xd, mi_row, mi_col, NULL, bsize, plane, plane); var_uv[j] = cpi->ppi->fn_ptr[uv_bsize].vf(puv->src.buf, puv->src.stride, @@ -1762,7 +1769,7 @@ x->nonrd_prune_ref_frame_search > 2 && x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_U)] == 0 && x->color_sensitivity_sb_g[COLOR_SENS_IDX(AOM_PLANE_V)] == 0) { - int thr = (cm->width * cm->height >= 640 * 360) ? 100 : 150; + int thr = (cm->width * cm->height > RESOLUTION_288P) ? 
100 : 150; int pred = x->pred_mv_sad[LAST_FRAME] >> (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]); if (pred > thr) use_golden_ref_frame = 1; @@ -1933,11 +1940,16 @@ return; } int shift = 3; + unsigned int source_var_thr = 50; + int uv_sad_thr = 100; if (source_sad_nonrd >= kMedSad && x->source_variance > 0 && high_res) shift = 4; - if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && - cpi->rc.high_source_sad) { - shift = 6; + if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN) { + if (cpi->rc.high_source_sad) shift = 6; + if (source_sad_nonrd > kMedSad) { + source_var_thr = 1200; + uv_sad_thr = 10; + } } NOISE_LEVEL noise_level = kLow; int norm_sad = @@ -1975,7 +1987,7 @@ uv_sad >> (b_width_log2_lookup[bs] + b_height_log2_lookup[bs]); x->color_sensitivity[COLOR_SENS_IDX(plane)] = uv_sad > (y_sad >> shift) && norm_uv_sad > 40; - if (source_variance < 50 && norm_uv_sad > 100) + if (source_variance < source_var_thr && norm_uv_sad > uv_sad_thr) x->color_sensitivity[COLOR_SENS_IDX(plane)] = 1; } } @@ -2345,6 +2357,10 @@ *ref_frame2 = NONE_FRAME; } + if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP) && + (*this_mode != GLOBALMV || *ref_frame != LAST_FRAME)) + return true; + if (x->sb_me_block && *ref_frame == LAST_FRAME) { // We want to make sure to test the superblock MV: // so don't skip (return false) for NEAREST_LAST or NEAR_LAST if they @@ -3229,7 +3245,8 @@ inter_pred_params_sr.conv_params = get_conv_params(/*do_average=*/0, AOM_PLANE_Y, xd->bd); - x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad; + x->block_is_zero_sad = x->content_state_sb.source_sad_nonrd == kZeroSad || + segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP); if (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN && !x->force_zeromv_skip_for_blk && x->content_state_sb.source_sad_nonrd != kZeroSad && diff -Nru aom-3.8.2/av1/encoder/palette.h aom-3.9.0/av1/encoder/palette.h --- aom-3.8.2/av1/encoder/palette.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/palette.h 2024-05-07 19:57:03.246000000 +0000 @@ -26,7 +26,7 @@ struct macroblock; /*!\cond */ -#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim##_c +#define AV1_K_MEANS_RENAME(func, dim) func##_dim##dim void AV1_K_MEANS_RENAME(av1_k_means, 1)(const int16_t *data, int16_t *centroids, uint8_t *indices, int n, int k, diff -Nru aom-3.8.2/av1/encoder/partition_search.c aom-3.9.0/av1/encoder/partition_search.c --- aom-3.8.2/av1/encoder/partition_search.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/partition_search.c 2024-05-07 19:57:03.293000000 +0000 @@ -2144,8 +2144,9 @@ } if (tile_data->allow_update_cdf) update_stats(&cpi->common, td); } - if (cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ && mbmi->skip_txfm && - !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) + if ((cpi->oxcf.q_cfg.aq_mode == CYCLIC_REFRESH_AQ || + cpi->active_map.enabled) && + mbmi->skip_txfm && !cpi->rc.rtc_external_ratectrl && cm->seg.enabled) av1_cyclic_reset_segment_skip(cpi, x, mi_row, mi_col, bsize, dry_run); // TODO(Ravi/Remya): Move this copy function to a better logical place // This function will copy the best mode information from block @@ -2254,6 +2255,8 @@ const AQ_MODE aq_mode = cpi->oxcf.q_cfg.aq_mode; TxfmSearchInfo *txfm_info = &x->txfm_search_info; int i; + const int seg_skip = + segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP); // This is only needed for real time/allintra row-mt enabled multi-threaded // encoding with cost update frequency set to COST_UPD_TILE/COST_UPD_OFF. 
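
In the nonrd_pickmode.c hunks above, the zero-coefficient skip test compares per-block variance against thresholds derived from the squared quantizer step sizes, and the new increase_source_sad_thresh speed feature widens those thresholds. A stand-alone sketch of the arithmetic, with made-up quantizer values:

  #include <inttypes.h>
  #include <stdio.h>

  static void skip_thresholds(uint32_t dc_quant, uint32_t ac_quant,
                              int increase_source_sad_thresh,
                              int64_t *dc_thr, int64_t *ac_thr) {
    /* Thresholds scale with the square of the quantizer step size. */
    *dc_thr = ((int64_t)dc_quant * dc_quant) >> 6;
    *ac_thr = ((int64_t)ac_quant * ac_quant) >> 6;
    if (increase_source_sad_thresh) {
      *dc_thr <<= 1;  /* 2x looser DC threshold */
      *ac_thr <<= 2;  /* 4x looser AC threshold */
    }
  }

  int main(void) {
    int64_t dc, ac;
    skip_thresholds(1024, 1152, 1, &dc, &ac);  /* illustrative quant steps */
    printf("dc_thr=%" PRId64 " ac_thr=%" PRId64 "\n", dc, ac);
    return 0;
  }
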
@@ -2276,15 +2279,17 @@ } for (i = 0; i < 2; ++i) pd[i].color_index_map = ctx->color_index_map[i]; - x->force_zeromv_skip_for_blk = - get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); - - // Source variance may be already compute at superblock level, so no need - // to recompute, unless bsize < sb_size or source_variance is not yet set. - if (!x->force_zeromv_skip_for_blk && - (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) - x->source_variance = av1_get_perpixel_variance_facade( - cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + if (!seg_skip) { + x->force_zeromv_skip_for_blk = + get_force_zeromv_skip_flag_for_blk(cpi, x, bsize); + + // Source variance may be already compute at superblock level, so no need + // to recompute, unless bsize < sb_size or source_variance is not yet set. + if (!x->force_zeromv_skip_for_blk && + (x->source_variance == UINT_MAX || bsize < cm->seq_params->sb_size)) + x->source_variance = av1_get_perpixel_variance_facade( + cpi, xd, &x->plane[0].src, bsize, AOM_PLANE_Y); + } // Save rdmult before it might be changed, so it can be restored later. const int orig_rdmult = x->rdmult; @@ -2305,16 +2310,13 @@ #if CONFIG_COLLECT_COMPONENT_TIMING start_timing(cpi, nonrd_pick_inter_mode_sb_time); #endif - if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) { - RD_STATS invalid_rd; - av1_invalid_rd_stats(&invalid_rd); - // TODO(kyslov): add av1_nonrd_pick_inter_mode_sb_seg_skip - av1_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, mi_row, mi_col, - rd_cost, bsize, ctx, - invalid_rd.rdcost); - } else { - av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx); + if (seg_skip) { + x->force_zeromv_skip_for_blk = 1; + // TODO(marpan): Consider adding a function for nonrd: + // av1_nonrd_pick_inter_mode_sb_seg_skip(), instead of setting + // x->force_zeromv_skip flag and entering av1_nonrd_pick_inter_mode_sb(). } + av1_nonrd_pick_inter_mode_sb(cpi, tile_data, x, rd_cost, bsize, ctx); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, nonrd_pick_inter_mode_sb_time); #endif @@ -2322,10 +2324,12 @@ if (cpi->sf.rt_sf.skip_cdef_sb) { // cdef_strength is initialized to 1 which means skip_cdef, and is updated // here. Check to see is skipping cdef is allowed. + // Always allow cdef_skip for seg_skip = 1. const int allow_cdef_skipping = - cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && - !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || - x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)]); + seg_skip || + (cpi->rc.frames_since_key > 10 && !cpi->rc.high_source_sad && + !(x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_U)] || + x->color_sensitivity[COLOR_SENS_IDX(AOM_PLANE_V)])); // Find the corresponding 64x64 block. It'll be the 128x128 block if that's // the block size. @@ -4233,6 +4237,54 @@ } } +// Returns true if either of the left and top neighbor blocks is larger than +// the current block; false otherwise. 
+static AOM_INLINE bool is_neighbor_blk_larger_than_cur_blk( + const MACROBLOCKD *xd, BLOCK_SIZE bsize) { + const int cur_blk_area = (block_size_high[bsize] * block_size_wide[bsize]); + if (xd->left_available) { + const BLOCK_SIZE left_bsize = xd->left_mbmi->bsize; + if (block_size_high[left_bsize] * block_size_wide[left_bsize] > + cur_blk_area) + return true; + } + + if (xd->up_available) { + const BLOCK_SIZE above_bsize = xd->above_mbmi->bsize; + if (block_size_high[above_bsize] * block_size_wide[above_bsize] > + cur_blk_area) + return true; + } + return false; +} + +static AOM_INLINE void prune_rect_part_using_none_pred_mode( + const MACROBLOCKD *xd, PartitionSearchState *part_state, + PREDICTION_MODE mode, BLOCK_SIZE bsize) { + if (mode == DC_PRED || mode == SMOOTH_PRED) { + // If the prediction mode of NONE partition is either DC_PRED or + // SMOOTH_PRED, it indicates that the current block has less variation. In + // this case, HORZ and VERT partitions are pruned if at least one of left + // and top neighbor blocks is larger than the current block. + if (is_neighbor_blk_larger_than_cur_blk(xd, bsize)) { + part_state->prune_rect_part[HORZ] = 1; + part_state->prune_rect_part[VERT] = 1; + } + } else if (mode == D67_PRED || mode == V_PRED || mode == D113_PRED) { + // If the prediction mode chosen by NONE partition is close to 90 degrees, + // it implies a dominant vertical pattern, and the chance of choosing a + // vertical rectangular partition is high. Hence, horizontal partition is + // pruned in these cases. + part_state->prune_rect_part[HORZ] = 1; + } else if (mode == D157_PRED || mode == H_PRED || mode == D203_PRED) { + // If the prediction mode chosen by NONE partition is close to 180 degrees, + // it implies a dominant horizontal pattern, and the chance of choosing a + // horizontal rectangular partition is high. Hence, vertical partition is + // pruned in these cases. + part_state->prune_rect_part[VERT] = 1; + } +} + // PARTITION_NONE search. static void none_partition_search( AV1_COMP *const cpi, ThreadData *td, TileDataEnc *tile_data, MACROBLOCK *x, @@ -4322,6 +4374,10 @@ part_search_state, best_rdc, pb_source_variance); } + + if (cpi->sf.part_sf.prune_rect_part_using_none_pred_mode) + prune_rect_part_using_none_pred_mode(&x->e_mbd, part_search_state, + pc_tree->none->mic.mode, bsize); } av1_restore_context(x, x_ctx, mi_row, mi_col, bsize, av1_num_planes(cm)); } diff -Nru aom-3.8.2/av1/encoder/pass2_strategy.c aom-3.9.0/av1/encoder/pass2_strategy.c --- aom-3.8.2/av1/encoder/pass2_strategy.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/pass2_strategy.c 2024-05-07 19:57:03.312000000 +0000 @@ -158,28 +158,12 @@ return (int)max_bits; } -static const double q_pow_term[(QINDEX_RANGE >> 5) + 1] = { 0.65, 0.70, 0.75, - 0.80, 0.85, 0.90, - 0.95, 0.95, 0.95 }; -#define ERR_DIVISOR 96.0 -static double calc_correction_factor(double err_per_mb, int q) { - const double error_term = err_per_mb / ERR_DIVISOR; - const int index = q >> 5; - // Adjustment to power term based on qindex - const double power_term = - q_pow_term[index] + - (((q_pow_term[index + 1] - q_pow_term[index]) * (q % 32)) / 32.0); - assert(error_term >= 0.0); - return fclamp(pow(error_term, power_term), 0.05, 5.0); -} - // Based on history adjust expectations of bits per macroblock. 
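
The prune_rect_part_using_none_pred_mode hunk above keys the pruning off the intra mode picked for the unsplit (NONE) partition: DC_PRED or SMOOTH_PRED indicates a flat block, while directional modes reveal the dominant edge orientation, so the rectangular split that cuts across that orientation is skipped. A stand-alone restatement of the rule; the enum below is illustrative and is not libaom's PREDICTION_MODE.

  #include <stdbool.h>
  #include <stdio.h>

  /* Illustrative direction tags (nominal AV1 angles), not real mode values. */
  typedef enum { DIR_DC, DIR_SMOOTH, DIR_D67, DIR_V90, DIR_D113,
                 DIR_D157, DIR_H180, DIR_D203, DIR_OTHER } IntraDir;

  typedef struct { bool prune_horz, prune_vert; } RectPrune;

  static RectPrune prune_from_none_mode(IntraDir m, bool neighbor_is_larger) {
    RectPrune p = { false, false };
    if (m == DIR_DC || m == DIR_SMOOTH) {
      /* Low-variation block: if a neighbour already settled on a larger
       * block, rectangular splits are unlikely to win, so prune both. */
      p.prune_horz = p.prune_vert = neighbor_is_larger;
    } else if (m == DIR_D67 || m == DIR_V90 || m == DIR_D113) {
      p.prune_horz = true;  /* near-vertical pattern: skip horizontal split */
    } else if (m == DIR_D157 || m == DIR_H180 || m == DIR_D203) {
      p.prune_vert = true;  /* near-horizontal pattern: skip vertical split */
    }
    return p;
  }

  int main(void) {
    const RectPrune p = prune_from_none_mode(DIR_V90, false);
    printf("prune_horz=%d prune_vert=%d\n", p.prune_horz, p.prune_vert);
    return 0;
  }
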
static void twopass_update_bpm_factor(AV1_COMP *cpi, int rate_err_tol) { TWO_PASS *const twopass = &cpi->ppi->twopass; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; // Based on recent history adjust expectations of bits per macroblock. - double damp_fac = AOMMAX(5.0, rate_err_tol / 10.0); double rate_err_factor = 1.0; const double adj_limit = AOMMAX(0.2, (double)(100 - rate_err_tol) / 200.0); const double min_fac = 1.0 - adj_limit; @@ -214,9 +198,7 @@ } int err_estimate = p_rc->rate_error_estimate; - int64_t bits_left = twopass->bits_left; int64_t total_actual_bits = p_rc->total_actual_bits; - int64_t bits_off_target = p_rc->vbr_bits_off_target; double rolling_arf_group_actual_bits = (double)twopass->rolling_arf_group_actual_bits; double rolling_arf_group_target_bits = @@ -231,10 +213,6 @@ : 0; total_actual_bits = simulate_parallel_frame ? p_rc->temp_total_actual_bits : p_rc->total_actual_bits; - bits_off_target = simulate_parallel_frame ? p_rc->temp_vbr_bits_off_target - : p_rc->vbr_bits_off_target; - bits_left = - simulate_parallel_frame ? p_rc->temp_bits_left : twopass->bits_left; rolling_arf_group_target_bits = (double)(simulate_parallel_frame ? p_rc->temp_rolling_arf_group_target_bits @@ -247,21 +225,21 @@ : p_rc->rate_error_estimate; #endif - if (p_rc->bits_off_target && total_actual_bits > 0) { - if (cpi->ppi->lap_enabled) { - rate_err_factor = rolling_arf_group_actual_bits / - DOUBLE_DIVIDE_CHECK(rolling_arf_group_target_bits); + if ((p_rc->bits_off_target && total_actual_bits > 0) && + (rolling_arf_group_target_bits >= 1.0)) { + if (rolling_arf_group_actual_bits > rolling_arf_group_target_bits) { + double error_fraction = + (rolling_arf_group_actual_bits - rolling_arf_group_target_bits) / + rolling_arf_group_target_bits; + error_fraction = (error_fraction > 1.0) ? 1.0 : error_fraction; + rate_err_factor = 1.0 + error_fraction; } else { - rate_err_factor = 1.0 - ((double)(bits_off_target) / - AOMMAX(total_actual_bits, bits_left)); + double error_fraction = + (rolling_arf_group_target_bits - rolling_arf_group_actual_bits) / + rolling_arf_group_target_bits; + rate_err_factor = 1.0 - error_fraction; } - // Adjustment is damped if this is 1 pass with look ahead processing - // (as there are only ever a few frames of data) and for all but the first - // GOP in normal two pass. 
- if ((twopass->bpm_factor != 1.0) || cpi->ppi->lap_enabled) { - rate_err_factor = 1.0 + ((rate_err_factor - 1.0) / damp_fac); - } rate_err_factor = AOMMAX(min_fac, AOMMIN(max_fac, rate_err_factor)); } @@ -270,36 +248,38 @@ if ((rate_err_factor < 1.0 && err_estimate >= 0) || (rate_err_factor > 1.0 && err_estimate <= 0)) { twopass->bpm_factor *= rate_err_factor; - if (rate_err_tol >= 100) { - twopass->bpm_factor = - AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); - } else { - twopass->bpm_factor = AOMMAX(0.1, AOMMIN(10.0, twopass->bpm_factor)); - } + twopass->bpm_factor = AOMMAX(min_fac, AOMMIN(max_fac, twopass->bpm_factor)); } } -static int qbpm_enumerator(int rate_err_tol) { - return 1200000 + ((300000 * AOMMIN(75, AOMMAX(rate_err_tol - 25, 0))) / 75); +static const double q_div_term[(QINDEX_RANGE >> 5) + 1] = { 32.0, 40.0, 46.0, + 52.0, 56.0, 60.0, + 64.0, 68.0, 72.0 }; +#define EPMB_SCALER 1250000 +static double calc_correction_factor(double err_per_mb, int q) { + double power_term = 0.90; + const int index = q >> 5; + const double divisor = + q_div_term[index] + + (((q_div_term[index + 1] - q_div_term[index]) * (q % 32)) / 32.0); + double error_term = EPMB_SCALER * pow(err_per_mb, power_term); + return error_term / divisor; } // Similar to find_qindex_by_rate() function in ratectrl.c, but includes // calculation of a correction_factor. static int find_qindex_by_rate_with_correction( int desired_bits_per_mb, aom_bit_depth_t bit_depth, double error_per_mb, - double group_weight_factor, int rate_err_tol, int best_qindex, - int worst_qindex) { + double group_weight_factor, int best_qindex, int worst_qindex) { assert(best_qindex <= worst_qindex); int low = best_qindex; int high = worst_qindex; while (low < high) { const int mid = (low + high) >> 1; - const double mid_factor = calc_correction_factor(error_per_mb, mid); + const double q_factor = calc_correction_factor(error_per_mb, mid); const double q = av1_convert_qindex_to_q(mid, bit_depth); - const int enumerator = qbpm_enumerator(rate_err_tol); - const int mid_bits_per_mb = - (int)((enumerator * mid_factor * group_weight_factor) / q); + const int mid_bits_per_mb = (int)((q_factor * group_weight_factor) / q); if (mid_bits_per_mb > desired_bits_per_mb) { low = mid + 1; @@ -359,8 +339,8 @@ // content at the given rate. int q = find_qindex_by_rate_with_correction( target_norm_bits_per_mb, cpi->common.seq_params->bit_depth, - av_err_per_mb, cpi->ppi->twopass.bpm_factor, rate_err_tol, - rc->best_quality, rc->worst_quality); + av_err_per_mb, cpi->ppi->twopass.bpm_factor, rc->best_quality, + rc->worst_quality); // Restriction on active max q for constrained quality mode. if (rc_cfg->mode == AOM_CQ) q = AOMMAX(q, rc_cfg->cq_level); @@ -3535,12 +3515,13 @@ } // Smooth-out the noise variance so it is more stable +// Returns 0 on success, -1 on memory allocation failure. 
// TODO(bohanli): Use a better low-pass filter than averaging -static void smooth_filter_noise(FIRSTPASS_STATS *first_stats, - FIRSTPASS_STATS *last_stats) { +static int smooth_filter_noise(FIRSTPASS_STATS *first_stats, + FIRSTPASS_STATS *last_stats) { int len = (int)(last_stats - first_stats); double *smooth_noise = aom_malloc(len * sizeof(*smooth_noise)); - if (!smooth_noise) return; + if (!smooth_noise) return -1; for (int i = 0; i < len; i++) { double total_noise = 0; @@ -3565,11 +3546,13 @@ } aom_free(smooth_noise); + return 0; } // Estimate the noise variance of each frame from the first pass stats void av1_estimate_noise(FIRSTPASS_STATS *first_stats, - FIRSTPASS_STATS *last_stats) { + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info) { FIRSTPASS_STATS *this_stats, *next_stats; double C1, C2, C3, noise; for (this_stats = first_stats + 2; this_stats < last_stats; this_stats++) { @@ -3655,7 +3638,10 @@ this_stats->noise_var = (first_stats + 2)->noise_var; } - smooth_filter_noise(first_stats, last_stats); + if (smooth_filter_noise(first_stats, last_stats) == -1) { + aom_internal_error(error_info, AOM_CODEC_MEM_ERROR, + "Error allocating buffers in smooth_filter_noise()"); + } } // Estimate correlation coefficient of each frame with its previous frame. @@ -3822,7 +3808,8 @@ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, - twopass->stats_buf_ctx->stats_in_end); + twopass->stats_buf_ctx->stats_in_end, + cpi->common.error); av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); ret = identify_regions(cpi->twopass_frame.stats_in, rest_frames, @@ -3996,7 +3983,7 @@ av1_mark_flashes(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); av1_estimate_noise(twopass->stats_buf_ctx->stats_in_start, - twopass->stats_buf_ctx->stats_in_end); + twopass->stats_buf_ctx->stats_in_end, cpi->common.error); av1_estimate_coeff(twopass->stats_buf_ctx->stats_in_start, twopass->stats_buf_ctx->stats_in_end); @@ -4234,7 +4221,7 @@ int maxq_adj_limit; minq_adj_limit = (rc_cfg->mode == AOM_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT); - maxq_adj_limit = rc->worst_quality - rc->active_worst_quality; + maxq_adj_limit = (rc->worst_quality - rc->active_worst_quality); // Undershoot if ((rc_cfg->under_shoot_pct < 100) && @@ -4246,8 +4233,9 @@ if ((pct_error >= rc_cfg->under_shoot_pct) && (p_rc->rate_error_estimate > 0)) { twopass->extend_minq += 1; + twopass->extend_maxq -= 1; } - twopass->extend_maxq -= 1; + // Overshoot } else if ((rc_cfg->over_shoot_pct < 100) && (p_rc->rolling_actual_bits > p_rc->rolling_target_bits)) { @@ -4259,18 +4247,8 @@ if ((pct_error >= rc_cfg->over_shoot_pct) && (p_rc->rate_error_estimate < 0)) { twopass->extend_maxq += 1; + twopass->extend_minq -= 1; } - twopass->extend_minq -= 1; - } else { - // Adjustment for extreme local overshoot. - // Only applies when normal adjustment above is not used (e.g. - // when threshold is set to 100). - if (rc->projected_frame_size > (2 * rc->base_frame_target) && - rc->projected_frame_size > (2 * rc->avg_frame_bandwidth)) - ++twopass->extend_maxq; - // Unwind extreme overshoot adjustment. 
- else if (p_rc->rolling_target_bits > p_rc->rolling_actual_bits) - --twopass->extend_maxq; } twopass->extend_minq = clamp(twopass->extend_minq, -minq_adj_limit, minq_adj_limit); diff -Nru aom-3.8.2/av1/encoder/pass2_strategy.h aom-3.9.0/av1/encoder/pass2_strategy.h --- aom-3.8.2/av1/encoder/pass2_strategy.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/pass2_strategy.h 2024-05-07 19:57:03.319000000 +0000 @@ -137,7 +137,8 @@ void av1_mark_flashes(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats); void av1_estimate_noise(FIRSTPASS_STATS *first_stats, - FIRSTPASS_STATS *last_stats); + FIRSTPASS_STATS *last_stats, + struct aom_internal_error_info *error_info); void av1_estimate_coeff(FIRSTPASS_STATS *first_stats, FIRSTPASS_STATS *last_stats); diff -Nru aom-3.8.2/av1/encoder/picklpf.c aom-3.9.0/av1/encoder/picklpf.c --- aom-3.8.2/av1/encoder/picklpf.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/picklpf.c 2024-05-07 19:57:03.324000000 +0000 @@ -27,12 +27,25 @@ #include "av1/encoder/encoder.h" #include "av1/encoder/picklpf.h" +// AV1 loop filter applies to the whole frame according to mi_rows and mi_cols, +// which are calculated based on aligned width and aligned height, +// In addition, if super res is enabled, it copies the whole frame +// according to the aligned width and height (av1_superres_upscale()). +// So we need to copy the whole filtered region, instead of the cropped region. +// For example, input image size is: 160x90. +// Then src->y_crop_width = 160, src->y_crop_height = 90. +// The aligned frame size is: src->y_width = 160, src->y_height = 96. +// AV1 aligns frame size to a multiple of 8, if there is +// chroma subsampling, it is able to ensure the chroma is also +// an integer number of mi units. mi unit is 4x4, 8 = 4 * 2, and 2 luma mi +// units correspond to 1 chroma mi unit if there is subsampling. +// See: aom_realloc_frame_buffer() in yv12config.c. 
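
The new picklpf.c comment above hinges on the difference between the cropped and the aligned frame size: dimensions are padded to a multiple of 8 so that, with chroma subsampling, both planes cover a whole number of 4x4 mi units, and the loop filter and super-res upscale operate on that padded area. A quick worked check of the 160x90 example from the comment; the macro below is stand-alone illustration code, not taken from the library.

  #include <stdio.h>

  #define ALIGN_POWER_OF_TWO(value, n) \
    (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

  int main(void) {
    const int crop_w = 160, crop_h = 90;
    const int aligned_w = ALIGN_POWER_OF_TWO(crop_w, 3);  /* 160 */
    const int aligned_h = ALIGN_POWER_OF_TWO(crop_h, 3);  /* 96  */
    /* mi units are 4x4 luma samples; 4:2:0 chroma (80x48) then covers a
     * whole 20x12 grid of its own 4x4 units with no fractional remainder. */
    printf("crop %dx%d -> aligned %dx%d, luma mi grid %dx%d\n", crop_w, crop_h,
           aligned_w, aligned_h, aligned_w / 4, aligned_h / 4);
    return 0;
  }
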
static void yv12_copy_plane(const YV12_BUFFER_CONFIG *src_bc, YV12_BUFFER_CONFIG *dst_bc, int plane) { switch (plane) { - case 0: aom_yv12_copy_y(src_bc, dst_bc); break; - case 1: aom_yv12_copy_u(src_bc, dst_bc); break; - case 2: aom_yv12_copy_v(src_bc, dst_bc); break; + case 0: aom_yv12_copy_y(src_bc, dst_bc, 0); break; + case 1: aom_yv12_copy_u(src_bc, dst_bc, 0); break; + case 2: aom_yv12_copy_v(src_bc, dst_bc, 0); break; default: assert(plane >= 0 && plane <= 2); break; } } @@ -244,6 +257,8 @@ inter_frame_multiplier = inter_frame_multiplier << 1; else if (cpi->rc.frame_source_sad > 50000) inter_frame_multiplier = 3 * (inter_frame_multiplier >> 1); + } else if (cpi->sf.rt_sf.use_fast_fixed_part) { + inter_frame_multiplier = inter_frame_multiplier << 1; } // These values were determined by linear fitting the result of the // searched level for 8 bit depth: @@ -311,7 +326,7 @@ &cpi->last_frame_uf, cm->width, cm->height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); diff -Nru aom-3.8.2/av1/encoder/pickrst.c aom-3.9.0/av1/encoder/pickrst.c --- aom-3.8.2/av1/encoder/pickrst.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/pickrst.c 2024-05-07 19:57:03.328000000 +0000 @@ -1124,6 +1124,15 @@ // Let y = x * w / WIENER_TAP_SCALE_FACTOR // = x * (w1 * WIENER_TAP_SCALE_FACTOR + w2) / WIENER_TAP_SCALE_FACTOR const int64_t y = x * w1 + x * w2 / WIENER_TAP_SCALE_FACTOR; + // Double-check the calculation using __int128. + // TODO(wtc): Remove after 2024-04-30. +#if !defined(NDEBUG) && defined(__GNUC__) && defined(__LP64__) + const int32_t w = w1 * WIENER_TAP_SCALE_FACTOR + w2; + const __int128 z = (__int128)x * w / WIENER_TAP_SCALE_FACTOR; + assert(z >= INT64_MIN); + assert(z <= INT64_MAX); + assert(y == (int64_t)z); +#endif return y; } @@ -1199,7 +1208,8 @@ // Fix vector b, update vector a static AOM_INLINE void update_a_sep_sym(int wiener_win, int64_t **Mc, - int64_t **Hc, int32_t *a, int32_t *b) { + int64_t **Hc, int32_t *a, + const int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; @@ -1269,7 +1279,8 @@ // Fix vector a, update vector b static AOM_INLINE void update_b_sep_sym(int wiener_win, int64_t **Mc, - int64_t **Hc, int32_t *a, int32_t *b) { + int64_t **Hc, const int32_t *a, + int32_t *b) { int i, j; int64_t S[WIENER_WIN]; int64_t A[WIENER_HALFWIN1], B[WIENER_HALFWIN1 * WIENER_HALFWIN1]; @@ -2056,7 +2067,7 @@ &cpi->trial_frame_rst, cm->superres_upscaled_width, cm->superres_upscaled_height, seq_params->subsampling_x, seq_params->subsampling_y, highbd, AOM_RESTORATION_FRAME_BORDER, - cm->features.byte_alignment, NULL, NULL, NULL, 0, 0)) + cm->features.byte_alignment, NULL, NULL, NULL, false, 0)) aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, "Failed to allocate trial restored frame buffer"); diff -Nru aom-3.8.2/av1/encoder/ratectrl.c aom-3.9.0/av1/encoder/ratectrl.c --- aom-3.8.2/av1/encoder/ratectrl.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/ratectrl.c 2024-05-07 19:57:03.334000000 +0000 @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -29,6 +30,7 @@ #include "av1/common/seg_common.h" #include "av1/encoder/encodemv.h" +#include "av1/encoder/encoder_utils.h" #include 
"av1/encoder/encode_strategy.h" #include "av1/encoder/gop_structure.h" #include "av1/encoder/random.h" @@ -438,6 +440,33 @@ rc->rtc_external_ratectrl = 0; rc->frame_level_fast_extra_bits = 0; rc->use_external_qp_one_pass = 0; + rc->percent_blocks_inactive = 0; +} + +static bool check_buffer_below_thresh(AV1_COMP *cpi, int64_t buffer_level, + int drop_mark) { + SVC *svc = &cpi->svc; + if (!cpi->ppi->use_svc || cpi->svc.number_spatial_layers == 1 || + cpi->svc.framedrop_mode == AOM_LAYER_DROP) { + return (buffer_level <= drop_mark); + } else { + // For SVC in the AOM_FULL_SUPERFRAME_DROP): the condition on + // buffer is checked on current and upper spatial layers. + for (int i = svc->spatial_layer_id; i < svc->number_spatial_layers; ++i) { + const int layer = LAYER_IDS_TO_IDX(i, svc->temporal_layer_id, + svc->number_temporal_layers); + LAYER_CONTEXT *lc = &svc->layer_context[layer]; + PRIMARY_RATE_CONTROL *lrc = &lc->p_rc; + // Exclude check for layer whose bitrate is 0. + if (lc->target_bandwidth > 0) { + const int drop_thresh = cpi->oxcf.rc_cfg.drop_frames_water_mark; + const int drop_mark_layer = + (int)(drop_thresh * lrc->optimal_buffer_level / 100); + if (lrc->buffer_level <= drop_mark_layer) return true; + } + } + return false; + } } int av1_rc_drop_frame(AV1_COMP *cpi) { @@ -463,18 +492,29 @@ rc->drop_count_consec >= rc->max_consec_drop)) { return 0; } else { - if (buffer_level < 0) { + SVC *svc = &cpi->svc; + // In the full_superframe framedrop mode for svc, if the previous spatial + // layer was dropped, drop the current spatial layer. + if (cpi->ppi->use_svc && svc->spatial_layer_id > 0 && + svc->drop_spatial_layer[svc->spatial_layer_id - 1] && + svc->framedrop_mode == AOM_FULL_SUPERFRAME_DROP) + return 1; + // -1 is passed here for drop_mark since we are checking if + // buffer goes below 0 (<= -1). + if (check_buffer_below_thresh(cpi, buffer_level, -1)) { // Always drop if buffer is below 0. rc->drop_count_consec++; return 1; } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. - int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * - p_rc->optimal_buffer_level / 100); - if ((buffer_level > drop_mark) && (rc->decimation_factor > 0)) { + const int drop_mark = (int)(oxcf->rc_cfg.drop_frames_water_mark * + p_rc->optimal_buffer_level / 100); + const bool buffer_below_thresh = + check_buffer_below_thresh(cpi, buffer_level, drop_mark); + if (!buffer_below_thresh && rc->decimation_factor > 0) { --rc->decimation_factor; - } else if (buffer_level <= drop_mark && rc->decimation_factor == 0) { + } else if (buffer_below_thresh && rc->decimation_factor == 0) { rc->decimation_factor = 1; } if (rc->decimation_factor > 0) { @@ -1681,41 +1721,39 @@ const AV1_COMMON *const cm = &cpi->common; const RATE_CONTROL *const rc = &cpi->rc; const PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; - const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; int active_best_quality = *active_best; int active_worst_quality = *active_worst; #if CONFIG_FPMT_TEST - const int simulate_parallel_frame = - cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && - cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; - int extend_minq = simulate_parallel_frame ? p_rc->temp_extend_minq - : cpi->ppi->twopass.extend_minq; - int extend_maxq = simulate_parallel_frame ? 
p_rc->temp_extend_maxq - : cpi->ppi->twopass.extend_maxq; #endif // Extension to max or min Q if undershoot or overshoot is outside // the permitted range. if (cpi->oxcf.rc_cfg.mode != AOM_Q) { +#if CONFIG_FPMT_TEST + const int simulate_parallel_frame = + cpi->ppi->gf_group.frame_parallel_level[cpi->gf_frame_index] > 0 && + cpi->ppi->fpmt_unit_test_cfg == PARALLEL_SIMULATION_ENCODE; + const int extend_minq = simulate_parallel_frame + ? p_rc->temp_extend_minq + : cpi->ppi->twopass.extend_minq; + const int extend_maxq = simulate_parallel_frame + ? p_rc->temp_extend_maxq + : cpi->ppi->twopass.extend_maxq; + const RefreshFrameInfo *const refresh_frame = &cpi->refresh_frame; if (frame_is_intra_only(cm) || (!rc->is_src_frame_alt_ref && (refresh_frame->golden_frame || is_intrl_arf_boost || refresh_frame->alt_ref_frame))) { -#if CONFIG_FPMT_TEST active_best_quality -= extend_minq; active_worst_quality += (extend_maxq / 2); -#else - active_best_quality -= cpi->ppi->twopass.extend_minq / 4; - active_worst_quality += (cpi->ppi->twopass.extend_maxq / 2); -#endif } else { -#if CONFIG_FPMT_TEST active_best_quality -= extend_minq / 2; active_worst_quality += extend_maxq; + } #else - active_best_quality -= cpi->ppi->twopass.extend_minq / 4; - active_worst_quality += cpi->ppi->twopass.extend_maxq; + (void)is_intrl_arf_boost; + active_best_quality -= cpi->ppi->twopass.extend_minq / 8; + active_worst_quality += cpi->ppi->twopass.extend_maxq / 4; #endif - } } #ifndef STRICT_RC @@ -2393,6 +2431,10 @@ // otherwise the avg_source_sad can get too large and subsequent frames // may miss the scene/slide detection. if (cpi->rc.high_source_sad) cpi->rc.avg_source_sad = 0; + if (cpi->ppi->use_svc && cpi->svc.number_spatial_layers > 1) { + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = true; + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id] = true; + } } int av1_find_qindex(double desired_q, aom_bit_depth_t bit_depth, @@ -2903,10 +2945,12 @@ for (int i = 0; i < REF_FRAMES; ++i) rtc_ref->refresh[i] = 0; // Set the reference frame flags. ext_flags->ref_frame_flags ^= AOM_LAST_FLAG; - ext_flags->ref_frame_flags ^= AOM_ALT_FLAG; - ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; - if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) - ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG; + if (!cpi->sf.rt_sf.force_only_last_ref) { + ext_flags->ref_frame_flags ^= AOM_ALT_FLAG; + ext_flags->ref_frame_flags ^= AOM_GOLD_FLAG; + if (cpi->sf.rt_sf.ref_frame_comp_nonrd[1]) + ext_flags->ref_frame_flags ^= AOM_LAST2_FLAG; + } const int sh = 6; // Moving index slot for last: 0 - (sh - 1). if (frame_number > 1) last_idx = ((frame_number - 1) % sh); @@ -2947,6 +2991,24 @@ cpi->rt_reduce_num_ref_buffers &= (rtc_ref->ref_idx[2] < 7); } +static int set_block_is_active(unsigned char *const active_map_4x4, int mi_cols, + int mi_rows, int sbi_col, int sbi_row, int sh, + int num_4x4) { + int r = sbi_row << sh; + int c = sbi_col << sh; + const int row_max = AOMMIN(num_4x4, mi_rows - r); + const int col_max = AOMMIN(num_4x4, mi_cols - c); + // Active map is set for 16x16 blocks, so only need to + // check over16x16, + for (int x = 0; x < row_max; x += 4) { + for (int y = 0; y < col_max; y += 4) { + if (active_map_4x4[(r + x) * mi_cols + (c + y)] == AM_SEGMENT_ID_ACTIVE) + return 1; + } + } + return 0; +} + /*!\brief Check for scene detection, for 1 pass real-time mode. 
* * Compute average source sad (temporal sad: between current source and @@ -3049,11 +3111,26 @@ sizeof(*cpi->src_sad_blk_64x64))); } } + const CommonModeInfoParams *const mi_params = &cpi->common.mi_params; + const int mi_cols = mi_params->mi_cols; + const int mi_rows = mi_params->mi_rows; + int sh = (cm->seq_params->sb_size == BLOCK_128X128) ? 5 : 4; + int num_4x4 = (cm->seq_params->sb_size == BLOCK_128X128) ? 32 : 16; + unsigned char *const active_map_4x4 = cpi->active_map.map; // Avoid bottom and right border. for (int sbi_row = 0; sbi_row < sb_rows - border; ++sbi_row) { for (int sbi_col = 0; sbi_col < sb_cols; ++sbi_col) { - tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, - last_src_ystride); + int block_is_active = 1; + if (cpi->active_map.enabled && rc->percent_blocks_inactive > 0) { + block_is_active = set_block_is_active(active_map_4x4, mi_cols, mi_rows, + sbi_col, sbi_row, sh, num_4x4); + } + if (block_is_active) { + tmp_sad = cpi->ppi->fn_ptr[bsize].sdf(src_y, src_ystride, last_src_y, + last_src_ystride); + } else { + tmp_sad = 0; + } if (cpi->src_sad_blk_64x64 != NULL) cpi->src_sad_blk_64x64[sbi_col + sbi_row * sb_cols] = tmp_sad; if (check_light_change) { @@ -3381,6 +3458,7 @@ svc->layer_context[layer].is_key_frame = 1; } rc->frame_number_encoded = 0; + cpi->ppi->rtc_ref.non_reference_frame = 0; } else { *frame_type = INTER_FRAME; gf_group->update_type[cpi->gf_frame_index] = LF_UPDATE; @@ -3411,8 +3489,13 @@ } } } - // Check for scene change: for SVC check on base spatial layer only. - if (cpi->sf.rt_sf.check_scene_detection && svc->spatial_layer_id == 0) { + if (cpi->active_map.enabled && cpi->rc.percent_blocks_inactive == 100) { + rc->frame_source_sad = 0; + rc->avg_source_sad = (3 * rc->avg_source_sad + rc->frame_source_sad) >> 2; + rc->percent_blocks_with_motion = 0; + rc->high_source_sad = 0; + } else if (cpi->sf.rt_sf.check_scene_detection && + svc->spatial_layer_id == 0) { if (rc->prev_coded_width == cm->width && rc->prev_coded_height == cm->height) { rc_scene_detection_onepass_rt(cpi, frame_input); @@ -3477,6 +3560,10 @@ } } +#define CHECK_INTER_LAYER_PRED(ref_frame) \ + ((cpi->ref_frame_flags & av1_ref_frame_flag_list[ref_frame]) && \ + (av1_check_ref_is_low_spatial_res_super_frame(cpi, ref_frame))) + int av1_encodedframe_overshoot_cbr(AV1_COMP *cpi, int *q) { AV1_COMMON *const cm = &cpi->common; PRIMARY_RATE_CONTROL *const p_rc = &cpi->ppi->p_rc; @@ -3487,12 +3574,26 @@ int target_bits_per_mb; double q2; int enumerator; + int inter_layer_pred_on = 0; int is_screen_content = (cpi->oxcf.tune_cfg.content == AOM_CONTENT_SCREEN); - *q = (3 * cpi->rc.worst_quality + *q) >> 2; - // For screen content use the max-q set by the user to allow for less - // overshoot on slide changes. - if (is_screen_content) *q = cpi->rc.worst_quality; cpi->cyclic_refresh->counter_encode_maxq_scene_change = 0; + if (cpi->svc.spatial_layer_id > 0) { + // For spatial layers: check if inter-layer (spatial) prediction is used + // (check if any reference is being used that is the lower spatial layer), + inter_layer_pred_on = CHECK_INTER_LAYER_PRED(LAST_FRAME) || + CHECK_INTER_LAYER_PRED(GOLDEN_FRAME) || + CHECK_INTER_LAYER_PRED(ALTREF_FRAME); + } + // If inter-layer prediction is on: we expect to pull up the quality from + // the lower spatial layer, so we can use a lower q. 
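
The comment above, together with the q reset that follows it, describes the cheaper overshoot recovery for spatial enhancement layers: when inter-layer prediction is available the encoder only jumps halfway to worst_quality rather than three quarters of the way, because the lower layer is expected to restore quality. A small numeric sketch of the two resets; the qindex values are illustrative.

  #include <stdio.h>

  static int overshoot_reset_q(int q, int worst_quality,
                               int inter_layer_pred_on) {
    if (inter_layer_pred_on)
      return (worst_quality + q) >> 1;    /* midpoint: milder reset */
    return (3 * worst_quality + q) >> 2;  /* three quarters of the way up */
  }

  int main(void) {
    const int q = 120, worst = 255;  /* illustrative qindex values */
    printf("with inter-layer pred: %d, without: %d\n",
           overshoot_reset_q(q, worst, 1), overshoot_reset_q(q, worst, 0));
    return 0;
  }
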
+ if (cpi->svc.spatial_layer_id > 0 && inter_layer_pred_on) { + *q = (cpi->rc.worst_quality + *q) >> 1; + } else { + *q = (3 * cpi->rc.worst_quality + *q) >> 2; + // For screen content use the max-q set by the user to allow for less + // overshoot on slide changes. + if (is_screen_content) *q = cpi->rc.worst_quality; + } // Adjust avg_frame_qindex, buffer_level, and rate correction factors, as // these parameters will affect QP selection for subsequent frames. If they // have settled down to a very different (low QP) state, then not adjusting @@ -3521,8 +3622,10 @@ rate_correction_factor; } // For temporal layers: reset the rate control parameters across all - // temporal layers. - if (cpi->svc.number_temporal_layers > 1) { + // temporal layers. Only do it for spatial enhancement layers when + // inter_layer_pred_on is not set (off). + if (cpi->svc.number_temporal_layers > 1 && + (cpi->svc.spatial_layer_id == 0 || inter_layer_pred_on == 0)) { SVC *svc = &cpi->svc; for (int tl = 0; tl < svc->number_temporal_layers; ++tl) { int sl = svc->spatial_layer_id; diff -Nru aom-3.8.2/av1/encoder/ratectrl.h aom-3.9.0/av1/encoder/ratectrl.h --- aom-3.8.2/av1/encoder/ratectrl.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/ratectrl.h 2024-05-07 19:57:03.342000000 +0000 @@ -249,6 +249,9 @@ // signals if number of blocks with motion is high int percent_blocks_with_motion; + // signals percentge of 16x16 blocks that are inactive, via active_maps + int percent_blocks_inactive; + // Maximum value of source sad across all blocks of frame. uint64_t max_block_source_sad; diff -Nru aom-3.8.2/av1/encoder/reconinter_enc.c aom-3.9.0/av1/encoder/reconinter_enc.c --- aom-3.8.2/av1/encoder/reconinter_enc.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/reconinter_enc.c 2024-05-07 19:57:03.373000000 +0000 @@ -157,7 +157,7 @@ get_ref_scale_factors_const(ctxt->cm, frame); xd->block_ref_scale_factors[0] = sf; - if ((!av1_is_valid_scale(sf))) + if (!av1_is_valid_scale(sf)) aom_internal_error(xd->error_info, AOM_CODEC_UNSUP_BITSTREAM, "Reference frame has invalid dimensions"); diff -Nru aom-3.8.2/av1/encoder/speed_features.c aom-3.9.0/av1/encoder/speed_features.c --- aom-3.8.2/av1/encoder/speed_features.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/speed_features.c 2024-05-07 19:57:03.380000000 +0000 @@ -514,6 +514,7 @@ sf->part_sf.prune_rectangular_split_based_on_qidx = allow_screen_content_tools ? 0 : 2; sf->part_sf.prune_rect_part_using_4x4_var_deviation = true; + sf->part_sf.prune_rect_part_using_none_pred_mode = true; sf->part_sf.prune_sub_8x8_partition_level = allow_screen_content_tools ? 0 : 1; sf->part_sf.prune_part4_search = 3; @@ -1176,6 +1177,7 @@ sf->mv_sf.subpel_search_method = SUBPEL_TREE_PRUNED_MORE; sf->gm_sf.prune_zero_mv_with_sse = 2; + sf->gm_sf.downsample_level = 1; sf->part_sf.simple_motion_search_prune_agg = allow_screen_content_tools ? SIMPLE_AGG_LVL0 : SIMPLE_AGG_LVL2; @@ -1281,6 +1283,8 @@ sf->hl_sf.disable_extra_sc_testing = 1; sf->hl_sf.second_alt_ref_filtering = 0; + sf->gm_sf.downsample_level = 2; + sf->inter_sf.prune_inter_modes_based_on_tpl = boosted ? 0 : 3; sf->inter_sf.selective_ref_frame = 6; sf->inter_sf.prune_single_ref = is_boosted_arf2_bwd_type ? 0 : 2; @@ -1453,7 +1457,27 @@ if (speed >= 9) sf->lpf_sf.cdef_pick_method = CDEF_PICK_FROM_Q; if (speed >= 10) sf->rt_sf.nonrd_aggressive_skip = 1; } - + // TODO(marpan): Tune settings for speed 11 video mode, + // for resolutions below 720p. 
+ if (speed >= 11 && !is_720p_or_larger && + cpi->oxcf.tune_cfg.content != AOM_CONTENT_SCREEN) { + sf->rt_sf.skip_cdef_sb = 2; + sf->rt_sf.force_only_last_ref = 1; + sf->rt_sf.selective_cdf_update = 1; + sf->rt_sf.use_nonrd_filter_search = 0; + if (is_360p_or_larger) { + sf->part_sf.fixed_partition_size = BLOCK_32X32; + sf->rt_sf.use_fast_fixed_part = 1; + sf->mv_sf.subpel_force_stop = HALF_PEL; + } + sf->rt_sf.increase_source_sad_thresh = 1; + sf->rt_sf.part_early_exit_zeromv = 2; + sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; + for (int i = 0; i < BLOCK_SIZES; ++i) { + sf->rt_sf.intra_y_mode_bsize_mask_nrd[i] = INTRA_DC; + } + sf->rt_sf.hybrid_intra_pickmode = 0; + } // Setting for SVC, or when the ref_frame_config control is // used to set the reference structure. if (cpi->ppi->use_svc || cpi->ppi->rtc_ref.set_ref_frame_config) { @@ -1553,13 +1577,13 @@ sf->rt_sf.screen_content_cdef_filter_qindex_thresh = 80; sf->rt_sf.part_early_exit_zeromv = 1; sf->rt_sf.nonrd_aggressive_skip = 1; + sf->rt_sf.thresh_active_maps_skip_lf_cdef = 90; } if (speed >= 11) { sf->rt_sf.skip_lf_screen = 2; sf->rt_sf.skip_cdef_sb = 2; sf->rt_sf.part_early_exit_zeromv = 2; sf->rt_sf.prune_palette_nonrd = 1; - sf->rt_sf.set_zeromv_skip_based_on_source_sad = 2; sf->rt_sf.increase_color_thresh_palette = 0; } sf->rt_sf.use_nonrd_altref_frame = 0; @@ -1577,15 +1601,16 @@ } } if (cpi->rc.max_block_source_sad > 20000 && - cpi->rc.frame_source_sad > 100 && - cpi->rc.percent_blocks_with_motion > 1 && speed >= 6) { + cpi->rc.frame_source_sad > 100 && speed >= 6 && + (cpi->rc.percent_blocks_with_motion > 1 || + cpi->svc.last_layer_dropped[0])) { sf->mv_sf.search_method = NSTEP; sf->rt_sf.fullpel_search_step_param = 2; } sf->rt_sf.partition_direct_merging = 0; sf->hl_sf.accurate_bit_estimate = 0; - // This feature is for nonrd_pickmode and non-svc for now. - if (sf->rt_sf.use_nonrd_pick_mode && !cpi->ppi->use_svc) + // This feature is for nonrd_pickmode. + if (sf->rt_sf.use_nonrd_pick_mode) sf->rt_sf.estimate_motion_for_var_based_partition = 1; else sf->rt_sf.estimate_motion_for_var_based_partition = 0; @@ -1600,6 +1625,18 @@ // Disable for use_highbitdepth = 1 to mitigate issue: b/303023614. sf->rt_sf.estimate_motion_for_var_based_partition = 0; } + if (cpi->oxcf.superres_cfg.enable_superres) { + sf->rt_sf.use_rtc_tf = 0; + sf->rt_sf.nonrd_prune_ref_frame_search = 1; + } + // rtc_tf feature allocates new source because of possible + // temporal filtering which may change the input source during encoding: + // this causes an issue on resized frames when psnr is calculated, + // so disable it here for frames that are resized (encoding width/height + // different from configured width/height). 
+ if (is_psnr_calc_enabled(cpi) && (cpi->oxcf.frm_dim_cfg.width != cm->width || + cpi->oxcf.frm_dim_cfg.height != cm->height)) + sf->rt_sf.use_rtc_tf = 0; } // TODO(kyslov): now this is very similar to @@ -1768,6 +1805,8 @@ FLAG_EARLY_TERMINATE; sf->rt_sf.var_part_split_threshold_shift = 5; if (!frame_is_intra_only(&cpi->common)) sf->rt_sf.var_part_based_on_qidx = 1; + sf->rt_sf.use_fast_fixed_part = 0; + sf->rt_sf.increase_source_sad_thresh = 0; if (speed >= 6) { sf->mv_sf.use_fullpel_costlist = 1; @@ -1940,6 +1979,7 @@ gm_sf->prune_ref_frame_for_gm_search = 0; gm_sf->prune_zero_mv_with_sse = 0; gm_sf->disable_gm_search_based_on_stats = 0; + gm_sf->downsample_level = 0; gm_sf->num_refinement_steps = GM_MAX_REFINEMENT_STEPS; } @@ -1978,6 +2018,7 @@ part_sf->prune_ext_part_using_split_info = 0; part_sf->prune_rectangular_split_based_on_qidx = 0; part_sf->prune_rect_part_using_4x4_var_deviation = false; + part_sf->prune_rect_part_using_none_pred_mode = false; part_sf->early_term_after_none_split = 0; part_sf->ml_predict_breakout_level = 0; part_sf->prune_sub_8x8_partition_level = 0; @@ -2235,6 +2276,7 @@ rt_sf->part_early_exit_zeromv = 0; rt_sf->sse_early_term_inter_search = EARLY_TERM_DISABLED; rt_sf->skip_lf_screen = 0; + rt_sf->thresh_active_maps_skip_lf_cdef = 100; rt_sf->sad_based_adp_altref_lag = 0; rt_sf->partition_direct_merging = 0; rt_sf->var_part_based_on_qidx = 0; @@ -2255,6 +2297,8 @@ rt_sf->enable_ref_short_signaling = false; rt_sf->check_globalmv_on_single_ref = true; rt_sf->increase_color_thresh_palette = false; + rt_sf->selective_cdf_update = 0; + rt_sf->force_only_last_ref = 0; } static fractional_mv_step_fp diff -Nru aom-3.8.2/av1/encoder/speed_features.h aom-3.9.0/av1/encoder/speed_features.h --- aom-3.8.2/av1/encoder/speed_features.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/speed_features.h 2024-05-07 19:57:03.386000000 +0000 @@ -587,6 +587,9 @@ // GF group int disable_gm_search_based_on_stats; + // Downsampling pyramid level to use for global motion estimation + int downsample_level; + // Number of refinement steps to apply after initial model generation int num_refinement_steps; } GLOBAL_MOTION_SPEED_FEATURES; @@ -720,6 +723,28 @@ // speed feature is not applicable to speed >= 7. bool prune_rect_part_using_4x4_var_deviation; + // Prune rectangular partitions based on prediction mode chosen by NONE + // partition. + // false : no pruning + // true : prunes rectangular partition as described below + // If prediction mode chosen by NONE partition is + // DC_PRED or SMOOTH_PRED: Prunes both horizontal and vertical partitions if + // at least one of the left and top neighbor blocks is larger than the + // current block. + // Directional Mode: Prunes either of the horizontal and vertical partition + // based on center angle of the prediction mode chosen by NONE partition. For + // example, vertical partition is pruned if center angle of the prediction + // mode chosen by NONE partition is close to 180 degrees (i.e. horizontal + // direction) and vice versa. + // For allintra encode, this speed feature reduces instruction count by 5.1% + // for speed=6 with coding performance change less than 0.22%. For AVIF image + // encode, this speed feature reduces encode time by 4.44% for speed 6 on a + // typical image dataset with coding performance change less than 0.15%. + // For speed >= 7, variance-based logic is used to determine the partition + // structure instead of recursive partition search. 
Therefore, this speed + // feature is not applicable in such cases. + bool prune_rect_part_using_none_pred_mode; + // Terminate partition search for child partition, // when NONE and SPLIT partition rd_costs are INT64_MAX. int early_term_after_none_split; @@ -1654,10 +1679,24 @@ // rc->high_source_sad = 0 (non slide-changes), and color sensitivity off. int skip_cdef_sb; + // Force selective cdf update. + int selective_cdf_update; + + // Force only single reference (LAST) for prediction. + int force_only_last_ref; + // Forces larger partition blocks in variance based partitioning for intra // frames int force_large_partition_blocks_intra; + // Use fixed partition for superblocks based on source_sad. + // 0: disabled + // 1: enabled + int use_fast_fixed_part; + + // Increase source_sad thresholds in nonrd pickmode. + int increase_source_sad_thresh; + // Skip evaluation of no split in tx size selection for merge partition int skip_tx_no_split_var_based_partition; @@ -1735,6 +1774,10 @@ // where rc->high_source_sad = 0 (no slide-changes). int skip_lf_screen; + // Threshold on the active/inactive region percent to disable + // the loopfilter and cdef. Setting to 100 disables this feature. + int thresh_active_maps_skip_lf_cdef; + // For nonrd: early exit out of variance partition that sets the // block size to superblock size, and sets mode to zeromv-last skip. // 0: disabled diff -Nru aom-3.8.2/av1/encoder/superres_scale.c aom-3.9.0/av1/encoder/superres_scale.c --- aom-3.8.2/av1/encoder/superres_scale.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/superres_scale.c 2024-05-07 19:57:03.394000000 +0000 @@ -347,7 +347,8 @@ SCALE_NUMERATOR }; int resize_denom = SCALE_NUMERATOR; if (has_no_stats_stage(cpi) && cpi->ppi->use_svc && - cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1) { + (cpi->common.width != cpi->oxcf.frm_dim_cfg.width || + cpi->common.height != cpi->oxcf.frm_dim_cfg.height)) { rsz.resize_width = cpi->common.width; rsz.resize_height = cpi->common.height; return rsz; @@ -403,7 +404,7 @@ assert(!is_lossless_requested(&cpi->oxcf.rc_cfg)); assert(!cm->features.all_lossless); - av1_superres_upscale(cm, NULL, cpi->image_pyramid_levels); + av1_superres_upscale(cm, NULL, cpi->alloc_pyramid); // If regular resizing is occurring the source will need to be downscaled to // match the upscaled superres resolution. Otherwise the original source is diff -Nru aom-3.8.2/av1/encoder/svc_layercontext.c aom-3.9.0/av1/encoder/svc_layercontext.c --- aom-3.8.2/av1/encoder/svc_layercontext.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/svc_layercontext.c 2024-05-07 19:57:03.395000000 +0000 @@ -77,6 +77,8 @@ } svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; + svc->last_layer_dropped[sl] = false; + svc->drop_spatial_layer[sl] = false; } if (svc->number_spatial_layers == 3) { svc->downsample_filter_type[0] = EIGHTTAP_SMOOTH; @@ -201,8 +203,10 @@ } } -static AOM_INLINE bool check_ref_is_low_spatial_res_super_frame( - int ref_frame, const SVC *svc, const RTC_REF *rtc_ref) { +bool av1_check_ref_is_low_spatial_res_super_frame(AV1_COMP *const cpi, + int ref_frame) { + SVC *svc = &cpi->svc; + RTC_REF *const rtc_ref = &cpi->ppi->rtc_ref; int ref_frame_idx = rtc_ref->ref_idx[ref_frame - 1]; return rtc_ref->buffer_time_index[ref_frame_idx] == svc->current_superframe && rtc_ref->buffer_spatial_layer[ref_frame_idx] <= @@ -251,13 +255,13 @@ // previous spatial layer(s) at the same time (current_superframe). 
if (rtc_ref->set_ref_frame_config && svc->force_zero_mode_spatial_ref && cpi->sf.rt_sf.use_nonrd_pick_mode) { - if (check_ref_is_low_spatial_res_super_frame(LAST_FRAME, svc, rtc_ref)) { + if (av1_check_ref_is_low_spatial_res_super_frame(cpi, LAST_FRAME)) { svc->skip_mvsearch_last = 1; } - if (check_ref_is_low_spatial_res_super_frame(GOLDEN_FRAME, svc, rtc_ref)) { + if (av1_check_ref_is_low_spatial_res_super_frame(cpi, GOLDEN_FRAME)) { svc->skip_mvsearch_gf = 1; } - if (check_ref_is_low_spatial_res_super_frame(ALTREF_FRAME, svc, rtc_ref)) { + if (av1_check_ref_is_low_spatial_res_super_frame(cpi, ALTREF_FRAME)) { svc->skip_mvsearch_altref = 1; } } @@ -320,8 +324,12 @@ svc->temporal_layer_fb[i] = svc->temporal_layer_id; } } - if (svc->spatial_layer_id == svc->number_spatial_layers - 1) + if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { svc->current_superframe++; + // Reset drop flag to false for next superframe. + for (int sl = 0; sl < svc->number_spatial_layers; sl++) + svc->drop_spatial_layer[sl] = false; + } } int av1_svc_primary_ref_frame(const AV1_COMP *const cpi) { @@ -386,6 +394,11 @@ int *height_out) { int w, h; if (width_out == NULL || height_out == NULL || den == 0) return; + if (den == 1 && num == 1) { + *width_out = width_org; + *height_out = height_org; + return; + } w = width_org * num / den; h = height_org * num / den; // Make height and width even. @@ -397,6 +410,7 @@ void av1_one_pass_cbr_svc_start_layer(AV1_COMP *const cpi) { SVC *const svc = &cpi->svc; + AV1_COMMON *const cm = &cpi->common; LAYER_CONTEXT *lc = NULL; int width = 0, height = 0; lc = &svc->layer_context[svc->spatial_layer_id * svc->number_temporal_layers + @@ -418,13 +432,13 @@ if (width * height <= 320 * 240) svc->downsample_filter_type[svc->spatial_layer_id] = EIGHTTAP_SMOOTH; - cpi->common.width = width; - cpi->common.height = height; + cm->width = width; + cm->height = height; alloc_mb_mode_info_buffers(cpi); av1_update_frame_size(cpi); if (svc->spatial_layer_id == svc->number_spatial_layers - 1) { - svc->mi_cols_full_resoln = cpi->common.mi_params.mi_cols; - svc->mi_rows_full_resoln = cpi->common.mi_params.mi_rows; + svc->mi_cols_full_resoln = cm->mi_params.mi_cols; + svc->mi_rows_full_resoln = cm->mi_params.mi_rows; } } diff -Nru aom-3.8.2/av1/encoder/svc_layercontext.h aom-3.9.0/av1/encoder/svc_layercontext.h --- aom-3.8.2/av1/encoder/svc_layercontext.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/svc_layercontext.h 2024-05-07 19:57:03.396000000 +0000 @@ -147,6 +147,23 @@ * different/lower bitrate. */ int has_lower_quality_layer; + + /*! + * Flag to indicate the frame drop mode for SVC: one of the two settings: + * AOM_LAYER_DROP (default) or AOM_FULL_SUPERFRAME_DROP. + */ + AOM_SVC_FRAME_DROP_MODE framedrop_mode; + + /*! + * Flag to indicate if a frame was dropped for a given spatial_layer_id on + * the previous superframe. + */ + bool last_layer_dropped[AOM_MAX_SS_LAYERS]; + + /*! + * Flag to indicate if a previous spatial layer was dropped for the same superframe. + */ + bool drop_spatial_layer[AOM_MAX_SS_LAYERS]; } SVC; struct AV1_COMP; @@ -206,6 +223,21 @@ */ void av1_update_temporal_layer_framerate(struct AV1_COMP *const cpi); +/*!\brief Check if the reference is a lower spatial layer at the same + * timestamp/superframe. + * + * \ingroup SVC + * \callgraph + * \callergraph + * + * \param[in] cpi Top level encoder structure + * \param[in] ref_frame Reference frame + * + * \return True if the ref_frame is a lower spatial layer, otherwise false.
+ */ +bool av1_check_ref_is_low_spatial_res_super_frame(struct AV1_COMP *const cpi, + int ref_frame); + /*!\brief Prior to encoding the frame, set the layer context, for the current layer to be encoded, to the cpi struct. * diff -Nru aom-3.8.2/av1/encoder/temporal_filter.c aom-3.9.0/av1/encoder/temporal_filter.c --- aom-3.8.2/av1/encoder/temporal_filter.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/temporal_filter.c 2024-05-07 19:57:03.397000000 +0000 @@ -1443,26 +1443,24 @@ return oxcf->algo_cfg.arnr_max_frames > 0 && oxcf->gf_cfg.lag_in_frames > 1; } -void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const AV1_COMP *cpi) { const AV1EncoderConfig *oxcf = &cpi->oxcf; tf_info->is_temporal_filter_on = av1_is_temporal_filter_on(oxcf); - if (tf_info->is_temporal_filter_on == 0) return; + if (tf_info->is_temporal_filter_on == 0) return true; const AV1_COMMON *cm = &cpi->common; const SequenceHeader *const seq_params = cm->seq_params; - int ret; for (int i = 0; i < TF_INFO_BUF_COUNT; ++i) { - ret = aom_realloc_frame_buffer( - &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, oxcf->frm_dim_cfg.height, - seq_params->subsampling_x, seq_params->subsampling_y, - seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, NULL, NULL, NULL, - cpi->image_pyramid_levels, 0); - if (ret) { - aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, - "Failed to allocate tf_info"); + if (aom_realloc_frame_buffer( + &tf_info->tf_buf[i], oxcf->frm_dim_cfg.width, + oxcf->frm_dim_cfg.height, seq_params->subsampling_x, + seq_params->subsampling_y, seq_params->use_highbitdepth, + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, NULL, NULL, + NULL, cpi->alloc_pyramid, 0)) { + return false; } } + return true; } void av1_tf_info_free(TEMPORAL_FILTER_INFO *tf_info) { diff -Nru aom-3.8.2/av1/encoder/temporal_filter.h aom-3.9.0/av1/encoder/temporal_filter.h --- aom-3.8.2/av1/encoder/temporal_filter.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/temporal_filter.h 2024-05-07 19:57:03.401000000 +0000 @@ -14,6 +14,8 @@ #include +#include "aom_util/aom_pthread.h" + #ifdef __cplusplus extern "C" { #endif @@ -204,8 +206,10 @@ /*!\brief Allocate buffers for TEMPORAL_FILTER_INFO * \param[in,out] tf_info Temporal filter info for a gop * \param[in,out] cpi Top level encoder instance structure + * + * \return True on success, false on memory allocation failure. 
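As an illustration of the signature change above: av1_tf_info_alloc() now reports allocation failure through its bool return value instead of calling aom_internal_error() internally, so the caller is expected to raise the error itself. A minimal, hypothetical caller-side sketch follows; the wrapper name and the cpi->ppi->tf_info member access are assumptions for illustration, not taken from this diff.

    // Hypothetical caller: turn a failed allocation into a codec error.
    static void alloc_tf_buffers_example(AV1_COMP *cpi) {
      if (!av1_tf_info_alloc(&cpi->ppi->tf_info, cpi)) {
        aom_internal_error(cpi->common.error, AOM_CODEC_MEM_ERROR,
                           "Failed to allocate tf_info");
      }
    }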
*/ -void av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, +bool av1_tf_info_alloc(TEMPORAL_FILTER_INFO *tf_info, const struct AV1_COMP *cpi); /*!\brief Free buffers for TEMPORAL_FILTER_INFO diff -Nru aom-3.8.2/av1/encoder/tpl_model.c aom-3.9.0/av1/encoder/tpl_model.c --- aom-3.8.2/av1/encoder/tpl_model.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/tpl_model.c 2024-05-07 19:57:03.406000000 +0000 @@ -19,6 +19,7 @@ #include "config/aom_scale_rtcd.h" #include "aom/aom_codec.h" +#include "aom_util/aom_pthread.h" #include "av1/common/av1_common_int.h" #include "av1/common/enums.h" @@ -193,7 +194,7 @@ &tpl_data->tpl_rec_pool[frame], width, height, seq_params->subsampling_x, seq_params->subsampling_y, seq_params->use_highbitdepth, tpl_data->border_in_pixels, - byte_alignment, 0, alloc_y_plane_only)) + byte_alignment, false, alloc_y_plane_only)) aom_internal_error(&ppi->error, AOM_CODEC_MEM_ERROR, "Failed to allocate frame buffer"); } diff -Nru aom-3.8.2/av1/encoder/tpl_model.h aom-3.9.0/av1/encoder/tpl_model.h --- aom-3.8.2/av1/encoder/tpl_model.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/tpl_model.h 2024-05-07 19:57:03.411000000 +0000 @@ -30,6 +30,7 @@ #include "config/aom_config.h" #include "aom_scale/yv12config.h" +#include "aom_util/aom_pthread.h" #include "av1/common/mv.h" #include "av1/common/scale.h" diff -Nru aom-3.8.2/av1/encoder/tune_butteraugli.c aom-3.9.0/av1/encoder/tune_butteraugli.c --- aom-3.8.2/av1/encoder/tune_butteraugli.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/tune_butteraugli.c 2024-05-07 19:57:03.413000000 +0000 @@ -209,7 +209,7 @@ if (dst->buffer_alloc_sz == 0) { aom_alloc_frame_buffer( dst, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); } av1_copy_and_extend_frame(cpi->source, dst); @@ -218,7 +218,7 @@ aom_alloc_frame_buffer( resized_dst, width / resize_factor, height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); } if (!av1_resize_and_extend_frame_nonnormative( cpi->source, resized_dst, bit_depth, av1_num_planes(cm))) { @@ -244,7 +244,7 @@ aom_alloc_frame_buffer( &resized_recon, width / resize_factor, height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); copy_img(&cpi->common.cur_frame->buf, &resized_recon, width / resize_factor, height / resize_factor); @@ -267,12 +267,12 @@ cpi->source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_source, &cpi->scaled_source, cm->features.interp_filter, - 0, false, false, cpi->oxcf.border_in_pixels, cpi->image_pyramid_levels); + 0, false, false, cpi->oxcf.border_in_pixels, cpi->alloc_pyramid); if (cpi->unscaled_last_source != NULL) { cpi->last_source = av1_realloc_and_scale_if_required( cm, cpi->unscaled_last_source, &cpi->scaled_last_source, cm->features.interp_filter, 0, false, false, cpi->oxcf.border_in_pixels, - cpi->image_pyramid_levels); + cpi->alloc_pyramid); } av1_setup_butteraugli_source(cpi); diff -Nru aom-3.8.2/av1/encoder/tune_vmaf.c aom-3.9.0/av1/encoder/tune_vmaf.c --- aom-3.8.2/av1/encoder/tune_vmaf.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/tune_vmaf.c 2024-05-07 19:57:03.414000000 +0000 @@ -247,7 +247,9 @@ // 8-tap Gaussian convolution 
filter with sigma = 1.0, sums to 128, // all co-efficients must be even. -DECLARE_ALIGNED(16, static const int16_t, gauss_filter[8]) = { 0, 8, 30, 52, +// The array is of size 9 to allow passing gauss_filter + 1 to +// _mm_loadu_si128() in prepare_coeffs_6t(). +DECLARE_ALIGNED(16, static const int16_t, gauss_filter[9]) = { 0, 8, 30, 52, 30, 8, 0, 0 }; static AOM_INLINE void gaussian_blur(const int bit_depth, const YV12_BUFFER_CONFIG *source, @@ -288,10 +290,10 @@ } } -static AOM_INLINE double cal_approx_vmaf(const AV1_COMP *const cpi, - double source_variance, - YV12_BUFFER_CONFIG *const source, - YV12_BUFFER_CONFIG *const sharpened) { +static AOM_INLINE double cal_approx_vmaf( + const AV1_COMP *const cpi, double source_variance, + const YV12_BUFFER_CONFIG *const source, + const YV12_BUFFER_CONFIG *const sharpened) { const int bit_depth = cpi->td.mb.e_mbd.bd; const bool cal_vmaf_neg = cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN; @@ -305,11 +307,11 @@ } static double find_best_frame_unsharp_amount_loop( - const AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const source, - YV12_BUFFER_CONFIG *const blurred, YV12_BUFFER_CONFIG *const sharpened, - double best_vmaf, const double baseline_variance, - const double unsharp_amount_start, const double step_size, - const int max_loop_count, const double max_amount) { + const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source, + const YV12_BUFFER_CONFIG *const blurred, + const YV12_BUFFER_CONFIG *const sharpened, double best_vmaf, + const double baseline_variance, const double unsharp_amount_start, + const double step_size, const int max_loop_count, const double max_amount) { const double min_amount = 0.0; int loop_count = 0; double approx_vmaf = best_vmaf; @@ -328,13 +330,11 @@ return AOMMIN(max_amount, AOMMAX(unsharp_amount, min_amount)); } -static double find_best_frame_unsharp_amount(const AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source, - YV12_BUFFER_CONFIG *const blurred, - const double unsharp_amount_start, - const double step_size, - const int max_loop_count, - const double max_filter_amount) { +static double find_best_frame_unsharp_amount( + const AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const source, + const YV12_BUFFER_CONFIG *const blurred, const double unsharp_amount_start, + const double step_size, const int max_loop_count, + const double max_filter_amount) { const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; @@ -343,7 +343,7 @@ aom_alloc_frame_buffer( &sharpened, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); const double baseline_variance = frame_average_variance(cpi, source); double unsharp_amount; @@ -376,7 +376,7 @@ } void av1_vmaf_neg_preprocessing(AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source) { + const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_width; @@ -395,7 +395,7 @@ aom_alloc_frame_buffer( &blurred, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, source, &blurred); unsharp(cpi, source, &blurred, source, best_frame_unsharp_amount); @@ -403,7 +403,7 @@ } void 
av1_vmaf_frame_preprocessing(AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source) { + const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int bit_depth = cpi->td.mb.e_mbd.bd; const int width = source->y_width; @@ -415,11 +415,11 @@ aom_alloc_frame_buffer( &source_extended, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer( &blurred, width, height, source->subsampling_x, source->subsampling_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); @@ -442,7 +442,7 @@ } void av1_vmaf_blk_preprocessing(AV1_COMP *const cpi, - YV12_BUFFER_CONFIG *const source) { + const YV12_BUFFER_CONFIG *const source) { const AV1_COMMON *const cm = &cpi->common; const int width = source->y_width; const int height = source->y_height; @@ -455,11 +455,11 @@ memset(&source_extended, 0, sizeof(source_extended)); aom_alloc_frame_buffer( &blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&source_extended, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); av1_copy_and_extend_frame(source, &source_extended); gaussian_blur(bit_depth, &source_extended, &blurred); @@ -495,11 +495,11 @@ aom_alloc_frame_buffer(&source_block, block_w, block_h, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_block, block_w, block_h, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); for (int row = 0; row < num_rows; ++row) { for (int col = 0; col < num_cols; ++col) { @@ -622,7 +622,7 @@ aom_alloc_frame_buffer( &resized_source, y_width / resize_factor, y_height / resize_factor, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); if (!av1_resize_and_extend_frame_nonnormative( cpi->source, &resized_source, bit_depth, av1_num_planes(cm))) { aom_internal_error(cm->error, AOM_CODEC_MEM_ERROR, @@ -643,7 +643,7 @@ aom_alloc_frame_buffer(&blurred, resized_y_width, resized_y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, &resized_source, &blurred); YV12_BUFFER_CONFIG recon; @@ -651,7 +651,7 @@ aom_alloc_frame_buffer(&recon, resized_y_width, resized_y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_yv12_copy_frame(&resized_source, &recon, 1); VmafContext *vmaf_context; @@ -830,15 +830,15 @@ aom_alloc_frame_buffer(&blurred_cur, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - 
cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_last, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&blurred_next, y_width, y_height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, cur, &blurred_cur); gaussian_blur(bit_depth, last, &blurred_last); @@ -881,8 +881,8 @@ } static AOM_INLINE void get_neighbor_frames(const AV1_COMP *const cpi, - YV12_BUFFER_CONFIG **last, - YV12_BUFFER_CONFIG **next) { + const YV12_BUFFER_CONFIG **last, + const YV12_BUFFER_CONFIG **next) { const AV1_COMMON *const cm = &cpi->common; const GF_GROUP *gf_group = &cpi->ppi->gf_group; const int src_index = @@ -920,7 +920,7 @@ if (approx_sse < sse_threshold || approx_dvmaf < vmaf_threshold) { return current_qindex; } - YV12_BUFFER_CONFIG *cur_buf = cpi->source; + const YV12_BUFFER_CONFIG *cur_buf = cpi->source; if (cm->show_frame == 0) { const int src_index = gf_group->arf_src_offset[cpi->gf_frame_index]; struct lookahead_entry *cur_entry = av1_lookahead_peek( @@ -929,7 +929,7 @@ } assert(cur_buf); - YV12_BUFFER_CONFIG *next_buf, *last_buf; + const YV12_BUFFER_CONFIG *next_buf, *last_buf; get_neighbor_frames(cpi, &last_buf, &next_buf); assert(last_buf); @@ -954,8 +954,8 @@ static AOM_INLINE double cal_approx_score( AV1_COMP *const cpi, double src_variance, double new_variance, - double src_score, YV12_BUFFER_CONFIG *const src, - YV12_BUFFER_CONFIG *const recon_sharpened) { + double src_score, const YV12_BUFFER_CONFIG *const src, + const YV12_BUFFER_CONFIG *const recon_sharpened) { double score; const uint32_t bit_depth = cpi->td.mb.e_mbd.bd; const bool cal_vmaf_neg = @@ -967,11 +967,12 @@ static double find_best_frame_unsharp_amount_loop_neg( AV1_COMP *const cpi, double src_variance, double base_score, - YV12_BUFFER_CONFIG *const src, YV12_BUFFER_CONFIG *const recon, - YV12_BUFFER_CONFIG *const ref, YV12_BUFFER_CONFIG *const src_blurred, - YV12_BUFFER_CONFIG *const recon_blurred, - YV12_BUFFER_CONFIG *const src_sharpened, - YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, + const YV12_BUFFER_CONFIG *const src, const YV12_BUFFER_CONFIG *const recon, + const YV12_BUFFER_CONFIG *const ref, + const YV12_BUFFER_CONFIG *const src_blurred, + const YV12_BUFFER_CONFIG *const recon_blurred, + const YV12_BUFFER_CONFIG *const src_sharpened, + const YV12_BUFFER_CONFIG *const recon_sharpened, FULLPEL_MV *mvs, double best_score, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_amount) { const double min_amount = 0.0; @@ -999,8 +1000,8 @@ } static double find_best_frame_unsharp_amount_neg( - AV1_COMP *const cpi, YV12_BUFFER_CONFIG *const src, - YV12_BUFFER_CONFIG *const recon, YV12_BUFFER_CONFIG *const ref, + AV1_COMP *const cpi, const YV12_BUFFER_CONFIG *const src, + const YV12_BUFFER_CONFIG *const recon, const YV12_BUFFER_CONFIG *const ref, double base_score, const double unsharp_amount_start, const double step_size, const int max_loop_count, const double max_filter_amount) { @@ -1023,18 +1024,18 @@ aom_alloc_frame_buffer(&recon_sharpened, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); 
aom_alloc_frame_buffer(&src_sharpened, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer(&recon_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, cpi->oxcf.border_in_pixels, - cm->features.byte_alignment, 0, 0); + cm->features.byte_alignment, false, 0); aom_alloc_frame_buffer( &src_blurred, width, height, ss_x, ss_y, cm->seq_params->use_highbitdepth, - cpi->oxcf.border_in_pixels, cm->features.byte_alignment, 0, 0); + cpi->oxcf.border_in_pixels, cm->features.byte_alignment, false, 0); gaussian_blur(bit_depth, recon, &recon_blurred); gaussian_blur(bit_depth, src, &src_blurred); @@ -1076,8 +1077,8 @@ } void av1_update_vmaf_curve(AV1_COMP *cpi) { - YV12_BUFFER_CONFIG *source = cpi->source; - YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; + const YV12_BUFFER_CONFIG *source = cpi->source; + const YV12_BUFFER_CONFIG *recon = &cpi->common.cur_frame->buf; const int bit_depth = cpi->td.mb.e_mbd.bd; const GF_GROUP *const gf_group = &cpi->ppi->gf_group; const int layer_depth = @@ -1099,7 +1100,7 @@ } if (cpi->oxcf.tune_cfg.tuning == AOM_TUNE_VMAF_NEG_MAX_GAIN) { - YV12_BUFFER_CONFIG *last, *next; + const YV12_BUFFER_CONFIG *last, *next; get_neighbor_frames(cpi, &last, &next); double best_unsharp_amount_start = get_layer_value(cpi->vmaf_info.last_frame_unsharp_amount, layer_depth); diff -Nru aom-3.8.2/av1/encoder/tune_vmaf.h aom-3.9.0/av1/encoder/tune_vmaf.h --- aom-3.8.2/av1/encoder/tune_vmaf.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/tune_vmaf.h 2024-05-07 19:57:03.416000000 +0000 @@ -43,13 +43,13 @@ struct AV1_COMP; void av1_vmaf_blk_preprocessing(struct AV1_COMP *cpi, - YV12_BUFFER_CONFIG *source); + const YV12_BUFFER_CONFIG *source); void av1_vmaf_frame_preprocessing(struct AV1_COMP *cpi, - YV12_BUFFER_CONFIG *source); + const YV12_BUFFER_CONFIG *source); void av1_vmaf_neg_preprocessing(struct AV1_COMP *cpi, - YV12_BUFFER_CONFIG *source); + const YV12_BUFFER_CONFIG *source); void av1_set_mb_vmaf_rdmult_scaling(struct AV1_COMP *cpi); diff -Nru aom-3.8.2/av1/encoder/var_based_part.c aom-3.9.0/av1/encoder/var_based_part.c --- aom-3.8.2/av1/encoder/var_based_part.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/var_based_part.c 2024-05-07 19:57:03.440000000 +0000 @@ -1109,8 +1109,8 @@ static void fill_variance_tree_leaves( AV1_COMP *cpi, MACROBLOCK *x, VP128x128 *vt, PART_EVAL_STATUS *force_split, int avg_16x16[][4], int maxvar_16x16[][4], int minvar_16x16[][4], - int *variance4x4downsample, int64_t *thresholds, const uint8_t *src_buf, - int src_stride, const uint8_t *dst_buf, int dst_stride, bool is_key_frame, + int64_t *thresholds, const uint8_t *src_buf, int src_stride, + const uint8_t *dst_buf, int dst_stride, bool is_key_frame, const bool is_small_sb) { MACROBLOCKD *xd = &x->e_mbd; const int num_64x64_blocks = is_small_sb ? 1 : 4; @@ -1157,11 +1157,8 @@ const int split_index = 21 + lvl1_scale_idx + lvl2_idx; VP16x16 *vst = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; force_split[split_index] = PART_EVAL_ALL; - variance4x4downsample[lvl1_scale_idx + lvl2_idx] = 0; if (is_key_frame) { - force_split[split_index] = PART_EVAL_ALL; // Go down to 4x4 down-sampling for variance. 
- variance4x4downsample[lvl1_scale_idx + lvl2_idx] = 1; for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) { const int x8_idx = x16_idx + GET_BLK_IDX_X(lvl3_idx, 3); const int y8_idx = y16_idx + GET_BLK_IDX_Y(lvl3_idx, 3); @@ -1347,6 +1344,8 @@ AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int num_planes = av1_num_planes(cm); + bool scaled_ref_golden = false; + bool scaled_ref_alt = false; BLOCK_SIZE bsize = is_small_sb ? BLOCK_64X64 : BLOCK_128X128; MB_MODE_INFO *mi = xd->mi[0]; const YV12_BUFFER_CONFIG *yv12 = @@ -1364,21 +1363,22 @@ cpi->sf.rt_sf.use_nonrd_altref_frame || (cpi->sf.rt_sf.use_comp_ref_nonrd && cpi->sf.rt_sf.ref_frame_comp_nonrd[2] == 1); - // On a resized frame (reference has different scale) only use - // LAST as reference for partitioning for now. - if (scaled_ref_last) { - use_golden_ref = 0; - use_alt_ref = 0; - } // For 1 spatial layer: GOLDEN is another temporal reference. // Check if it should be used as reference for partitioning. if (cpi->svc.number_spatial_layers == 1 && use_golden_ref && (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { yv12_g = get_ref_frame_yv12_buf(cm, GOLDEN_FRAME); + if (yv12_g && (yv12_g->y_crop_height != cm->height || + yv12_g->y_crop_width != cm->width)) { + yv12_g = av1_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + scaled_ref_golden = true; + } if (yv12_g && yv12_g != yv12) { - av1_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col, - get_ref_scale_factors(cm, GOLDEN_FRAME), num_planes); + av1_setup_pre_planes( + xd, 0, yv12_g, mi_row, mi_col, + scaled_ref_golden ? NULL : get_ref_scale_factors(cm, GOLDEN_FRAME), + num_planes); *y_sad_g = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, @@ -1392,9 +1392,16 @@ (cpi->ref_frame_flags & AOM_ALT_FLAG) && (x->content_state_sb.source_sad_nonrd != kZeroSad || !use_last_ref)) { yv12_alt = get_ref_frame_yv12_buf(cm, ALTREF_FRAME); + if (yv12_alt && (yv12_alt->y_crop_height != cm->height || + yv12_alt->y_crop_width != cm->width)) { + yv12_alt = av1_get_scaled_ref_frame(cpi, ALTREF_FRAME); + scaled_ref_alt = true; + } if (yv12_alt && yv12_alt != yv12) { - av1_setup_pre_planes(xd, 0, yv12_alt, mi_row, mi_col, - get_ref_scale_factors(cm, ALTREF_FRAME), num_planes); + av1_setup_pre_planes( + xd, 0, yv12_alt, mi_row, mi_col, + scaled_ref_alt ? 
NULL : get_ref_scale_factors(cm, ALTREF_FRAME), + num_planes); *y_sad_alt = cpi->ppi->fn_ptr[bsize].sdf( x->plane[AOM_PLANE_Y].src.buf, x->plane[AOM_PLANE_Y].src.stride, xd->plane[AOM_PLANE_Y].pre[0].buf, @@ -1518,7 +1525,9 @@ int set_zeromv_skip_based_on_source_sad, SOURCE_SAD source_sad_nonrd) { if (set_zeromv_skip_based_on_source_sad == 0) return false; - if (set_zeromv_skip_based_on_source_sad >= 2) + if (set_zeromv_skip_based_on_source_sad >= 3) + return source_sad_nonrd <= kLowSad; + else if (set_zeromv_skip_based_on_source_sad >= 2) return source_sad_nonrd <= kVeryLowSad; else if (set_zeromv_skip_based_on_source_sad >= 1) return source_sad_nonrd == kZeroSad; @@ -1527,20 +1536,21 @@ } static AOM_INLINE bool set_force_zeromv_skip_for_sb( - AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP16x16 *vt2, - VP128x128 *vt, unsigned int *uv_sad, int mi_row, int mi_col, - unsigned int y_sad, BLOCK_SIZE bsize) { + AV1_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, VP128x128 *vt, + unsigned int *uv_sad, int mi_row, int mi_col, unsigned int y_sad, + BLOCK_SIZE bsize) { AV1_COMMON *const cm = &cpi->common; if (!is_set_force_zeromv_skip_based_on_src_sad( cpi->sf.rt_sf.set_zeromv_skip_based_on_source_sad, x->content_state_sb.source_sad_nonrd)) return false; + int shift = cpi->sf.rt_sf.increase_source_sad_thresh ? 1 : 0; const int block_width = mi_size_wide[cm->seq_params->sb_size]; const int block_height = mi_size_high[cm->seq_params->sb_size]; const unsigned int thresh_exit_part_y = - cpi->zeromv_skip_thresh_exit_part[bsize]; + cpi->zeromv_skip_thresh_exit_part[bsize] << shift; unsigned int thresh_exit_part_uv = - CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y); + CALC_CHROMA_THRESH_FOR_ZEROMV_SKIP(thresh_exit_part_y) << shift; // Be more aggressive in UV threshold if source_sad >= VeryLowSad // to suppreess visual artifact caused by the speed feature: // set_zeromv_skip_based_on_source_sad = 2. For now only for @@ -1553,7 +1563,6 @@ uv_sad[0] < thresh_exit_part_uv && uv_sad[1] < thresh_exit_part_uv) { set_block_size(cpi, mi_row, mi_col, bsize); x->force_zeromv_skip_for_sb = 1; - aom_free(vt2); aom_free(vt); // Partition shape is set here at SB level. // Exit needs to happen from av1_choose_var_based_partitioning(). @@ -1573,8 +1582,6 @@ AV1_COMMON *const cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; const int64_t *const vbp_thresholds = cpi->vbp_info.thresholds; - VP128x128 *vt; - VP16x16 *vt2 = NULL; PART_EVAL_STATUS force_split[85]; int avg_64x64; int max_var_32x32[4]; @@ -1586,7 +1593,6 @@ int avg_16x16[4][4]; int maxvar_16x16[4][4]; int minvar_16x16[4][4]; - int64_t threshold_4x4avg; const uint8_t *src_buf; const uint8_t *dst_buf; int dst_stride; @@ -1617,16 +1623,10 @@ // Ref frame used in partitioning. 
MV_REFERENCE_FRAME ref_frame_partition = LAST_FRAME; - AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt))); - - vt->split = td->vt64x64; - int64_t thresholds[5] = { vbp_thresholds[0], vbp_thresholds[1], vbp_thresholds[2], vbp_thresholds[3], vbp_thresholds[4] }; - const int low_res = (cm->width <= 352 && cm->height <= 288); - int variance4x4downsample[64]; const int segment_id = xd->mi[0]->segment_id; uint64_t blk_sad = 0; if (cpi->src_sad_blk_64x64 != NULL && @@ -1653,9 +1653,6 @@ x->content_state_sb.source_sad_nonrd, x->content_state_sb.source_sad_rd, is_segment_id_boosted, x->content_state_sb.lighting_change); - // For non keyframes, disable 4x4 average for low resolution when speed = 8 - threshold_4x4avg = INT64_MAX; - src_buf = x->plane[AOM_PLANE_Y].src.buf; int src_stride = x->plane[AOM_PLANE_Y].src.stride; @@ -1720,6 +1717,10 @@ x->force_zeromv_skip_for_sb = 0; + VP128x128 *vt; + AOM_CHECK_MEM_ERROR(xd->error_info, vt, aom_malloc(sizeof(*vt))); + vt->split = td->vt64x64; + // If the superblock is completely static (zero source sad) and // the y_sad (relative to LAST ref) is very small, take the sb_size partition // and exit, and force zeromv_last skip mode for nonrd_pickmode. @@ -1730,28 +1731,19 @@ cpi->rc.frames_since_key > 30 && segment_id == CR_SEGMENT_ID_BASE && ref_frame_partition == LAST_FRAME && xd->mi[0]->mv[0].as_int == 0) { // Exit here, if zero mv skip flag is set at SB level. - if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt2, vt, uv_sad, mi_row, - mi_col, y_sad, bsize)) + if (set_force_zeromv_skip_for_sb(cpi, x, tile, vt, uv_sad, mi_row, mi_col, + y_sad, bsize)) return 0; } if (cpi->noise_estimate.enabled) noise_level = av1_noise_estimate_extract_level(&cpi->noise_estimate); - if (low_res && threshold_4x4avg < INT64_MAX) { - vt2 = aom_malloc(sizeof(*vt2)); - if (!vt2) { - aom_free(vt); - aom_internal_error(xd->error_info, AOM_CODEC_MEM_ERROR, - "Error allocating partition buffer vt2"); - } - } - // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances - // for splits. + // Fill in the entire tree of 8x8 (for inter frames) or 4x4 (for key frames) + // variances for splits. fill_variance_tree_leaves(cpi, x, vt, force_split, avg_16x16, maxvar_16x16, - minvar_16x16, variance4x4downsample, thresholds, - src_buf, src_stride, dst_buf, dst_stride, - is_key_frame, is_small_sb); + minvar_16x16, thresholds, src_buf, src_stride, + dst_buf, dst_stride, is_key_frame, is_small_sb); avg_64x64 = 0; for (int blk64_idx = 0; blk64_idx < num_64x64_blocks; ++blk64_idx) { @@ -1761,11 +1753,8 @@ for (int lvl1_idx = 0; lvl1_idx < 4; lvl1_idx++) { const int lvl1_scale_idx = (blk64_scale_idx + lvl1_idx) << 2; for (int lvl2_idx = 0; lvl2_idx < 4; lvl2_idx++) { - if (variance4x4downsample[lvl1_scale_idx + lvl2_idx] != 1) continue; - VP16x16 *vtemp = - (!is_key_frame) - ? &vt2[lvl1_scale_idx + lvl2_idx] - : &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + if (!is_key_frame) continue; + VP16x16 *vtemp = &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; for (int lvl3_idx = 0; lvl3_idx < 4; lvl3_idx++) fill_variance_tree(&vtemp->split[lvl3_idx], BLOCK_8X8); fill_variance_tree(vtemp, BLOCK_16X16); @@ -1892,14 +1881,8 @@ const int x16_idx = GET_BLK_IDX_X(lvl2_idx, 2); const int y16_idx = GET_BLK_IDX_Y(lvl2_idx, 2); const int split_index = 21 + lvl1_scale_idx + lvl2_idx; - // For inter frames: if variance4x4downsample[] == 1 for this - // 16x16 block, then the variance is based on 4x4 down-sampling, - // so use vt2 in set_vt_partioning(), otherwise use vt. 
VP16x16 *vtemp = - (!is_key_frame && - variance4x4downsample[lvl1_scale_idx + lvl2_idx] == 1) - ? &vt2[lvl1_scale_idx + lvl2_idx] - : &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; + &vt->split[blk64_idx].split[lvl1_idx].split[lvl2_idx]; if (set_vt_partitioning(cpi, xd, tile, vtemp, BLOCK_16X16, mi_row + y64_idx + y32_idx + y16_idx, mi_col + x64_idx + x32_idx + x16_idx, @@ -1923,7 +1906,6 @@ ref_frame_partition, mi_col, mi_row, is_small_sb); } - aom_free(vt2); aom_free(vt); #if CONFIG_COLLECT_COMPONENT_TIMING end_timing(cpi, choose_var_based_partitioning_time); diff -Nru aom-3.8.2/av1/encoder/x86/av1_fwd_txfm_sse2.c aom-3.9.0/av1/encoder/x86/av1_fwd_txfm_sse2.c --- aom-3.8.2/av1/encoder/x86/av1_fwd_txfm_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/x86/av1_fwd_txfm_sse2.c 2024-05-07 19:57:03.453000000 +0000 @@ -2638,6 +2638,11 @@ } } +// Include top-level function only for 32-bit x86, to support Valgrind. +// For normal use, we require SSE4.1, so av1_lowbd_fwd_txfm_sse4_1 will be used +// instead of this function. However, 32-bit Valgrind does not support SSE4.1, +// so we include a fallback to SSE2 to improve performance +#if AOM_ARCH_X86 static FwdTxfm2dFunc fwd_txfm2d_func_ls[TX_SIZES_ALL] = { av1_lowbd_fwd_txfm2d_4x4_sse2, // 4x4 transform av1_lowbd_fwd_txfm2d_8x8_sse2, // 8x8 transform @@ -2671,3 +2676,4 @@ fwd_txfm2d_func(src_diff, coeff, diff_stride, txfm_param->tx_type, txfm_param->bd); } +#endif // AOM_ARCH_X86 diff -Nru aom-3.8.2/av1/encoder/x86/av1_k_means_avx2.c aom-3.9.0/av1/encoder/x86/av1_k_means_avx2.c --- aom-3.8.2/av1/encoder/x86/av1_k_means_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/x86/av1_k_means_avx2.c 2024-05-07 19:57:03.458000000 +0000 @@ -10,7 +10,7 @@ */ #include // AVX2 -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" static int64_t k_means_horizontal_sum_avx2(__m256i a) { diff -Nru aom-3.8.2/av1/encoder/x86/av1_k_means_sse2.c aom-3.9.0/av1/encoder/x86/av1_k_means_sse2.c --- aom-3.8.2/av1/encoder/x86/av1_k_means_sse2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/x86/av1_k_means_sse2.c 2024-05-07 19:57:03.459000000 +0000 @@ -11,7 +11,7 @@ #include // SSE2 -#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" #include "aom_dsp/x86/synonyms.h" static int64_t k_means_horizontal_sum_sse2(__m128i a) { diff -Nru aom-3.8.2/av1/encoder/x86/hash_sse42.c aom-3.9.0/av1/encoder/x86/hash_sse42.c --- aom-3.8.2/av1/encoder/x86/hash_sse42.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/x86/hash_sse42.c 2024-05-07 19:57:03.465000000 +0000 @@ -12,6 +12,8 @@ #include #include +#include "config/av1_rtcd.h" + // Byte-boundary alignment issues #define ALIGN_SIZE 8 #define ALIGN_MASK (ALIGN_SIZE - 1) diff -Nru aom-3.8.2/av1/encoder/x86/highbd_block_error_intrin_avx2.c aom-3.9.0/av1/encoder/x86/highbd_block_error_intrin_avx2.c --- aom-3.8.2/av1/encoder/x86/highbd_block_error_intrin_avx2.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/encoder/x86/highbd_block_error_intrin_avx2.c 2024-05-07 19:57:03.465000000 +0000 @@ -13,6 +13,7 @@ #include #include "aom/aom_integer.h" #include "av1/common/common.h" +#include "config/av1_rtcd.h" int64_t av1_highbd_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff, diff -Nru aom-3.8.2/av1/encoder/x86/highbd_block_error_intrin_sse2.c aom-3.9.0/av1/encoder/x86/highbd_block_error_intrin_sse2.c --- aom-3.8.2/av1/encoder/x86/highbd_block_error_intrin_sse2.c 2024-03-09 00:11:52.000000000 
+0000 +++ aom-3.9.0/av1/encoder/x86/highbd_block_error_intrin_sse2.c 2024-05-07 19:57:03.466000000 +0000 @@ -13,6 +13,7 @@ #include #include "av1/common/common.h" +#include "config/av1_rtcd.h" int64_t av1_highbd_block_error_sse2(const tran_low_t *coeff, const tran_low_t *dqcoeff, diff -Nru aom-3.8.2/av1/ratectrl_rtc.cc aom-3.9.0/av1/ratectrl_rtc.cc --- aom-3.8.2/av1/ratectrl_rtc.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/av1/ratectrl_rtc.cc 2024-05-07 19:57:03.485000000 +0000 @@ -128,6 +128,7 @@ oxcf->tune_cfg.content = AOM_CONTENT_DEFAULT; oxcf->rc_cfg.drop_frames_water_mark = rc_cfg.frame_drop_thresh; rc->max_consec_drop = rc_cfg.max_consec_drop; + cpi_->svc.framedrop_mode = AOM_FULL_SUPERFRAME_DROP; oxcf->tool_cfg.bit_depth = AOM_BITS_8; oxcf->tool_cfg.superblock_size = AOM_SUPERBLOCK_SIZE_DYNAMIC; oxcf->algo_cfg.loopfilter_control = LOOPFILTER_ALL; diff -Nru aom-3.8.2/build/cmake/aom_config_defaults.cmake aom-3.9.0/build/cmake/aom_config_defaults.cmake --- aom-3.8.2/build/cmake/aom_config_defaults.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/build/cmake/aom_config_defaults.cmake 2024-05-07 19:57:03.487000000 +0000 @@ -37,6 +37,7 @@ set_aom_detect_var(HAVE_NEON_I8MM 0 "Enables Armv8.2-A Neon i8mm intrinsics optimizations.") set_aom_detect_var(HAVE_SVE 0 "Enables Armv8.2-A SVE intrinsics optimizations.") +set_aom_detect_var(HAVE_SVE2 0 "Enables Armv9-A SVE2 intrinsics optimizations.") # PPC feature flags. set_aom_detect_var(HAVE_VSX 0 "Enables VSX optimizations.") @@ -84,6 +85,9 @@ set_aom_config_var(CONFIG_MULTITHREAD 1 "Multithread support.") set_aom_config_var(CONFIG_OS_SUPPORT 0 "Internal flag.") set_aom_config_var(CONFIG_PIC 0 "Build with PIC enabled.") +set_aom_config_var(CONFIG_QUANT_MATRIX 1 + "Build with quantization matrices for AV1 encoder." + "AV1 decoder is always built with quantization matrices.") set_aom_config_var(CONFIG_REALTIME_ONLY 0 "Build for RTC-only. See aomcx.h for all disabled features.") set_aom_config_var(CONFIG_RUNTIME_CPU_DETECT 1 "Runtime CPU detection support.") @@ -168,6 +172,9 @@ "AV1 experiment: Enable saliency map based encoding tuning for VMAF.") set_aom_config_var(CONFIG_CWG_C013 0 "AV1 experiment: Support for 7.x and 8.x levels.") +# Add this change to make aomenc reported PSNR consistent with libvmaf result. +set_aom_config_var(CONFIG_LIBVMAF_PSNR_PEAK 1 + "Use libvmaf PSNR peak for 10- and 12-bit") # # Variables in this section control optional features of the build system. @@ -206,6 +213,8 @@ "Enables Armv8.2-A Neon i8mm optimizations on AArch64 targets." ON) set_aom_option_var(ENABLE_SVE "Enables Armv8.2-A SVE optimizations on AArch64 targets." ON) +set_aom_option_var(ENABLE_SVE2 + "Enables Armv9-A SVE2 optimizations on AArch64 targets." ON) # VSX intrinsics flags. set_aom_option_var(ENABLE_VSX "Enables VSX optimizations on PowerPC targets." diff -Nru aom-3.8.2/build/cmake/aom_configure.cmake aom-3.9.0/build/cmake/aom_configure.cmake --- aom-3.8.2/build/cmake/aom_configure.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/build/cmake/aom_configure.cmake 2024-05-07 19:57:03.488000000 +0000 @@ -320,6 +320,10 @@ # minimum supported C++ version. If Clang is using this Standard Library # implementation, it cannot target C++11. require_cxx_flag_nomsvc("-std=c++14" YES) + elseif(CYGWIN AND CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + # The GNU C++ compiler in Cygwin needs the -std=gnu++11 flag to make the + # POSIX function declarations visible in the Standard C Library headers. 
+ require_cxx_flag_nomsvc("-std=gnu++11" YES) else() require_cxx_flag_nomsvc("-std=c++11" YES) endif() @@ -393,6 +397,13 @@ endif() add_compiler_flag_if_supported("-D_LARGEFILE_SOURCE") add_compiler_flag_if_supported("-D_FILE_OFFSET_BITS=64") + + # Do not allow implicit vector type conversions on Clang builds (this is + # already the default on GCC builds). + if(CMAKE_C_COMPILER_ID MATCHES "Clang") + # Clang 8.0.1 (in Cygwin) doesn't support -flax-vector-conversions=none. + add_compiler_flag_if_supported("-flax-vector-conversions=none") + endif() endif() # Prior to r23, or with ANDROID_USE_LEGACY_TOOLCHAIN_FILE set, diff -Nru aom-3.8.2/build/cmake/compiler_flags.cmake aom-3.9.0/build/cmake/compiler_flags.cmake --- aom-3.8.2/build/cmake/compiler_flags.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/build/cmake/compiler_flags.cmake 2024-05-07 19:57:03.491000000 +0000 @@ -176,11 +176,11 @@ endif() unset(HAVE_CXX_FLAG CACHE) - message("Checking C compiler flag support for: " ${cxx_flag}) + message("Checking C++ compiler flag support for: " ${cxx_flag}) check_cxx_compiler_flag("${cxx_flag}" HAVE_CXX_FLAG) if(NOT HAVE_CXX_FLAG) message( - FATAL_ERROR "${PROJECT_NAME} requires support for C flag: ${cxx_flag}.") + FATAL_ERROR "${PROJECT_NAME} requires support for C++ flag: ${cxx_flag}.") endif() if(NOT "${AOM_EXE_LINKER_FLAGS}" STREQUAL "") diff -Nru aom-3.8.2/build/cmake/cpu.cmake aom-3.9.0/build/cmake/cpu.cmake --- aom-3.8.2/build/cmake/cpu.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/build/cmake/cpu.cmake 2024-05-07 19:57:03.492000000 +0000 @@ -14,11 +14,12 @@ set(AOM_ARCH_AARCH64 1) set(RTCD_ARCH_ARM "yes") - set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE") + set(ARM64_FLAVORS "NEON;ARM_CRC32;NEON_DOTPROD;NEON_I8MM;SVE;SVE2") set(AOM_ARM_CRC32_DEFAULT_FLAG "-march=armv8-a+crc") set(AOM_NEON_DOTPROD_DEFAULT_FLAG "-march=armv8.2-a+dotprod") set(AOM_NEON_I8MM_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm") set(AOM_SVE_DEFAULT_FLAG "-march=armv8.2-a+dotprod+i8mm+sve") + set(AOM_SVE2_DEFAULT_FLAG "-march=armv9-a+sve2") # SVE2 is a v9-only feature # Check that the compiler flag to enable each flavor is supported by the # compiler. This may not be the case for new architecture features on old @@ -45,8 +46,8 @@ endif() endforeach() - # SVE requires that the Neon-SVE bridge header is also available. - if(ENABLE_SVE) + # SVE and SVE2 require that the Neon-SVE bridge header is also available. 
+ if(ENABLE_SVE OR ENABLE_SVE2) set(OLD_CMAKE_REQURED_FLAGS ${CMAKE_REQUIRED_FLAGS}) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} ${AOM_SVE_FLAG}") aom_check_source_compiles("arm_neon_sve_bridge_available" " @@ -58,6 +59,7 @@ set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQURED_FLAGS}) if(HAVE_SVE_HEADERS EQUAL 0) set(ENABLE_SVE 0) + set(ENABLE_SVE2 0) endif() endif() diff -Nru aom-3.8.2/build/cmake/rtcd.pl aom-3.9.0/build/cmake/rtcd.pl --- aom-3.8.2/build/cmake/rtcd.pl 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/build/cmake/rtcd.pl 2024-05-07 19:57:03.493000000 +0000 @@ -392,7 +392,7 @@ @ALL_ARCHS = filter(qw/neon/); arm; } elsif ($opts{arch} eq 'arm64' ) { - @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve/); + @ALL_ARCHS = filter(qw/neon arm_crc32 neon_dotprod neon_i8mm sve sve2/); @REQUIRES = filter(qw/neon/); &require(@REQUIRES); arm; diff -Nru aom-3.8.2/common/tools_common.c aom-3.9.0/common/tools_common.c --- aom-3.8.2/common/tools_common.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/common/tools_common.c 2024-05-07 19:57:03.502000000 +0000 @@ -97,7 +97,7 @@ int w = aom_img_plane_width(yuv_frame, plane); const int h = aom_img_plane_height(yuv_frame, plane); int r; - // Assuming that for nv12 we read all chroma data at one time + // Assuming that for nv12 we read all chroma data at once if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; if (yuv_frame->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; /* Determine the correct plane based on the image format. The for-loop @@ -245,17 +245,21 @@ void aom_img_write(const aom_image_t *img, FILE *file) { int plane; + const int bytespp = (img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 2 : 1; for (plane = 0; plane < 3; ++plane) { const unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * - ((img->fmt & AOM_IMG_FMT_HIGHBITDEPTH) ? 
2 : 1); + int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; + // Assuming that for nv12 we write all chroma data at once + if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + for (y = 0; y < h; ++y) { - fwrite(buf, 1, w, file); + fwrite(buf, bytespp, w, file); buf += stride; } } @@ -268,12 +272,16 @@ for (plane = 0; plane < 3; ++plane) { unsigned char *buf = img->planes[plane]; const int stride = img->stride[plane]; - const int w = aom_img_plane_width(img, plane) * bytespp; + int w = aom_img_plane_width(img, plane); const int h = aom_img_plane_height(img, plane); int y; + // Assuming that for nv12 we read all chroma data at once + if (img->fmt == AOM_IMG_FMT_NV12 && plane > 1) break; + if (img->fmt == AOM_IMG_FMT_NV12 && plane == 1) w *= 2; + for (y = 0; y < h; ++y) { - if (fread(buf, 1, w, file) != (size_t)w) return false; + if (fread(buf, bytespp, w, file) != (size_t)w) return false; buf += stride; } } diff -Nru aom-3.8.2/debian/changelog aom-3.9.0/debian/changelog --- aom-3.8.2/debian/changelog 2024-04-17 20:29:24.000000000 +0000 +++ aom-3.9.0/debian/changelog 2024-05-07 20:20:38.000000000 +0000 @@ -1,3 +1,9 @@ +aom (3.9.0-0ubuntu1~22.04.sav0) jammy; urgency=medium + + * New upstream release + + -- Rob Savoury Tue, 07 May 2024 13:20:38 -0700 + aom (3.8.2-2~22.04.sav1) jammy; urgency=medium * debian/control: Add missing BDs for tests (as intended with last upload) diff -Nru aom-3.8.2/doc/dev_guide/av1_encoder.dox aom-3.9.0/doc/dev_guide/av1_encoder.dox --- aom-3.8.2/doc/dev_guide/av1_encoder.dox 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/doc/dev_guide/av1_encoder.dox 2024-05-07 19:57:03.513000000 +0000 @@ -1313,6 +1313,34 @@ All the related functions are listed in \ref coefficient_coding. +\section architecture_simd SIMD usage + +In order to efficiently encode video on modern platforms, it is necessary to +implement optimized versions of many core encoding and decoding functions using +architecture-specific SIMD instructions. + +Functions which have optimized implementations will have multiple variants +in the code, each suffixed with the name of the appropriate instruction set. +There will additionally be an `_c` version, which acts as a reference +implementation which the SIMD variants can be tested against. + +As different machines with the same nominal architecture may support different +subsets of SIMD instructions, we have dynamic CPU detection logic which chooses +the appropriate functions to use at run time. This process is handled by +`build/cmake/rtcd.pl`, with function definitions in the files +`*_rtcd_defs.pl` elsewhere in the codebase. + +Currently SIMD is supported on the following platforms: + +- x86: Requires SSE4.1 or above + +- Arm: Requires Neon (Armv7-A and above) + +We aim to provide implementations of all performance-critical functions which +are compatible with the instruction sets listed above. Additional SIMD +extensions (e.g. AVX on x86, SVE on Arm) are also used to provide even +greater performance where available. 
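The SIMD usage section above describes per-function run-time dispatch between the _c reference implementation and its SIMD variants. The following self-contained C sketch illustrates that pattern; the names (my_sad_c, my_sad_sse4_1, my_rtcd_init, HAS_SSE4_1_FLAG) are placeholders and not the libaom API, whose dispatch tables are generated by build/cmake/rtcd.pl from the *_rtcd_defs.pl files.

    #include <stdlib.h>

    // Placeholder CPU-feature bit; real code derives such flags from
    // CPUID/HWCAP-style runtime detection.
    #define HAS_SSE4_1_FLAG (1 << 0)

    typedef int (*sad_fn)(const unsigned char *a, const unsigned char *b, int n);

    // The _c reference implementation: plain C, always available.
    static int my_sad_c(const unsigned char *a, const unsigned char *b, int n) {
      int sum = 0;
      for (int i = 0; i < n; ++i) sum += abs(a[i] - b[i]);
      return sum;
    }

    // Stand-in for an SSE4.1 variant; a real one would use intrinsics and must
    // remain bit-exact with my_sad_c.
    static int my_sad_sse4_1(const unsigned char *a, const unsigned char *b,
                             int n) {
      return my_sad_c(a, b, n);
    }

    static sad_fn my_sad;  // function pointer resolved once at startup

    static void my_rtcd_init(int cpu_flags) {
      my_sad = my_sad_c;  // fall back to the reference implementation
      if (cpu_flags & HAS_SSE4_1_FLAG) my_sad = my_sad_sse4_1;
    }

Hot paths then always call through the pointer (my_sad(a, b, n)), so the per-platform choice is made once at initialization rather than on every call, and the _c variant doubles as the baseline that the SIMD variants are tested against.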
+ */ /*!\defgroup encoder_algo Encoder Algorithm diff -Nru aom-3.8.2/examples/aom_cx_set_ref.c aom-3.9.0/examples/aom_cx_set_ref.c --- aom-3.8.2/examples/aom_cx_set_ref.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/examples/aom_cx_set_ref.c 2024-05-07 19:57:03.597000000 +0000 @@ -61,7 +61,7 @@ static const char *exec_name; -void usage_exit() { +void usage_exit(void) { fprintf(stderr, "Usage: %s " " \n", diff -Nru aom-3.8.2/examples/av1_dec_fuzzer.cc aom-3.9.0/examples/av1_dec_fuzzer.cc --- aom-3.8.2/examples/av1_dec_fuzzer.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/examples/av1_dec_fuzzer.cc 2024-05-07 19:57:03.598000000 +0000 @@ -34,6 +34,14 @@ return 0; } + // Abusing the four unused bytes at the end of the IVF file header as a source + // of random bits. + unsigned int tile_mode = (data[IVF_FILE_HDR_SZ - 1] & 2) != 0; + unsigned int ext_tile_debug = (data[IVF_FILE_HDR_SZ - 1] & 4) != 0; + unsigned int is_annexb = (data[IVF_FILE_HDR_SZ - 1] & 8) != 0; + int output_all_layers = (data[IVF_FILE_HDR_SZ - 1] & 0x10) != 0; + int operating_point = data[IVF_FILE_HDR_SZ - 2] & 0x1F; + aom_codec_iface_t *codec_interface = aom_codec_av1_dx(); aom_codec_ctx_t codec; // Set thread count in the range [1, 64]. @@ -42,6 +50,13 @@ if (aom_codec_dec_init(&codec, codec_interface, &cfg, 0)) { return 0; } + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1_SET_TILE_MODE, tile_mode); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_EXT_TILE_DEBUG, ext_tile_debug); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_IS_ANNEXB, is_annexb); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OUTPUT_ALL_LAYERS, + output_all_layers); + AOM_CODEC_CONTROL_TYPECHECKED(&codec, AV1D_SET_OPERATING_POINT, + operating_point); data += IVF_FILE_HDR_SZ; size -= IVF_FILE_HDR_SZ; diff -Nru aom-3.8.2/examples/inspect.c aom-3.9.0/examples/inspect.c --- aom-3.8.2/examples/inspect.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/examples/inspect.c 2024-05-07 19:57:03.600000000 +0000 @@ -742,7 +742,7 @@ aom_free(buffer); } -void ifd_init_cb() { +void ifd_init_cb(void) { aom_inspect_init ii; ii.inspect_cb = inspect; ii.inspect_ctx = NULL; @@ -775,7 +775,7 @@ size_t frame_size = 0; EMSCRIPTEN_KEEPALIVE -int read_frame() { +int read_frame(void) { img = NULL; // This loop skips over any frames that are show_existing_frames, as @@ -824,16 +824,18 @@ } EMSCRIPTEN_KEEPALIVE -const char *get_aom_codec_build_config() { return aom_codec_build_config(); } +const char *get_aom_codec_build_config(void) { + return aom_codec_build_config(); +} EMSCRIPTEN_KEEPALIVE -int get_bit_depth() { return img->bit_depth; } +int get_bit_depth(void) { return img->bit_depth; } EMSCRIPTEN_KEEPALIVE -int get_bits_per_sample() { return img->bps; } +int get_bits_per_sample(void) { return img->bps; } EMSCRIPTEN_KEEPALIVE -int get_image_format() { return img->fmt; } +int get_image_format(void) { return img->fmt; } EMSCRIPTEN_KEEPALIVE unsigned char *get_plane(int plane) { return img->planes[plane]; } @@ -848,10 +850,10 @@ int get_plane_height(int plane) { return aom_img_plane_height(img, plane); } EMSCRIPTEN_KEEPALIVE -int get_frame_width() { return info->frame_width; } +int get_frame_width(void) { return info->frame_width; } EMSCRIPTEN_KEEPALIVE -int get_frame_height() { return info->frame_height; } +int get_frame_height(void) { return info->frame_height; } static void parse_args(char **argv) { char **argi, **argj; @@ -949,7 +951,7 @@ } EMSCRIPTEN_KEEPALIVE -void quit() { +void quit(void) { if (aom_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy 
codec"); aom_video_reader_close(reader); } diff -Nru aom-3.8.2/examples/svc_encoder_rtc.cc aom-3.9.0/examples/svc_encoder_rtc.cc --- aom-3.8.2/examples/svc_encoder_rtc.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/examples/svc_encoder_rtc.cc 2024-05-07 19:57:03.608000000 +0000 @@ -1442,6 +1442,35 @@ return 63; } +static void set_active_map(const aom_codec_enc_cfg_t *cfg, + aom_codec_ctx_t *codec, int frame_cnt) { + aom_active_map_t map = { 0, 0, 0 }; + + map.rows = (cfg->g_h + 15) / 16; + map.cols = (cfg->g_w + 15) / 16; + + map.active_map = (uint8_t *)malloc(map.rows * map.cols); + if (!map.active_map) die("Failed to allocate active map"); + + // Example map for testing. + for (unsigned int i = 0; i < map.rows; ++i) { + for (unsigned int j = 0; j < map.cols; ++j) { + int index = map.cols * i + j; + map.active_map[index] = 1; + if (frame_cnt < 300) { + if (i < map.rows / 2 && j < map.cols / 2) map.active_map[index] = 0; + } else if (frame_cnt >= 300) { + if (i < map.rows / 2 && j >= map.cols / 2) map.active_map[index] = 0; + } + } + } + + if (aom_codec_control(codec, AOME_SET_ACTIVEMAP, &map)) + die_codec(codec, "Failed to set active map"); + + free(map.active_map); +} + int main(int argc, const char **argv) { AppInput app_input; AvxVideoWriter *outfile[AOM_MAX_LAYERS] = { NULL }; @@ -1494,6 +1523,9 @@ // Flag to test setting speed per layer. const int test_speed_per_layer = 0; + // Flag for testing active maps. + const int test_active_maps = 0; + /* Setup default input stream settings */ app_input.input_ctx.framerate.numerator = 30; app_input.input_ctx.framerate.denominator = 1; @@ -1675,6 +1707,9 @@ aom_codec_control(&codec, AV1E_SET_MAX_CONSEC_FRAME_DROP_CBR, INT_MAX); + aom_codec_control(&codec, AV1E_SET_SVC_FRAME_DROP_MODE, + AOM_FULL_SUPERFRAME_DROP); + svc_params.number_spatial_layers = ss_number_layers; svc_params.number_temporal_layers = ts_number_layers; for (i = 0; i < ss_number_layers * ts_number_layers; ++i) { @@ -1871,6 +1906,8 @@ } } + if (test_active_maps) set_active_map(&cfg, &codec, frame_cnt); + // Do the layer encode. aom_usec_timer_start(&timer); if (aom_codec_encode(&codec, frame_avail ? &raw : NULL, pts, 1, flags)) diff -Nru aom-3.8.2/libs.doxy_template aom-3.9.0/libs.doxy_template --- aom-3.8.2/libs.doxy_template 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/libs.doxy_template 2024-05-07 19:57:03.613000000 +0000 @@ -1219,15 +1219,6 @@ HTML_COLORSTYLE_GAMMA = 80 -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting this -# to YES can help to show when doxygen was last run and thus if the -# documentation is up to date. -# The default value is: NO. -# This tag requires that the tag GENERATE_HTML is set to YES. - -HTML_TIMESTAMP = NO - # If the HTML_DYNAMIC_MENUS tag is set to YES then the generated HTML # documentation will contain a main index with vertical navigation menus that # are dynamically created via Javascript. If disabled, the navigation index will @@ -1509,17 +1500,6 @@ FORMULA_FONTSIZE = 10 -# Use the FORMULA_TRANSPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are not -# supported properly for IE 6.0, but are supported on all modern browsers. -# -# Note that when changing this option you need to delete any form_*.png files in -# the HTML output directory before the changes have effect. -# The default value is: YES. 
-# This tag requires that the tag GENERATE_HTML is set to YES. - -FORMULA_TRANSPARENT = YES - # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # https://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX @@ -1820,14 +1800,6 @@ LATEX_BIB_STYLE = plain -# If the LATEX_TIMESTAMP tag is set to YES then the footer of each generated -# page will contain the date and time when the page was generated. Setting this -# to NO can help when comparing the output of multiple runs. -# The default value is: NO. -# This tag requires that the tag GENERATE_LATEX is set to YES. - -LATEX_TIMESTAMP = NO - # The LATEX_EMOJI_DIRECTORY tag is used to specify the (relative or absolute) # path from which the emoji images will be read. If a relative path is entered, # it will be relative to the LATEX_OUTPUT directory. If left blank the @@ -2167,23 +2139,6 @@ DOT_NUM_THREADS = 0 -# When you want a differently looking font in the dot files that doxygen -# generates you can specify the font name using DOT_FONTNAME. You need to make -# sure dot is able to find the font, which can be done by putting it in a -# standard location or by setting the DOTFONTPATH environment variable or by -# setting DOT_FONTPATH to the directory containing the font. -# The default value is: Helvetica. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size (in points) of the font of -# dot graphs. -# Minimum value: 4, maximum value: 24, default value: 10. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_FONTSIZE = 10 - # By default doxygen will tell dot to use the default font as specified with # DOT_FONTNAME. If you specify a different font using DOT_FONTNAME you can set # the path where dot can find it using this tag. @@ -2401,18 +2356,6 @@ MAX_DOT_GRAPH_DEPTH = 0 -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not seem -# to support this out of the box. -# -# Warning: Depending on the platform used, enabling this option may lead to -# badly anti-aliased labels on the edges of a graph (i.e. they become hard to -# read). -# The default value is: NO. -# This tag requires that the tag HAVE_DOT is set to YES. - -DOT_TRANSPARENT = NO - # Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output # files in one run (i.e. multiple -o and -T options on the command line). This # makes dot run faster, but since only newer versions of dot (>1.8.10) support diff -Nru aom-3.8.2/stats/rate_hist.c aom-3.9.0/stats/rate_hist.c --- aom-3.8.2/stats/rate_hist.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/stats/rate_hist.c 2024-05-07 19:57:03.621000000 +0000 @@ -42,8 +42,7 @@ if (hist == NULL || cfg == NULL || fps == NULL || fps->num == 0 || fps->den == 0) { - destroy_rate_histogram(hist); - return NULL; + goto fail; } // Determine the number of samples in the buffer. 
Use the file's framerate @@ -59,6 +58,7 @@ hist->pts = calloc(hist->samples, sizeof(*hist->pts)); hist->sz = calloc(hist->samples, sizeof(*hist->sz)); + if (hist->pts == NULL || hist->sz == NULL) goto fail; for (i = 0; i < RATE_BINS; i++) { hist->bucket[i].low = INT_MAX; hist->bucket[i].high = 0; @@ -66,6 +66,14 @@ } return hist; + +fail: + fprintf(stderr, + "Warning: Unable to allocate buffers required for " + "show_rate_histogram().\n" + "Continuing without rate histogram feature...\n"); + destroy_rate_histogram(hist); + return NULL; } void destroy_rate_histogram(struct rate_hist *hist) { diff -Nru aom-3.8.2/test/accounting_test.cc aom-3.9.0/test/accounting_test.cc --- aom-3.8.2/test/accounting_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/accounting_test.cc 2024-05-07 19:57:03.621000000 +0000 @@ -33,7 +33,7 @@ aom_write(&bw, 0, 32); aom_write(&bw, 0, 32); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); diff -Nru aom-3.8.2/test/active_map_test.cc aom-3.9.0/test/active_map_test.cc --- aom-3.8.2/test/active_map_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/active_map_test.cc 2024-05-07 19:57:03.622000000 +0000 @@ -19,8 +19,10 @@ namespace { +// Params: test mode, speed, aq_mode and screen_content mode. class ActiveMapTest - : public ::libaom_test::CodecTestWith2Params, + : public ::libaom_test::CodecTestWith4Params, public ::libaom_test::EncoderTest { protected: static const int kWidth = 208; @@ -32,6 +34,8 @@ void SetUp() override { InitializeConfig(GET_PARAM(1)); cpu_used_ = GET_PARAM(2); + aq_mode_ = GET_PARAM(3); + screen_mode_ = GET_PARAM(4); } void PreEncodeFrameHook(::libaom_test::VideoSource *video, @@ -41,6 +45,9 @@ encoder->Control(AV1E_SET_ALLOW_WARPED_MOTION, 0); encoder->Control(AV1E_SET_ENABLE_GLOBAL_MOTION, 0); encoder->Control(AV1E_SET_ENABLE_OBMC, 0); + encoder->Control(AV1E_SET_AQ_MODE, aq_mode_); + encoder->Control(AV1E_SET_TUNE_CONTENT, screen_mode_); + if (screen_mode_) encoder->Control(AV1E_SET_ENABLE_PALETTE, 1); } else if (video->frame() == 3) { aom_active_map_t map = aom_active_map_t(); /* clang-format off */ @@ -79,19 +86,22 @@ cfg_.g_pass = AOM_RC_ONE_PASS; cfg_.rc_end_usage = AOM_CBR; cfg_.kf_max_dist = 90000; - ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30, - 1, 0, 20); + ::libaom_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 100, + 1, 0, 100); ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } int cpu_used_; + int aq_mode_; + int screen_mode_; }; TEST_P(ActiveMapTest, Test) { DoTest(); } AV1_INSTANTIATE_TEST_SUITE(ActiveMapTest, ::testing::Values(::libaom_test::kRealTime), - ::testing::Range(5, 9)); + ::testing::Range(5, 12), ::testing::Values(0, 3), + ::testing::Values(0, 1)); } // namespace diff -Nru aom-3.8.2/test/aom_image_test.cc aom-3.9.0/test/aom_image_test.cc --- aom-3.8.2/test/aom_image_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/aom_image_test.cc 2024-05-07 19:57:03.623000000 +0000 @@ -9,6 +9,8 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. 
*/ +#include + #include "aom/aom_image.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -47,6 +49,16 @@ 0); } +TEST(AomImageTest, AomImgAllocNone) { + const int kWidth = 128; + const int kHeight = 128; + + aom_image_t img; + aom_img_fmt_t format = AOM_IMG_FMT_NONE; + unsigned int align = 32; + ASSERT_EQ(aom_img_alloc(&img, format, kWidth, kHeight, align), nullptr); +} + TEST(AomImageTest, AomImgAllocNv12) { const int kWidth = 128; const int kHeight = 128; @@ -54,9 +66,72 @@ aom_image_t img; aom_img_fmt_t format = AOM_IMG_FMT_NV12; unsigned int align = 32; - EXPECT_NE(aom_img_alloc(&img, format, kWidth, kHeight, align), nullptr); + EXPECT_EQ(aom_img_alloc(&img, format, kWidth, kHeight, align), &img); EXPECT_EQ(img.stride[AOM_PLANE_U], img.stride[AOM_PLANE_Y]); EXPECT_EQ(img.stride[AOM_PLANE_V], 0); EXPECT_EQ(img.planes[AOM_PLANE_V], nullptr); aom_img_free(&img); } + +TEST(AomImageTest, AomImgAllocHugeWidth) { + // The stride (0x80000000 * 2) would overflow unsigned int. + aom_image_t *image = + aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The stride (0x80000000) would overflow int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x80000000, 1, 1); + ASSERT_EQ(image, nullptr); + + // The aligned width (UINT_MAX + 1) would overflow unsigned int. + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, UINT_MAX, 1, 1); + ASSERT_EQ(image, nullptr); + + image = aom_img_alloc_with_border(nullptr, AOM_IMG_FMT_I422, 1, INT_MAX, 1, + 0x40000000, 0); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 0x7ffffffe, 1, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I420, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_NV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_YV12, 285245883, 64, 1); + if (image) { + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 65536, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } + + image = aom_img_alloc(nullptr, AOM_IMG_FMT_I42016, 285245883, 2, 1); + if (image) { + uint16_t *y_plane = + reinterpret_cast(image->planes[AOM_PLANE_Y]); + y_plane[0] = 0; + y_plane[image->d_w - 1] = 0; + aom_img_free(image); + } +} diff -Nru aom-3.8.2/test/av1_c_vs_simd_encode.sh aom-3.9.0/test/av1_c_vs_simd_encode.sh --- aom-3.8.2/test/av1_c_vs_simd_encode.sh 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/av1_c_vs_simd_encode.sh 2024-05-07 19:57:03.626000000 +0000 @@ -10,14 +10,18 @@ ## ## This script checks the bit exactness between C and SIMD ## implementations of AV1 encoder. +## +. $(dirname $0)/tools_common.sh PRESETS="good rt" -LOWBD_CLIPS="yuv_raw_input yuv_480p_raw_input y4m_720p_input y4m_screen_input" -HIGHBD_CLIPS="y4m_360p_10bit_input" +LOWBD_CIF_CLIP="yuv_raw_input" +LOWBD_480p_CLIP="yuv_480p_raw_input" +LOWBD_720p_CLIP="y4m_720p_input" +HIGHBD_CLIP="y4m_360p_10bit_input" +SC_CLIP="y4m_screen_input" OUT_FILE_SUFFIX=".ivf" SCRIPT_DIR=$(dirname "$0") LIBAOM_SOURCE_DIR=$(cd ${SCRIPT_DIR}/..; pwd) -devnull='> /dev/null 2>&1' # Clips used in test. 
YUV_RAW_INPUT="${LIBAOM_TEST_DATA_PATH}/hantro_collage_w352h288.yuv" @@ -93,21 +97,23 @@ fi } -cleanup() { - rm -rf ${AOM_TEST_OUTPUT_DIR} -} +# This is not needed since tools_common.sh does the same cleanup. +# Keep the code here for our reference. +# cleanup() { +# rm -rf ${AOM_TEST_OUTPUT_DIR} +# } # Echo AOM_SIMD_CAPS_MASK for different instruction set architecture. -avx512f() { +avx2() { echo "0x1FF" } -avx2() { - echo "0x0FF" +avx() { + echo "0x17F" } -avx() { - echo "0x07F" +sse4_2() { + echo "0x13F" } sse4_1() { @@ -131,15 +137,15 @@ local preset=$2 # Bit-rates: - local bitrate_lowres_good="100 1000" - local bitrate_480p_good="200 2000" - local bitrate_720p_good="600 6000" - local bitrate_scc_360p_good="400 1200" - local bitrate_lowres_rt="50 400" - local bitrate_480p_rt="100 1800" - local bitrate_720p_rt="150 2000" - local bitrate_scc_360p_rt="400 800" - local bitrate_hbd_360p="100 1600" + local bitrate_lowres_good="300" + local bitrate_480p_good="500" + local bitrate_720p_good="1000" + local bitrate_scc_360p_good="500" + local bitrate_lowres_rt="200" + local bitrate_480p_rt="300" + local bitrate_720p_rt="600" + local bitrate_scc_360p_rt="300" + local bitrate_hbd_360p="500" if [ "${preset}" = "good" ]; then if [ "${content}" = "yuv_raw_input" ]; then @@ -208,8 +214,8 @@ has_x86_isa_extn() { instruction_set=$1 - grep -q "$instruction_set" /proc/cpuinfo - if [ $? -eq 1 ]; then + if ! grep -q "$instruction_set" /proc/cpuinfo; then + # This instruction set is not supported. return 1 fi } @@ -297,7 +303,8 @@ -DCMAKE_BUILD_TYPE=Release \ -DENABLE_CCACHE=1 \ '-DCMAKE_C_FLAGS_RELEASE=-O3 -g' \ - '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g'" + '-DCMAKE_CXX_FLAGS_RELEASE=-O3 -g' \ + -DENABLE_DOCS=0 -DENABLE_TESTS=0 -DENABLE_TOOLS=0" for preset in $PRESETS; do echo "Building target[${preset} encoding]: ${target}" @@ -309,8 +316,16 @@ elog "Invalid preset" return 1 fi - eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" ${devnull} - eval make -j$(nproc) ${devnull} + if ! eval "$cmake_command" "${cmake_common_args}" "${cmake_extra_args}" \ + ${devnull}; then + elog "cmake failure" + return 1 + fi + if ! eval make -j$(nproc) aomenc ${devnull}; then + elog "build failure" + return 1 + fi + mv aomenc aomenc_${preset} done echo "Done building target: ${target}" @@ -322,9 +337,8 @@ local clip=$3 local bitrate=$4 local preset=$5 - diff ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ - ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} > /dev/null - if [ $? -eq 1 ]; then + if ! diff -q ${AOM_TEST_OUTPUT_DIR}/Out-generic-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX} \ + ${AOM_TEST_OUTPUT_DIR}/Out-${target}-"${clip}"-${preset}-${bitrate}kbps-cpu${cpu}${OUT_FILE_SUFFIX}; then elog "C vs ${target} encode mismatches for ${clip}, at ${bitrate} kbps, speed ${cpu}, ${preset} preset" return 1 fi @@ -332,35 +346,58 @@ av1_enc_test() { local encoder="$1" - local target="$2" - local preset="$3" + local arch="$2" + local target="$3" + local preset="$4" if [ -z "$(av1_enc_tool_path "${target}" "${preset}")" ]; then elog "aomenc_{preset} not found. 
It must exist in ${AOM_TEST_OUTPUT_DIR}/build_target_${target} path" return 1 fi if [ "${preset}" = "good" ]; then - local min_cpu_used=0 - local max_cpu_used=6 - local test_params=av1_encode_good_params - if [ "${target}" = "armv8-linux-gcc" ]; then - # TODO(BUG=aomedia:3474): Enable testing of high bit-depth clips after - # fixing C vs SIMD mismatches. - local test_clips="${LOWBD_CLIPS}" - else - local test_clips="${LOWBD_CLIPS} ${HIGHBD_CLIPS}" + if [ "${arch}" = "x86_64" ]; then + local min_cpu_used=0 + local max_cpu_used=6 + elif [ "${arch}" = "x86" ]; then + local min_cpu_used=2 + local max_cpu_used=3 fi + local test_params=av1_encode_good_params elif [ "${preset}" = "rt" ]; then local min_cpu_used=5 - local max_cpu_used=10 + local max_cpu_used=11 local test_params=av1_encode_rt_params - local test_clips="${LOWBD_CLIPS}" else elog "Invalid preset" return 1 fi for cpu in $(seq $min_cpu_used $max_cpu_used); do + if [ "${preset}" = "good" ]; then + if [ "${arch}" = "x86_64" ]; then + if [ "${cpu}" -lt 2 ]; then + local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}" + elif [ "${cpu}" -lt 5 ]; then + local test_clips="${LOWBD_480p_CLIP} ${HIGHBD_CLIP}" + else + local test_clips="${LOWBD_720p_CLIP} ${HIGHBD_CLIP}" + fi + elif [ "${arch}" = "x86" ]; then + local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}" + elif [ "${arch}" = "arm64" ]; then + local test_clips="${LOWBD_CIF_CLIP} ${HIGHBD_CLIP}" + fi + elif [ "${preset}" = "rt" ]; then + if [ "${cpu}" -lt 8 ]; then + local test_clips="${LOWBD_CIF_CLIP} ${SC_CLIP}" + else + local test_clips="${LOWBD_480p_CLIP} ${SC_CLIP}" + fi + else + elog "Invalid preset" + return 1 + fi + for clip in ${test_clips}; do local test_bitrates=$(get_bitrates ${clip} ${preset}) for bitrate in ${test_bitrates}; do @@ -371,8 +408,8 @@ ${devnull} if [ "${target}" != "generic" ]; then - compare_enc_output ${target} $cpu ${clip} $bitrate ${preset} - if [ $? -eq 1 ]; then + if ! compare_enc_output ${target} $cpu ${clip} $bitrate ${preset}; then + # Found a mismatch return 1 fi fi @@ -392,40 +429,41 @@ # The cmake command line option -DENABLE_MMX=0 flag disables all SIMD # optimizations, and generates a C-only binary. local cmake_command="cmake $LIBAOM_SOURCE_DIR -DENABLE_MMX=0 \ - -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/${arch}-linux.cmake" + -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake" fi echo "Build for: Generic ${arch}" - av1_enc_build "${target}" "${cmake_command}" + if ! av1_enc_build "${target}" "${cmake_command}"; then + return 1 + fi for preset in $PRESETS; do local encoder="$(av1_enc_tool_path "${target}" "${preset}")" - av1_enc_test $encoder "${target}" "${preset}" + av1_enc_test $encoder "${arch}" "${target}" "${preset}" done } -# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, AVX, AVX2 as there are -# no functions with MMX, SSE and AVX512 specialization. +# This function encodes AV1 bitstream by enabling SSE2, SSE3, SSSE3, SSE4_1, SSE4_2, AVX, AVX2 as +# there are no functions with MMX, SSE and AVX512 specialization. # The value of environment variable 'AOM_SIMD_CAPS_MASK' controls enabling of different instruction # set extension optimizations. 
The value of the flag 'AOM_SIMD_CAPS_MASK' and the corresponding # instruction set extension optimization enabled are as follows: -# AVX512 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX -# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX512 and lower variants -# 0 1 1 1 1 1 1 1 1 -> 0x0FF -> Enable AVX2 and lower variants -# 0 0 1 1 1 1 1 1 1 -> 0x07F -> Enable AVX and lower variants -# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants -# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants -# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants -# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants -# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants -# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX +# SSE4_2 AVX2 AVX SSE4_1 SSSE3 SSE3 SSE2 SSE MMX +# 1 1 1 1 1 1 1 1 1 -> 0x1FF -> Enable AVX2 and lower variants +# 1 0 1 1 1 1 1 1 1 -> 0x17F -> Enable AVX and lower variants +# 1 0 0 1 1 1 1 1 1 -> 0x13F -> Enable SSE4_2 and lower variants +# 0 0 0 1 1 1 1 1 1 -> 0x03F -> Enable SSE4_1 and lower variants +# 0 0 0 0 1 1 1 1 1 -> 0x01F -> Enable SSSE3 and lower variants +# 0 0 0 0 0 1 1 1 1 -> 0x00F -> Enable SSE3 and lower variants +# 0 0 0 0 0 0 1 1 1 -> 0x007 -> Enable SSE2 and lower variants +# 0 0 0 0 0 0 0 1 1 -> 0x003 -> Enable SSE and lower variants +# 0 0 0 0 0 0 0 0 1 -> 0x001 -> Enable MMX ## NOTE: In x86_64 platform, it is not possible to enable sse/mmx/c using "AOM_SIMD_CAPS_MASK" as # all x86_64 platforms implement sse2. av1_test_x86() { local arch=$1 - uname -m | grep -q "x86" - if [ $? -eq 1 ]; then + if ! uname -m | grep -q "x86"; then elog "Machine architecture is not x86 or x86_64" return 0 fi @@ -434,28 +472,31 @@ local target="x86-linux" local cmake_command="cmake \ $LIBAOM_SOURCE_DIR \ - -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/${target}.cmake" + -DCMAKE_TOOLCHAIN_FILE=${LIBAOM_SOURCE_DIR}/build/cmake/toolchains/i686-linux-gcc.cmake" elif [ $arch = "x86_64" ]; then local target="x86_64-linux" local cmake_command="cmake $LIBAOM_SOURCE_DIR" fi - local x86_isa_variants="avx2 avx sse4_1 ssse3 sse3 sse2" + # Available x86 isa variants: "avx2 avx sse4_2 sse4_1 ssse3 sse3 sse2" + local x86_isa_variants="avx2 sse4_2 sse2" echo "Build for x86: ${target}" - av1_enc_build "${target}" "${cmake_command}" + if ! av1_enc_build "${target}" "${cmake_command}"; then + return 1 + fi for preset in $PRESETS; do local encoder="$(av1_enc_tool_path "${target}" "${preset}")" for isa in $x86_isa_variants; do - has_x86_isa_extn $isa - if [ $? -eq 1 ]; then + # Note that if has_x86_isa_extn returns 1, it is false, and vice versa. + if ! has_x86_isa_extn $isa; then echo "${isa} is not supported in this machine" continue fi export AOM_SIMD_CAPS_MASK=$($isa) - av1_enc_test $encoder "${target}" "${preset}" - if [ $? -eq 1 ]; then + if ! av1_enc_test $encoder "${arch}" "${target}" "${preset}"; then + # Found a mismatch return 1 fi unset AOM_SIMD_CAPS_MASK @@ -464,23 +505,20 @@ } av1_test_arm() { + local arch="arm64" local target="arm64-linux-gcc" local cmake_command="cmake $LIBAOM_SOURCE_DIR \ -DCMAKE_TOOLCHAIN_FILE=$LIBAOM_SOURCE_DIR/build/cmake/toolchains/${target}.cmake \ -DCMAKE_C_FLAGS=-Wno-maybe-uninitialized" echo "Build for arm64: ${target}" - av1_enc_build "${target}" "${cmake_command}" + if ! 
av1_enc_build "${target}" "${cmake_command}"; then + return 1 + fi for preset in $PRESETS; do - # Enable armv8 test for real-time only - # TODO(BUG=aomedia:3486, BUG=aomedia:3474): Enable testing for 'good' preset - # after fixing C vs NEON mismatches. - if [ "${preset}" = "good" ]; then - continue - fi local encoder="$(av1_enc_tool_path "${target}" "${preset}")" - av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${target}" "${preset}" - if [ $? -eq 1 ]; then + if ! av1_enc_test "qemu-aarch64 -L /usr/aarch64-linux-gnu ${encoder}" "${arch}" "${target}" "${preset}"; then + # Found a mismatch return 1 fi done @@ -488,14 +526,15 @@ av1_c_vs_simd_enc_test () { # Test x86 (32 bit) + # x86 requires the i686-linux-gnu toolchain: + # $ sudo apt-get install g++-i686-linux-gnu echo "av1 test for x86 (32 bit): Started." # Encode 'C' only av1_test_generic "x86" - # Encode with SIMD optimizations enabled - av1_test_x86 "x86" - if [ $? -eq 1 ]; then + if ! av1_test_x86 "x86"; then echo "av1 test for x86 (32 bit): Done, test failed." + return 1 else echo "av1 test for x86 (32 bit): Done, all tests passed." fi @@ -506,9 +545,9 @@ # Encode 'C' only av1_test_generic "x86_64" # Encode with SIMD optimizations enabled - av1_test_x86 "x86_64" - if [ $? -eq 1 ]; then + if ! av1_test_x86 "x86_64"; then echo "av1 test for x86_64 (64 bit): Done, test failed." + return 1 else echo "av1 test for x86_64 (64 bit): Done, all tests passed." fi @@ -516,20 +555,12 @@ # Test ARM echo "av1_test_arm: Started." - av1_test_arm - if [ $? -eq 1 ]; then + if ! av1_test_arm; then echo "av1 test for arm: Done, test failed." + return 1 else echo "av1 test for arm: Done, all tests passed." fi } -# Setup a trap function to clean up build, and output files after tests complete. -trap cleanup EXIT - -av1_c_vs_simd_enc_verify_environment -if [ $? -eq 1 ]; then - echo "Environment check failed." 
- exit 1 -fi -av1_c_vs_simd_enc_test +run_tests av1_c_vs_simd_enc_verify_environment av1_c_vs_simd_enc_test diff -Nru aom-3.8.2/test/av1_convolve_test.cc aom-3.9.0/test/av1_convolve_test.cc --- aom-3.8.2/test/av1_convolve_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/av1_convolve_test.cc 2024-05-07 19:57:03.628000000 +0000 @@ -631,6 +631,11 @@ BuildHighbdParams(av1_highbd_convolve_x_sr_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1ConvolveXHighbdTest, + BuildHighbdParams(av1_highbd_convolve_x_sr_sve2)); +#endif + ///////////////////////////////////////////////////////////////// // Single reference convolve-x IntraBC functions (high bit-depth) ///////////////////////////////////////////////////////////////// @@ -998,6 +1003,11 @@ BuildHighbdParams(av1_highbd_convolve_y_sr_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1ConvolveYHighbdTest, + BuildHighbdParams(av1_highbd_convolve_y_sr_sve2)); +#endif + ///////////////////////////////////////////////////////////////// // Single reference convolve-y IntraBC functions (high bit-depth) ///////////////////////////////////////////////////////////////// @@ -1523,6 +1533,11 @@ BuildHighbdParams(av1_highbd_convolve_2d_sr_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P(SVE2, AV1Convolve2DHighbdTest, + BuildHighbdParams(av1_highbd_convolve_2d_sr_sve2)); +#endif + ////////////////////////////////////////////////////////////////// // Single reference convolve-2d IntraBC functions (high bit-depth) ////////////////////////////////////////////////////////////////// @@ -1943,6 +1958,12 @@ BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_neon)); #endif +#if HAVE_SVE2 +INSTANTIATE_TEST_SUITE_P( + SVE2, AV1ConvolveXHighbdCompoundTest, + BuildHighbdLumaParams(av1_highbd_dist_wtd_convolve_x_sve2)); +#endif + #endif // CONFIG_AV1_HIGHBITDEPTH //////////////////////////////////////////////// @@ -2312,11 +2333,6 @@ INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DCompoundTest, BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_c)); -#if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P(SSE2, AV1Convolve2DCompoundTest, - BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_sse2)); -#endif - #if HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P(SSSE3, AV1Convolve2DCompoundTest, BuildLowbdLumaParams(av1_dist_wtd_convolve_2d_ssse3)); diff -Nru aom-3.8.2/test/av1_fwd_txfm2d_test.cc aom-3.9.0/test/av1_fwd_txfm2d_test.cc --- aom-3.8.2/test/av1_fwd_txfm2d_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/av1_fwd_txfm2d_test.cc 2024-05-07 19:57:03.633000000 +0000 @@ -443,7 +443,7 @@ using ::testing::Values; using ::testing::ValuesIn; -#if HAVE_SSE2 +#if AOM_ARCH_X86 && HAVE_SSE2 static TX_SIZE fwd_txfm_for_sse2[] = { TX_4X4, TX_8X8, @@ -469,15 +469,14 @@ INSTANTIATE_TEST_SUITE_P(SSE2, AV1FwdTxfm2dTest, Combine(ValuesIn(fwd_txfm_for_sse2), Values(av1_lowbd_fwd_txfm_sse2))); -#endif // HAVE_SSE2 +#endif // AOM_ARCH_X86 && HAVE_SSE2 #if HAVE_SSE4_1 -static TX_SIZE fwd_txfm_for_sse41[] = { - TX_4X4, - TX_64X64, - TX_32X64, - TX_64X32, -}; +static TX_SIZE fwd_txfm_for_sse41[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, + TX_64X64, TX_4X8, TX_8X4, TX_8X16, + TX_16X8, TX_16X32, TX_32X16, TX_32X64, + TX_64X32, TX_4X16, TX_16X4, TX_8X32, + TX_32X8, TX_16X64, TX_64X16 }; INSTANTIATE_TEST_SUITE_P(SSE4_1, AV1FwdTxfm2dTest, Combine(ValuesIn(fwd_txfm_for_sse41), diff -Nru aom-3.8.2/test/av1_wedge_utils_test.cc aom-3.9.0/test/av1_wedge_utils_test.cc --- aom-3.8.2/test/av1_wedge_utils_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ 
aom-3.9.0/test/av1_wedge_utils_test.cc 2024-05-07 19:57:03.642000000 +0000 @@ -408,4 +408,16 @@ av1_wedge_compute_delta_squares_avx2))); #endif // HAVE_AVX2 +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, WedgeUtilsSSEOptTest, + ::testing::Values(TestFuncsFSSE(av1_wedge_sse_from_residuals_c, + av1_wedge_sse_from_residuals_sve))); + +INSTANTIATE_TEST_SUITE_P( + SVE, WedgeUtilsSignOptTest, + ::testing::Values(TestFuncsFSign(av1_wedge_sign_from_residuals_c, + av1_wedge_sign_from_residuals_sve))); +#endif // HAVE_SVE + } // namespace diff -Nru aom-3.8.2/test/avg_test.cc aom-3.9.0/test/avg_test.cc --- aom-3.8.2/test/avg_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/avg_test.cc 2024-05-07 19:57:03.642000000 +0000 @@ -1021,6 +1021,15 @@ make_tuple(5, &aom_vector_var_c, &aom_vector_var_neon))); #endif +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, VectorVarTest, + ::testing::Values(make_tuple(2, &aom_vector_var_c, &aom_vector_var_sve), + make_tuple(3, &aom_vector_var_c, &aom_vector_var_sve), + make_tuple(4, &aom_vector_var_c, &aom_vector_var_sve), + make_tuple(5, &aom_vector_var_c, &aom_vector_var_sve))); +#endif // HAVE_SVE + #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, VectorVarTest, diff -Nru aom-3.8.2/test/binary_codes_test.cc aom-3.9.0/test/binary_codes_test.cc --- aom-3.8.2/test/binary_codes_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/binary_codes_test.cc 2024-05-07 19:57:03.644000000 +0000 @@ -59,7 +59,7 @@ } } } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); GTEST_ASSERT_GE(aom_reader_tell(&br), 0u); diff -Nru aom-3.8.2/test/boolcoder_test.cc aom-3.9.0/test/boolcoder_test.cc --- aom-3.8.2/test/boolcoder_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/boolcoder_test.cc 2024-05-07 19:57:03.645000000 +0000 @@ -66,7 +66,7 @@ aom_write(&bw, bit, static_cast(probas[i])); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); @@ -100,7 +100,7 @@ for (int i = 0; i < kSymbols; i++) { aom_write(&bw, 0, p); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); uint32_t last_tell = aom_reader_tell(&br); @@ -146,7 +146,7 @@ for (int i = 0; i < kSymbols; i++) { aom_write(&bw, 1, p); } - aom_stop_encode(&bw); + GTEST_ASSERT_GE(aom_stop_encode(&bw), 0); aom_reader br; aom_reader_init(&br, bw_buffer, bw.pos); ASSERT_FALSE(aom_reader_has_overflowed(&br)); diff -Nru aom-3.8.2/test/cdef_test.cc aom-3.9.0/test/cdef_test.cc --- aom-3.8.2/test/cdef_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/cdef_test.cc 2024-05-07 19:57:03.646000000 +0000 @@ -441,10 +441,11 @@ constexpr int stride = MAX_CDEF_BLOCK; int error = 0; for (int k = 0; k < kIterations && !error; k++) { - // Generate a random value between 1 and 256, making sure height is even. - // Test once for very small values to avoid potential overflows. - const int width = k == 0 ? 2 : rnd_.Rand8() % 256 + 1; - const int height = k == 0 ? 2 : (rnd_.Rand8() % 128 + 1) * 2; + // This function operates on values of width that are either 4 or a + // multiple of 8. For height, generate a random value between 1 and 256, + // making sure it is even. + const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8; + const int height = k == 0 ? 
4 : (rnd_.Rand8() % 128 + 1) * 2; for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { src_[i * stride + j] = rnd_.Rand8(); @@ -524,10 +525,11 @@ constexpr int stride = MAX_CDEF_BLOCK; int error = 0; for (int k = 0; k < kIterations && !error; k++) { - // Generate a random value between 1 and 256, making sure height is even. - // Test once for very small values to avoid potential overflows. - const int width = k == 0 ? 2 : rnd_.Rand8() % 256 + 1; - const int height = k == 0 ? 2 : (rnd_.Rand8() % 128 + 1) * 2; + // This function operates on values of width that are either 4 or a + // multiple of 8. For height, generate a random value between 1 and 256, + // making sure it is even. + const int width = k == 0 ? 4 : (rnd_.Rand8() % 32 + 1) * 8; + const int height = k == 0 ? 4 : (rnd_.Rand8() % 128 + 1) * 2; for (int i = 0; i < height; i++) { for (int j = 0; j < width; j++) { src_[i * stride + j] = rnd_.Rand16(); @@ -612,7 +614,7 @@ using std::make_tuple; -#if (HAVE_SSE2 || HAVE_SSSE3 || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON) +#if ((AOM_ARCH_X86 && HAVE_SSSE3) || HAVE_SSE4_1 || HAVE_AVX2 || HAVE_NEON) static const CdefFilterBlockFunctions kCdefFilterFuncC[] = { { &cdef_filter_8_0_c, &cdef_filter_8_1_c, &cdef_filter_8_2_c, &cdef_filter_8_3_c } @@ -624,50 +626,7 @@ }; #endif -#if HAVE_SSE2 -static const CdefFilterBlockFunctions kCdefFilterFuncSse2[] = { - { &cdef_filter_8_0_sse2, &cdef_filter_8_1_sse2, &cdef_filter_8_2_sse2, - &cdef_filter_8_3_sse2 } -}; - -static const CdefFilterBlockFunctions kCdefFilterHighbdFuncSse2[] = { - { &cdef_filter_16_0_sse2, &cdef_filter_16_1_sse2, &cdef_filter_16_2_sse2, - &cdef_filter_16_3_sse2 } -}; - -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFBlockTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2), - ::testing::ValuesIn(kCdefFilterFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Values(8))); -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFBlockHighbdTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2), - ::testing::ValuesIn(kCdefFilterHighbdFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Range(10, 13, 2))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirTest, - ::testing::Values(make_tuple(&cdef_find_dir_sse2, - &cdef_find_dir_c))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualTest, - ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2, - &cdef_find_dir_dual_c))); - -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFCopyRect8to16Test, - ::testing::Values(make_tuple(&cdef_copy_rect8_8bit_to_16bit_c, - &cdef_copy_rect8_8bit_to_16bit_sse2))); - -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFCopyRect16to16Test, - ::testing::Values(make_tuple(&cdef_copy_rect8_16bit_to_16bit_c, - &cdef_copy_rect8_16bit_to_16bit_sse2))); -#endif - -#if HAVE_SSSE3 +#if AOM_ARCH_X86 && HAVE_SSSE3 static const CdefFilterBlockFunctions kCdefFilterFuncSsse3[] = { { &cdef_filter_8_0_ssse3, &cdef_filter_8_1_ssse3, &cdef_filter_8_2_ssse3, &cdef_filter_8_3_ssse3 } @@ -841,30 +800,7 @@ #endif // Test speed for all supported architectures -#if HAVE_SSE2 -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFSpeedTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSse2), - ::testing::ValuesIn(kCdefFilterFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Values(8))); -INSTANTIATE_TEST_SUITE_P( - SSE2, CDEFSpeedHighbdTest, - ::testing::Combine(::testing::ValuesIn(kCdefFilterHighbdFuncSse2), - 
::testing::ValuesIn(kCdefFilterHighbdFuncC), - ::testing::Values(BLOCK_4X4, BLOCK_4X8, BLOCK_8X4, - BLOCK_8X8), - ::testing::Range(0, 16), ::testing::Values(10))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirSpeedTest, - ::testing::Values(make_tuple(&cdef_find_dir_sse2, - &cdef_find_dir_c))); -INSTANTIATE_TEST_SUITE_P(SSE2, CDEFFindDirDualSpeedTest, - ::testing::Values(make_tuple(&cdef_find_dir_dual_sse2, - &cdef_find_dir_dual_c))); -#endif - -#if HAVE_SSSE3 +#if AOM_ARCH_X86 && HAVE_SSSE3 INSTANTIATE_TEST_SUITE_P( SSSE3, CDEFSpeedTest, ::testing::Combine(::testing::ValuesIn(kCdefFilterFuncSsse3), diff -Nru aom-3.8.2/test/cnn_test.cc aom-3.9.0/test/cnn_test.cc --- aom-3.8.2/test/cnn_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/cnn_test.cc 2024-05-07 19:57:03.648000000 +0000 @@ -2651,4 +2651,11 @@ &av1_cnn_convolve_no_maxpool_padding_valid_avx2))); #endif +#if HAVE_NEON +INSTANTIATE_TEST_SUITE_P(NEON, CNNConvolveTest, + ::testing::Values(CNNConvolveTestFuncs( + &av1_cnn_convolve_no_maxpool_padding_valid_c, + &av1_cnn_convolve_no_maxpool_padding_valid_neon))); +#endif + } // namespace diff -Nru aom-3.8.2/test/convolve_test.cc aom-3.9.0/test/convolve_test.cc --- aom-3.8.2/test/convolve_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/convolve_test.cc 2024-05-07 19:57:03.657000000 +0000 @@ -773,6 +773,17 @@ WRAP(convolve8_horiz_neon, 12) WRAP(convolve8_vert_neon, 12) #endif // HAVE_NEON + +#if HAVE_SVE +WRAP(convolve8_horiz_sve, 8) +WRAP(convolve8_vert_sve, 8) + +WRAP(convolve8_horiz_sve, 10) +WRAP(convolve8_vert_sve, 10) + +WRAP(convolve8_horiz_sve, 12) +WRAP(convolve8_vert_sve, 12) +#endif // HAVE_SVE #endif // CONFIG_AV1_HIGHBITDEPTH #undef WRAP @@ -832,12 +843,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, HighbdConvolveTest, ::testing::ValuesIn(kArrayHighbdConvolve_sse2)); #endif -const ConvolveFunctions convolve8_sse2(aom_convolve8_horiz_sse2, - aom_convolve8_vert_sse2, 0); -const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2) }; - -INSTANTIATE_TEST_SUITE_P(SSE2, LowbdConvolveTest, - ::testing::ValuesIn(kArrayConvolve_sse2)); #endif #if HAVE_SSSE3 @@ -919,4 +924,22 @@ ::testing::ValuesIn(kArray_Convolve8_neon_i8mm)); #endif // HAVE_NEON_I8MM +#if HAVE_SVE +#if CONFIG_AV1_HIGHBITDEPTH +const ConvolveFunctions wrap_convolve8_sve(wrap_convolve8_horiz_sve_8, + wrap_convolve8_vert_sve_8, 8); +const ConvolveFunctions wrap_convolve10_sve(wrap_convolve8_horiz_sve_10, + wrap_convolve8_vert_sve_10, 10); +const ConvolveFunctions wrap_convolve12_sve(wrap_convolve8_horiz_sve_12, + wrap_convolve8_vert_sve_12, 12); +const ConvolveParam kArray_HighbdConvolve8_sve[] = { + ALL_SIZES_64(wrap_convolve8_sve), ALL_SIZES_64(wrap_convolve10_sve), + ALL_SIZES_64(wrap_convolve12_sve) +}; + +INSTANTIATE_TEST_SUITE_P(SVE, HighbdConvolveTest, + ::testing::ValuesIn(kArray_HighbdConvolve8_sve)); +#endif +#endif // HAVE_SVE + } // namespace diff -Nru aom-3.8.2/test/corner_match_test.cc aom-3.9.0/test/corner_match_test.cc --- aom-3.8.2/test/corner_match_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/corner_match_test.cc 2024-05-07 19:57:03.658000000 +0000 @@ -27,13 +27,19 @@ using libaom_test::ACMRandom; -typedef double (*ComputeCrossCorrFunc)(const unsigned char *im1, int stride1, - int x1, int y1, const unsigned char *im2, - int stride2, int x2, int y2); +typedef bool (*ComputeMeanStddevFunc)(const unsigned char *frame, int stride, + int x, int y, double *mean, + double *one_over_stddev); +typedef double (*ComputeCorrFunc)(const unsigned char *frame1, int stride1, 
+ int x1, int y1, double mean1, + double one_over_stddev1, + const unsigned char *frame2, int stride2, + int x2, int y2, double mean2, + double one_over_stddev2); using std::make_tuple; using std::tuple; -typedef tuple CornerMatchParam; +typedef tuple CornerMatchParam; class AV1CornerMatchTest : public ::testing::TestWithParam { public: @@ -41,8 +47,11 @@ void SetUp() override; protected: - void RunCheckOutput(int run_times); - ComputeCrossCorrFunc target_func; + void GenerateInput(uint8_t *input1, uint8_t *input2, int w, int h, int mode); + void RunCheckOutput(); + void RunSpeedTest(); + ComputeMeanStddevFunc target_compute_mean_stddev_func; + ComputeCorrFunc target_compute_corr_func; libaom_test::ACMRandom rnd_; }; @@ -51,13 +60,87 @@ AV1CornerMatchTest::~AV1CornerMatchTest() = default; void AV1CornerMatchTest::SetUp() { rnd_.Reset(ACMRandom::DeterministicSeed()); - target_func = GET_PARAM(1); + target_compute_mean_stddev_func = GET_PARAM(1); + target_compute_corr_func = GET_PARAM(2); } -void AV1CornerMatchTest::RunCheckOutput(int run_times) { +void AV1CornerMatchTest::GenerateInput(uint8_t *input1, uint8_t *input2, int w, + int h, int mode) { + if (mode == 0) { + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + input1[i * w + j] = rnd_.Rand8(); + input2[i * w + j] = rnd_.Rand8(); + } + } else if (mode == 1) { + for (int i = 0; i < h; ++i) + for (int j = 0; j < w; ++j) { + int v = rnd_.Rand8(); + input1[i * w + j] = v; + input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15); + } + } +} + +void AV1CornerMatchTest::RunCheckOutput() { const int w = 128, h = 128; - const int num_iters = 10000; - int i, j; + const int num_iters = 1000; + + std::unique_ptr input1(new (std::nothrow) uint8_t[w * h]); + std::unique_ptr input2(new (std::nothrow) uint8_t[w * h]); + ASSERT_NE(input1, nullptr); + ASSERT_NE(input2, nullptr); + + // Test the two extreme cases: + // i) Random data, should have correlation close to 0 + // ii) Linearly related data + noise, should have correlation close to 1 + int mode = GET_PARAM(0); + GenerateInput(&input1[0], &input2[0], w, h, mode); + + for (int i = 0; i < num_iters; ++i) { + int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w + 1 - MATCH_SZ); + int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h + 1 - MATCH_SZ); + int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w + 1 - MATCH_SZ); + int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h + 1 - MATCH_SZ); + + double c_mean1, c_one_over_stddev1, c_mean2, c_one_over_stddev2; + bool c_valid1 = aom_compute_mean_stddev_c(input1.get(), w, x1, y1, &c_mean1, + &c_one_over_stddev1); + bool c_valid2 = aom_compute_mean_stddev_c(input2.get(), w, x2, y2, &c_mean2, + &c_one_over_stddev2); + + double simd_mean1, simd_one_over_stddev1, simd_mean2, simd_one_over_stddev2; + bool simd_valid1 = target_compute_mean_stddev_func( + input1.get(), w, x1, y1, &simd_mean1, &simd_one_over_stddev1); + bool simd_valid2 = target_compute_mean_stddev_func( + input2.get(), w, x2, y2, &simd_mean2, &simd_one_over_stddev2); + + // Run the correlation calculation even if one of the "valid" flags is + // false, i.e. if one of the patches doesn't have enough variance. This is + // safe because any potential division by 0 is caught in + // aom_compute_mean_stddev(), and one_over_stddev is set to 0 instead. + // This causes aom_compute_correlation() to return 0, without causing a + // division by 0. 
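A minimal sketch of how a caller combines the two halves of this interface, using only the signatures and behaviour exercised in this test (the wrapper name PatchCorrelation is illustrative, and the declarations are assumed to come from the headers this test already includes):

  // Compute per-patch statistics once, then reuse them for each candidate
  // pairing. A flat patch reports one_over_stddev == 0, and the correlation
  // call below then returns 0 rather than dividing by zero.
  static double PatchCorrelation(const unsigned char *frame1, int stride1,
                                 int x1, int y1, const unsigned char *frame2,
                                 int stride2, int x2, int y2) {
    double mean1, inv_stddev1, mean2, inv_stddev2;
    aom_compute_mean_stddev_c(frame1, stride1, x1, y1, &mean1, &inv_stddev1);
    aom_compute_mean_stddev_c(frame2, stride2, x2, y2, &mean2, &inv_stddev2);
    return aom_compute_correlation_c(frame1, stride1, x1, y1, mean1,
                                     inv_stddev1, frame2, stride2, x2, y2,
                                     mean2, inv_stddev2);
  }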
+ const double c_corr = aom_compute_correlation_c( + input1.get(), w, x1, y1, c_mean1, c_one_over_stddev1, input2.get(), w, + x2, y2, c_mean2, c_one_over_stddev2); + const double simd_corr = target_compute_corr_func( + input1.get(), w, x1, y1, c_mean1, c_one_over_stddev1, input2.get(), w, + x2, y2, c_mean2, c_one_over_stddev2); + + ASSERT_EQ(simd_valid1, c_valid1); + ASSERT_EQ(simd_valid2, c_valid2); + ASSERT_EQ(simd_mean1, c_mean1); + ASSERT_EQ(simd_one_over_stddev1, c_one_over_stddev1); + ASSERT_EQ(simd_mean2, c_mean2); + ASSERT_EQ(simd_one_over_stddev2, c_one_over_stddev2); + ASSERT_EQ(simd_corr, c_corr); + } +} + +void AV1CornerMatchTest::RunSpeedTest() { + const int w = 16, h = 16; + const int num_iters = 1000000; aom_usec_timer ref_timer, test_timer; std::unique_ptr input1(new (std::nothrow) uint8_t[w * h]); @@ -69,76 +152,82 @@ // i) Random data, should have correlation close to 0 // ii) Linearly related data + noise, should have correlation close to 1 int mode = GET_PARAM(0); - if (mode == 0) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - input1[i * w + j] = rnd_.Rand8(); - input2[i * w + j] = rnd_.Rand8(); - } - } else if (mode == 1) { - for (i = 0; i < h; ++i) - for (j = 0; j < w; ++j) { - int v = rnd_.Rand8(); - input1[i * w + j] = v; - input2[i * w + j] = (v / 2) + (rnd_.Rand8() & 15); - } + GenerateInput(&input1[0], &input2[0], w, h, mode); + + // Time aom_compute_mean_stddev() + double c_mean1, c_one_over_stddev1, c_mean2, c_one_over_stddev2; + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; i++) { + aom_compute_mean_stddev_c(input1.get(), w, 0, 0, &c_mean1, + &c_one_over_stddev1); + aom_compute_mean_stddev_c(input2.get(), w, 0, 0, &c_mean2, + &c_one_over_stddev2); } + aom_usec_timer_mark(&ref_timer); + int elapsed_time_c = static_cast(aom_usec_timer_elapsed(&ref_timer)); - for (i = 0; i < num_iters; ++i) { - int x1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2); - int y1 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2); - int x2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(w - 2 * MATCH_SZ_BY2); - int y2 = MATCH_SZ_BY2 + rnd_.PseudoUniform(h - 2 * MATCH_SZ_BY2); - - double res_c = av1_compute_cross_correlation_c(input1.get(), w, x1, y1, - input2.get(), w, x2, y2); - double res_simd = - target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2); - - if (run_times > 1) { - aom_usec_timer_start(&ref_timer); - for (j = 0; j < run_times; j++) { - av1_compute_cross_correlation_c(input1.get(), w, x1, y1, input2.get(), - w, x2, y2); - } - aom_usec_timer_mark(&ref_timer); - const int elapsed_time_c = - static_cast(aom_usec_timer_elapsed(&ref_timer)); - - aom_usec_timer_start(&test_timer); - for (j = 0; j < run_times; j++) { - target_func(input1.get(), w, x1, y1, input2.get(), w, x2, y2); - } - aom_usec_timer_mark(&test_timer); - const int elapsed_time_simd = - static_cast(aom_usec_timer_elapsed(&test_timer)); - - printf( - "c_time=%d \t simd_time=%d \t " - "gain=%d\n", - elapsed_time_c, elapsed_time_simd, - (elapsed_time_c / elapsed_time_simd)); - } else { - ASSERT_EQ(res_simd, res_c); - } + double simd_mean1, simd_one_over_stddev1, simd_mean2, simd_one_over_stddev2; + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; i++) { + target_compute_mean_stddev_func(input1.get(), w, 0, 0, &simd_mean1, + &simd_one_over_stddev1); + target_compute_mean_stddev_func(input2.get(), w, 0, 0, &simd_mean2, + &simd_one_over_stddev2); } + aom_usec_timer_mark(&test_timer); + int elapsed_time_simd = 
static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "aom_compute_mean_stddev(): c_time=%6d simd_time=%6d " + "gain=%.3f\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / (double)elapsed_time_simd)); + + // Time aom_compute_correlation + aom_usec_timer_start(&ref_timer); + for (int i = 0; i < num_iters; i++) { + aom_compute_correlation_c(input1.get(), w, 0, 0, c_mean1, + c_one_over_stddev1, input2.get(), w, 0, 0, + c_mean2, c_one_over_stddev2); + } + aom_usec_timer_mark(&ref_timer); + elapsed_time_c = static_cast(aom_usec_timer_elapsed(&ref_timer)); + + aom_usec_timer_start(&test_timer); + for (int i = 0; i < num_iters; i++) { + target_compute_corr_func(input1.get(), w, 0, 0, c_mean1, c_one_over_stddev1, + input2.get(), w, 0, 0, c_mean2, + c_one_over_stddev2); + } + aom_usec_timer_mark(&test_timer); + elapsed_time_simd = static_cast(aom_usec_timer_elapsed(&test_timer)); + + printf( + "aom_compute_correlation(): c_time=%6d simd_time=%6d " + "gain=%.3f\n", + elapsed_time_c, elapsed_time_simd, + (elapsed_time_c / (double)elapsed_time_simd)); } -TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(1); } -TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunCheckOutput(100000); } +TEST_P(AV1CornerMatchTest, CheckOutput) { RunCheckOutput(); } +TEST_P(AV1CornerMatchTest, DISABLED_Speed) { RunSpeedTest(); } #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, AV1CornerMatchTest, - ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_sse4_1), - make_tuple(1, &av1_compute_cross_correlation_sse4_1))); + ::testing::Values(make_tuple(0, &aom_compute_mean_stddev_sse4_1, + &aom_compute_correlation_sse4_1), + make_tuple(1, &aom_compute_mean_stddev_sse4_1, + &aom_compute_correlation_sse4_1))); #endif #if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( AVX2, AV1CornerMatchTest, - ::testing::Values(make_tuple(0, &av1_compute_cross_correlation_avx2), - make_tuple(1, &av1_compute_cross_correlation_avx2))); + ::testing::Values(make_tuple(0, &aom_compute_mean_stddev_avx2, + &aom_compute_correlation_avx2), + make_tuple(1, &aom_compute_mean_stddev_avx2, + &aom_compute_correlation_avx2))); #endif } // namespace AV1CornerMatch diff -Nru aom-3.8.2/test/disflow_test.cc aom-3.9.0/test/disflow_test.cc --- aom-3.8.2/test/disflow_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/disflow_test.cc 2024-05-07 19:57:03.663000000 +0000 @@ -114,6 +114,11 @@ ::testing::Values(aom_compute_flow_at_point_sse4_1)); #endif +#if HAVE_AVX2 +INSTANTIATE_TEST_SUITE_P(AVX2, ComputeFlowTest, + ::testing::Values(aom_compute_flow_at_point_avx2)); +#endif + #if HAVE_NEON INSTANTIATE_TEST_SUITE_P(NEON, ComputeFlowTest, ::testing::Values(aom_compute_flow_at_point_neon)); diff -Nru aom-3.8.2/test/dr_prediction_test.cc aom-3.9.0/test/dr_prediction_test.cc --- aom-3.8.2/test/dr_prediction_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/dr_prediction_test.cc 2024-05-07 19:57:03.663000000 +0000 @@ -10,6 +10,7 @@ */ #include +#include #include "third_party/googletest/src/googletest/include/gtest/gtest.h" @@ -18,6 +19,7 @@ #include "aom_mem/aom_mem.h" #include "aom_ports/aom_timer.h" +#include "aom_ports/sanitizer.h" #include "av1/common/blockd.h" #include "av1/common/pred_common.h" #include "av1/common/reconintra.h" @@ -149,8 +151,6 @@ protected: static const int kMaxNumTests = 10000; static const int kIterations = 10; - static const int kDstStride = 64; - static const int kDstSize = kDstStride * kDstStride; static const int kOffset = 16; static const int kBufSize = ((2 * MAX_TX_SIZE) << 1) + 16; @@ -161,9 
+161,6 @@ start_angle_ = params_.start_angle; stop_angle_ = start_angle_ + 90; - dst_ref_ = &dst_ref_data_[0]; - dst_tst_ = &dst_tst_data_[0]; - dst_stride_ = kDstStride; above_ = &above_data_[kOffset]; left_ = &left_data_[kOffset]; @@ -171,16 +168,12 @@ above_data_[i] = rng_.Rand8(); left_data_[i] = rng_.Rand8(); } - - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = 0; - } } ~DrPredTest() override = default; - void Predict(bool speedtest, int tx) { + void Predict(bool speedtest, int tx, Pixel *dst_ref, Pixel *dst_tst, + int dst_stride) { const int kNumTests = speedtest ? kMaxNumTests : 1; aom_usec_timer timer; int tst_time = 0; @@ -189,7 +182,7 @@ aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { - params_.ref_fn(dst_ref_, dst_stride_, bw_, bh_, above_, left_, + params_.ref_fn(dst_ref, dst_stride, bw_, bh_, above_, left_, upsample_above_, upsample_left_, dx_, dy_, bd_); } aom_usec_timer_mark(&timer); @@ -198,15 +191,17 @@ if (params_.tst_fn) { aom_usec_timer_start(&timer); for (int k = 0; k < kNumTests; ++k) { - API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst_, dst_stride_, bw_, bh_, + API_REGISTER_STATE_CHECK(params_.tst_fn(dst_tst, dst_stride, bw_, bh_, above_, left_, upsample_above_, upsample_left_, dx_, dy_, bd_)); } aom_usec_timer_mark(&timer); tst_time = static_cast(aom_usec_timer_elapsed(&timer)); } else { - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = dst_tst_[i]; + for (int r = 0; r < bh_; ++r) { + for (int c = 0; c < bw_; ++c) { + dst_tst[r * dst_stride + c] = dst_ref[r * dst_stride + c]; + } } } @@ -222,18 +217,6 @@ } } for (int tx = 0; tx < TX_SIZES_ALL; ++tx) { - if (params_.tst_fn == nullptr) { - for (int i = 0; i < kDstSize; ++i) { - dst_tst_[i] = (1 << bd_) - 1; - dst_ref_[i] = (1 << bd_) - 1; - } - } else { - for (int i = 0; i < kDstSize; ++i) { - dst_ref_[i] = 0; - dst_tst_[i] = 0; - } - } - bw_ = tx_size_wide[kTxSize[tx]]; bh_ = tx_size_high[kTxSize[tx]]; @@ -246,12 +229,31 @@ upsample_above_ = upsample_left_ = 0; } - Predict(speedtest, tx); + // Add additional padding to allow detection of over reads/writes when + // the transform width is equal to MAX_TX_SIZE. 
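A minimal sketch of the poisoning pattern the code below applies, with buf, stride, width and height as placeholder names and assuming an AddressSanitizer build (the macros are no-ops otherwise):

  // Poison the tail of each row so any access past the block width is
  // reported, run the predictor under test, then unpoison before comparing.
  for (int r = 0; r < height; ++r) {
    ASAN_POISON_MEMORY_REGION(&buf[r * stride + width],
                              (stride - width) * sizeof(buf[0]));
  }
  // ... call the prediction function under test ...
  for (int r = 0; r < height; ++r) {
    ASAN_UNPOISON_MEMORY_REGION(&buf[r * stride + width],
                                (stride - width) * sizeof(buf[0]));
  }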
+ const int dst_stride = MAX_TX_SIZE + 16; + std::vector dst_ref(dst_stride * bh_); + std::vector dst_tst(dst_stride * bh_); + + for (int r = 0; r < bh_; ++r) { + ASAN_POISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + ASAN_POISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + } + + Predict(speedtest, tx, dst_ref.data(), dst_tst.data(), dst_stride); + + for (int r = 0; r < bh_; ++r) { + ASAN_UNPOISON_MEMORY_REGION(&dst_ref[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + ASAN_UNPOISON_MEMORY_REGION(&dst_tst[r * dst_stride + bw_], + (dst_stride - bw_) * sizeof(Pixel)); + } for (int r = 0; r < bh_; ++r) { for (int c = 0; c < bw_; ++c) { - ASSERT_EQ(dst_ref_[r * dst_stride_ + c], - dst_tst_[r * dst_stride_ + c]) + ASSERT_EQ(dst_ref[r * dst_stride + c], dst_tst[r * dst_stride + c]) << bw_ << "x" << bh_ << " r: " << r << " c: " << c << " dx: " << dx_ << " dy: " << dy_ << " upsample_above: " << upsample_above_ @@ -292,18 +294,12 @@ } } - Pixel dst_ref_data_[kDstSize]; - Pixel dst_tst_data_[kDstSize]; - Pixel left_data_[kBufSize]; Pixel dummy_data_[kBufSize]; Pixel above_data_[kBufSize]; - Pixel *dst_ref_; - Pixel *dst_tst_; Pixel *above_; Pixel *left_; - int dst_stride_; int enable_upsample_; int upsample_above_; @@ -386,6 +382,33 @@ TEST_P(LowbdDrPredTest, DISABLED_Speed) { RundrPredTest(1); } +#if CONFIG_AV1_HIGHBITDEPTH +TEST_P(HighbdDrPredTest, OperationCheck) { + if (params_.tst_fn == nullptr) return; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int angle = start_angle_; angle < stop_angle_; angle++) { + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + if (dx_ && dy_) RunTest(false, false, angle); + } + } +} + +TEST_P(HighbdDrPredTest, DISABLED_Speed) { + const int angles[] = { 3, 45, 87 }; + for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { + for (int i = 0; i < 3; ++i) { + int angle = angles[i] + start_angle_; + dx_ = av1_get_dx(angle); + dy_ = av1_get_dy(angle); + printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", + enable_upsample_, angle); + if (dx_ && dy_) RunTest(true, false, angle); + } + } +} +#endif // CONFIG_AV1_HIGHBITDEPTH + #if HAVE_SSE4_1 INSTANTIATE_TEST_SUITE_P( SSE4_1, LowbdDrPredTest, @@ -453,32 +476,6 @@ &z3_wrapper_hbd, &z3_wrapper_hbd, AOM_BITS_12, kZ3Start))); - -TEST_P(HighbdDrPredTest, DISABLED_Speed) { - const int angles[] = { 3, 45, 87 }; - for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { - for (int i = 0; i < 3; ++i) { - int angle = angles[i] + start_angle_; - dx_ = av1_get_dx(angle); - dy_ = av1_get_dy(angle); - printf("enable_upsample: %d angle: %d ~~~~~~~~~~~~~~~\n", - enable_upsample_, angle); - if (dx_ && dy_) RunTest(true, false, angle); - } - } -} - -TEST_P(HighbdDrPredTest, OperationCheck) { - if (params_.tst_fn == nullptr) return; - // const int angles[] = { 3, 45, 81, 87, 93, 100, 145, 187, 199, 260 }; - for (enable_upsample_ = 0; enable_upsample_ < 2; ++enable_upsample_) { - for (int angle = start_angle_; angle < stop_angle_; angle++) { - dx_ = av1_get_dx(angle); - dy_ = av1_get_dy(angle); - if (dx_ && dy_) RunTest(false, false, angle); - } - } -} #endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_AVX2 @@ -495,6 +492,47 @@ &z3_wrapper, AOM_BITS_8, kZ3Start))); +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + NEON, HighbdDrPredTest, + ::testing::Values(DrPredFunc( + &z1_wrapper_hbd, + &z1_wrapper_hbd, + AOM_BITS_8, kZ1Start), + DrPredFunc( + &z1_wrapper_hbd, + 
&z1_wrapper_hbd, + AOM_BITS_10, kZ1Start), + DrPredFunc( + &z1_wrapper_hbd, + &z1_wrapper_hbd, + AOM_BITS_12, kZ1Start), + DrPredFunc( + &z2_wrapper_hbd, + &z2_wrapper_hbd, + AOM_BITS_8, kZ2Start), + DrPredFunc( + &z2_wrapper_hbd, + &z2_wrapper_hbd, + AOM_BITS_10, kZ2Start), + DrPredFunc( + &z2_wrapper_hbd, + &z2_wrapper_hbd, + AOM_BITS_12, kZ2Start), + DrPredFunc( + &z3_wrapper_hbd, + &z3_wrapper_hbd, + AOM_BITS_8, kZ3Start), + DrPredFunc( + &z3_wrapper_hbd, + &z3_wrapper_hbd, + AOM_BITS_10, kZ3Start), + DrPredFunc( + &z3_wrapper_hbd, + &z3_wrapper_hbd, + AOM_BITS_12, kZ3Start))); +#endif // CONFIG_AV1_HIGHBITDEPTH + #endif // HAVE_NEON } // namespace diff -Nru aom-3.8.2/test/ec_test.cc aom-3.9.0/test/ec_test.cc --- aom-3.8.2/test/ec_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/ec_test.cc 2024-05-07 19:57:03.667000000 +0000 @@ -78,6 +78,7 @@ tell[j + 1] = od_ec_enc_tell_frac(&enc); } ptr = od_ec_enc_done(&enc, &ptr_sz); + ASSERT_NE(ptr, nullptr); EXPECT_GE(((od_ec_enc_tell(&enc) + 7U) >> 3), ptr_sz) << "od_ec_enc_tell() lied: " "there's " @@ -143,6 +144,7 @@ od_ec_enc_patch_initial_bits(&enc, 0, 2); EXPECT_FALSE(enc.error) << "od_ec_enc_patch_initial_bits() failed.\n"; ptr = od_ec_enc_done(&enc, &ptr_sz); + ASSERT_NE(ptr, nullptr); EXPECT_EQ(ptr_sz, 2u); EXPECT_EQ(ptr[0], 63) << "Got " << ptr[0] diff -Nru aom-3.8.2/test/encode_api_test.cc aom-3.9.0/test/encode_api_test.cc --- aom-3.8.2/test/encode_api_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/encode_api_test.cc 2024-05-07 19:57:03.667000000 +0000 @@ -10,6 +10,8 @@ */ #include +#include +#include #include #include #include @@ -556,6 +558,83 @@ encoder.Encode(false); } +TEST(EncodeAPI, PtsSmallerThanInitialPts) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1280; + cfg.g_h = 720; + cfg.rc_target_bitrate = 1000; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(aom_codec_encode(&enc, image, 12, 1, 0), AOM_CODEC_OK); + ASSERT_EQ(aom_codec_encode(&enc, image, 13, 1, 0), AOM_CODEC_OK); + // pts (10) is smaller than the initial pts (12). + ASSERT_EQ(aom_codec_encode(&enc, image, 10, 1, 0), AOM_CODEC_INVALID_PARAM); + + // Free resources. + aom_img_free(image); + aom_codec_destroy(&enc); +} + +TEST(EncodeAPI, PtsOrDurationTooBig) { + // Initialize libaom encoder. + aom_codec_iface_t *const iface = aom_codec_av1_cx(); + aom_codec_ctx_t enc; + aom_codec_enc_cfg_t cfg; + + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_REALTIME), + AOM_CODEC_OK); + + cfg.g_w = 1280; + cfg.g_h = 720; + cfg.rc_target_bitrate = 1000; + + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + // Create input image. + aom_image_t *const image = + CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frame. + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + // pts, when converted to ticks, is too big. + ASSERT_EQ(aom_codec_encode(&enc, image, INT64_MAX / 1000000 + 1, 1, 0), + AOM_CODEC_INVALID_PARAM); +#if ULONG_MAX > INT64_MAX + // duration is too big. 
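To make the constants above concrete (inferring only from the values used in this test): a pts of INT64_MAX / 1000000 + 1 is rejected because converting it to the encoder's internal tick rate would overflow a 64-bit signed integer, and the assertions that follow probe the same overflow paths for an oversized duration (1ul << 63) and for pts + duration.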
+ ASSERT_EQ(aom_codec_encode(&enc, image, 0, (1ul << 63), 0), + AOM_CODEC_INVALID_PARAM); + // pts + duration is too big. + ASSERT_EQ(aom_codec_encode(&enc, image, 1, INT64_MAX, 0), + AOM_CODEC_INVALID_PARAM); +#endif + // pts + duration, when converted to ticks, is too big. +#if ULONG_MAX > INT64_MAX + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 0x1c0a0a1a3232, 0), + AOM_CODEC_INVALID_PARAM); +#endif + ASSERT_EQ(aom_codec_encode(&enc, image, INT64_MAX / 1000000, 1, 0), + AOM_CODEC_INVALID_PARAM); + + // Free resources. + aom_img_free(image); + aom_codec_destroy(&enc); +} + class EncodeAPIParameterized : public testing::TestWithParam> {}; @@ -655,6 +734,32 @@ EXPECT_EQ(AOM_CODEC_INVALID_PARAM, aom_codec_enc_init(&enc, iface, &cfg, 0)); } +TEST(EncodeAPI, AllIntraAndUsePsnr) { + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, AOM_USAGE_ALL_INTRA), + AOM_CODEC_OK); + + aom_codec_ctx_t enc; + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, AOM_CODEC_USE_PSNR), + AOM_CODEC_OK); + + aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + ASSERT_EQ(aom_codec_encode(&enc, image, 0, 1, 0), AOM_CODEC_OK); + const aom_codec_cx_pkt_t *pkt; + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + if (pkt->kind != AOM_CODEC_CX_FRAME_PKT) { + ASSERT_EQ(pkt->kind, AOM_CODEC_PSNR_PKT); + } + } + + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + // A test that reproduces bug aomedia:3534. TEST(EncodeAPI, AllIntraAndNoRefLast) { aom_codec_iface_t *iface = aom_codec_av1_cx(); diff -Nru aom-3.8.2/test/error_block_test.cc aom-3.9.0/test/error_block_test.cc --- aom-3.8.2/test/error_block_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/error_block_test.cc 2024-05-07 19:57:03.672000000 +0000 @@ -245,7 +245,7 @@ using std::make_tuple; -#if (HAVE_SSE2) +#if HAVE_SSE2 const ErrorBlockParam kErrorBlockTestParamsSse2[] = { #if CONFIG_AV1_HIGHBITDEPTH make_tuple(&av1_highbd_block_error_sse2, &av1_highbd_block_error_c, @@ -265,7 +265,7 @@ ::testing::ValuesIn(kErrorBlockTestParamsSse2)); #endif // HAVE_SSE2 -#if (HAVE_AVX2) +#if HAVE_AVX2 const ErrorBlockParam kErrorBlockTestParamsAvx2[] = { #if CONFIG_AV1_HIGHBITDEPTH make_tuple(&av1_highbd_block_error_avx2, &av1_highbd_block_error_c, @@ -285,7 +285,7 @@ ::testing::ValuesIn(kErrorBlockTestParamsAvx2)); #endif // HAVE_AVX2 -#if (HAVE_NEON) +#if HAVE_NEON const ErrorBlockParam kErrorBlockTestParamsNeon[] = { #if CONFIG_AV1_HIGHBITDEPTH make_tuple(&av1_highbd_block_error_neon, &av1_highbd_block_error_c, @@ -304,4 +304,16 @@ INSTANTIATE_TEST_SUITE_P(NEON, ErrorBlockTest, ::testing::ValuesIn(kErrorBlockTestParamsNeon)); #endif // HAVE_NEON + +#if HAVE_SVE +const ErrorBlockParam kErrorBlockTestParamsSVE[] = { + make_tuple(&BlockError8BitWrapper, + &BlockError8BitWrapper, AOM_BITS_8), + make_tuple(&BlockErrorLpWrapper, + &BlockErrorLpWrapper, AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(SVE, ErrorBlockTest, + ::testing::ValuesIn(kErrorBlockTestParamsSVE)); +#endif // HAVE_SVE } // namespace diff -Nru aom-3.8.2/test/examples.sh aom-3.9.0/test/examples.sh --- aom-3.8.2/test/examples.sh 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/examples.sh 2024-05-07 19:57:03.674000000 +0000 @@ -10,12 +10,13 @@ ## ## This file runs all of the tests for the libaom examples. ## +readonly EXEC_DIR="$(pwd)" . 
$(dirname $0)/tools_common.sh example_tests=$(ls -r $(dirname $0)/*.sh) # List of script names to exclude. -exclude_list="best_encode examples run_encodes tools_common av1_c_vs_simd_encode" +exclude_list="best_encode examples run_encodes tools_common" if [ "$(realtime_only_build)" = "yes" ]; then exclude_list="${exclude_list} twopass_encoder simple_decoder lightfield_test" @@ -30,4 +31,7 @@ # Source each test script so that exporting variables can be avoided. AOM_TEST_NAME="$(basename ${test%.*})" . "${test}" + # Restore the working directory to the one at the beginning of execution. + # This avoids side-effects from tests that change the directory. + cd "${EXEC_DIR}" done diff -Nru aom-3.8.2/test/hbd_metrics_test.cc aom-3.9.0/test/hbd_metrics_test.cc --- aom-3.8.2/test/hbd_metrics_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/hbd_metrics_test.cc 2024-05-07 19:57:03.685000000 +0000 @@ -112,10 +112,10 @@ memset(&hbd_src, 0, sizeof(hbd_src)); memset(&hbd_dst, 0, sizeof(hbd_dst)); - aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16, 0, 0); - aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16, 0, 0); - aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16, 0, 0); - aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16, 0, 0); + aom_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16, false, 0); + aom_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16, false, 0); + aom_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16, false, 0); + aom_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16, false, 0); memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz); while (i < lbd_src.buffer_alloc_sz) { diff -Nru aom-3.8.2/test/kf_test.cc aom-3.9.0/test/kf_test.cc --- aom-3.8.2/test/kf_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/kf_test.cc 2024-05-07 19:57:03.689000000 +0000 @@ -9,9 +9,14 @@ * PATENTS file, you can obtain it at www.aomedia.org/license/patent. */ +#include + #include #include "aom/aom_codec.h" +#include "aom/aom_encoder.h" +#include "aom/aom_image.h" +#include "aom/aomcx.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" @@ -21,6 +26,87 @@ #define NUM_LAG_VALUES 3 namespace { +aom_image_t *CreateGrayImage(aom_img_fmt_t fmt, unsigned int w, + unsigned int h) { + aom_image_t *const image = aom_img_alloc(nullptr, fmt, w, h, 1); + if (!image) return image; + + for (unsigned int i = 0; i < image->d_h; ++i) { + memset(image->planes[0] + i * image->stride[0], 128, image->d_w); + } + const unsigned int uv_h = (image->d_h + 1) / 2; + const unsigned int uv_w = (image->d_w + 1) / 2; + for (unsigned int i = 0; i < uv_h; ++i) { + memset(image->planes[1] + i * image->stride[1], 128, uv_w); + memset(image->planes[2] + i * image->stride[2], 128, uv_w); + } + return image; +} + +// Tests kf_max_dist in one-pass encoding with zero lag. 
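As a concrete illustration of what the helper below verifies: with kf_max_dist = 4 it encodes 3 * 4 + 1 = 13 frames and expects AOM_FRAME_IS_KEY on display indices 0, 4, 8 and 12 only, while kf_max_dist values of 0 or 1 make every one of the 4 encoded frames a key frame.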
+void TestKeyFrameMaximumInterval(unsigned int usage, unsigned int kf_max_dist) { + aom_codec_iface_t *iface = aom_codec_av1_cx(); + aom_codec_enc_cfg_t cfg; + ASSERT_EQ(aom_codec_enc_config_default(iface, &cfg, usage), AOM_CODEC_OK); + cfg.g_w = 320; + cfg.g_h = 240; + cfg.g_pass = AOM_RC_ONE_PASS; + cfg.g_lag_in_frames = 0; + cfg.kf_mode = AOM_KF_AUTO; + cfg.kf_min_dist = 0; + cfg.kf_max_dist = kf_max_dist; + + aom_codec_ctx_t enc; + ASSERT_EQ(aom_codec_enc_init(&enc, iface, &cfg, 0), AOM_CODEC_OK); + + ASSERT_EQ(aom_codec_control(&enc, AOME_SET_CPUUSED, 6), AOM_CODEC_OK); + + aom_image_t *image = CreateGrayImage(AOM_IMG_FMT_I420, cfg.g_w, cfg.g_h); + ASSERT_NE(image, nullptr); + + // Encode frames. + const aom_codec_cx_pkt_t *pkt; + const unsigned int num_frames = kf_max_dist == 0 ? 4 : 3 * kf_max_dist + 1; + for (unsigned int i = 0; i < num_frames; ++i) { + ASSERT_EQ(aom_codec_encode(&enc, image, i, 1, 0), AOM_CODEC_OK); + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + if (kf_max_dist == 0 || i % kf_max_dist == 0) { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, AOM_FRAME_IS_KEY); + } else { + ASSERT_EQ(pkt->data.frame.flags & AOM_FRAME_IS_KEY, 0u); + } + } + } + + // Flush the encoder. + bool got_data; + do { + ASSERT_EQ(aom_codec_encode(&enc, nullptr, 0, 1, 0), AOM_CODEC_OK); + got_data = false; + aom_codec_iter_t iter = nullptr; + while ((pkt = aom_codec_get_cx_data(&enc, &iter)) != nullptr) { + ASSERT_EQ(pkt->kind, AOM_CODEC_CX_FRAME_PKT); + got_data = true; + } + } while (got_data); + + aom_img_free(image); + ASSERT_EQ(aom_codec_destroy(&enc), AOM_CODEC_OK); +} + +TEST(KeyFrameIntervalTest, KeyFrameMaximumInterval) { + for (unsigned int usage : { AOM_USAGE_GOOD_QUALITY, AOM_USAGE_REALTIME }) { + // Test 0 and 1 (both mean all intra), some powers of 2, some multiples of + // 10, and some prime numbers. + for (unsigned int kf_max_dist : + { 0, 1, 2, 3, 4, 7, 10, 13, 16, 20, 23, 29, 32 }) { + TestKeyFrameMaximumInterval(usage, kf_max_dist); + } + } +} + typedef struct { const unsigned int min_kf_dist; const unsigned int max_kf_dist; diff -Nru aom-3.8.2/test/level_test.cc aom-3.9.0/test/level_test.cc --- aom-3.8.2/test/level_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/level_test.cc 2024-05-07 19:57:03.690000000 +0000 @@ -135,12 +135,12 @@ // To save run time, we only test speed 4. if (cpu_used_ == 4) { libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 40); + 30, 1, 0, 30); target_level_ = kLevelKeepStats; cfg_.rc_target_bitrate = 1000; - cfg_.g_limit = 40; + cfg_.g_limit = 30; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(level_[0], 0); + ASSERT_LE(level_[0], 0); } } @@ -148,12 +148,12 @@ // To save run time, we only test speed 4. 
if (cpu_used_ == 4) { libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, - 30, 1, 0, 40); + 30, 1, 0, 30); target_level_ = kLevelKeepStats; cfg_.rc_target_bitrate = 4000; - cfg_.g_limit = 40; + cfg_.g_limit = 30; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(level_[0], 4); + ASSERT_LE(level_[0], 4); } } @@ -166,7 +166,7 @@ target_level_ = target_level; cfg_.rc_target_bitrate = 4000; ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); - ASSERT_EQ(level_[0], target_level); + ASSERT_LE(level_[0], target_level); } } diff -Nru aom-3.8.2/test/pickrst_test.cc aom-3.9.0/test/pickrst_test.cc --- aom-3.8.2/test/pickrst_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/pickrst_test.cc 2024-05-07 19:57:03.701000000 +0000 @@ -363,6 +363,12 @@ ::testing::Values(av1_highbd_pixel_proj_error_avx2)); #endif // HAVE_AVX2 +#if HAVE_NEON + +INSTANTIATE_TEST_SUITE_P(NEON, PixelProjHighbdErrorTest, + ::testing::Values(av1_highbd_pixel_proj_error_neon)); +#endif // HAVE_NEON + } // namespace pickrst_test_highbd #endif // CONFIG_AV1_HIGHBITDEPTH diff -Nru aom-3.8.2/test/resize_test.cc aom-3.9.0/test/resize_test.cc --- aom-3.8.2/test/resize_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/resize_test.cc 2024-05-07 19:57:03.705000000 +0000 @@ -11,15 +11,16 @@ #include #include + +#include "aom/aomcx.h" #include "aom_dsp/aom_dsp_common.h" -#include "common/tools_common.h" #include "av1/encoder/encoder.h" #include "third_party/googletest/src/googletest/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" -#include "test/video_source.h" #include "test/util.h" +#include "test/video_source.h" #include "test/y4m_video_source.h" // Enable(1) or Disable(0) writing of the compressed bitstream. @@ -403,7 +404,7 @@ ResizeRealtimeTest() : EncoderTest(GET_PARAM(0)), num_threads_(GET_PARAM(3)), set_scale_mode_(false), set_scale_mode2_(false), - set_scale_mode3_(false) {} + set_scale_mode3_(false), is_screen_(false) {} ~ResizeRealtimeTest() override = default; void PreEncodeFrameHook(libaom_test::VideoSource *video, @@ -415,6 +416,8 @@ encoder->Control(AV1E_SET_ENABLE_OBMC, 0); encoder->Control(AOME_SET_CPUUSED, set_cpu_used_); encoder->Control(AV1E_SET_FRAME_PARALLEL_DECODING, 1); + if (is_screen_) + encoder->Control(AV1E_SET_TUNE_CONTENT, AOM_CONTENT_SCREEN); } if (set_scale_mode_) { struct aom_scaling_mode mode; @@ -508,6 +511,7 @@ bool set_scale_mode_; bool set_scale_mode2_; bool set_scale_mode3_; + bool is_screen_; }; // Check the AOME_SET_SCALEMODE control by downsizing to @@ -685,6 +689,45 @@ } } +TEST_P(ResizeRealtimeTest, TestExternalResizeWorksUsePSNR) { + ResizingVideoSource video; + video.flag_codec_ = 1; + change_bitrate_ = false; + set_scale_mode_ = false; + set_scale_mode2_ = false; + set_scale_mode3_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.rc_dropframe_thresh = 30; + DefaultConfig(); + // Test external resizing with start resolution equal to + // 1. kInitialWidth and kInitialHeight + // 2. 
down-scaled kInitialWidth and kInitialHeight + for (int i = 0; i < 2; i++) { + video.change_start_resln_ = static_cast(i); + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + // Check we decoded the same number of frames as we attempted to encode + ASSERT_EQ(frame_info_list_.size(), video.limit()); + for (const auto &info : frame_info_list_) { + const unsigned int frame = static_cast(info.pts); + unsigned int expected_w; + unsigned int expected_h; + ScaleForFrameNumber(frame, kInitialWidth, kInitialHeight, + video.flag_codec_, video.change_start_resln_, + &expected_w, &expected_h); + EXPECT_EQ(expected_w, info.w) + << "Frame " << frame << " had unexpected width"; + EXPECT_EQ(expected_h, info.h) + << "Frame " << frame << " had unexpected height"; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); + } + frame_info_list_.clear(); + } +} + // Verify the dynamic resizer behavior for real time, 1 pass CBR mode. // Run at low bitrate, with resize_allowed = 1, and verify that we get // one resize down event. @@ -740,6 +783,7 @@ TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRate) { ::libaom_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1, 0, 400); + init_flags_ = AOM_CODEC_USE_PSNR; cfg_.g_w = 640; cfg_.g_h = 480; change_bitrate_ = true; @@ -790,6 +834,63 @@ ASSERT_GE(resize_down_count, 1) << "Resizing down should occur at lease once."; EXPECT_EQ(static_cast(0), GetMismatchFrames()); +#else + printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); +#endif +} + +// Verify the dynamic resizer behavior for real time, 1 pass CBR mode for +// screen content mode. Start at low target bitrate, raise the bitrate in the +// middle of the clip (at frame# = frame_change_bitrate_), scaling-up should +// occur after bitrate is increased. +TEST_P(ResizeRealtimeTest, TestInternalResizeDownUpChangeBitRateScreen) { + ::libaom_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288, + 30, 1, 0, 300); + init_flags_ = AOM_CODEC_USE_PSNR; + cfg_.g_w = 352; + cfg_.g_h = 288; + change_bitrate_ = true; + frame_change_bitrate_ = 120; + set_scale_mode_ = false; + set_scale_mode2_ = false; + set_scale_mode3_ = false; + mismatch_psnr_ = 0.0; + mismatch_nframes_ = 0; + is_screen_ = true; + DefaultConfig(); + // Disable dropped frames. + cfg_.rc_dropframe_thresh = 0; + // Starting bitrate low. + cfg_.rc_target_bitrate = 100; + cfg_.rc_resize_mode = RESIZE_DYNAMIC; + cfg_.g_forced_max_frame_width = 1280; + cfg_.g_forced_max_frame_height = 1280; + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + + unsigned int last_w = cfg_.g_w; + unsigned int last_h = cfg_.g_h; + unsigned int frame_number = 0; + int resize_down_count = 0; + for (std::vector::const_iterator info = frame_info_list_.begin(); + info != frame_info_list_.end(); ++info) { + if (info->w != last_w || info->h != last_h) { + if (frame_number < frame_change_bitrate_) { + // Verify that resize down occurs, before bitrate is increased. + ASSERT_LT(info->w, last_w); + ASSERT_LT(info->h, last_h); + resize_down_count++; + } + last_w = info->w; + last_h = info->h; + } + frame_number++; + } + +#if CONFIG_AV1_DECODER + // Verify that we get at least 1 resize event in this test. 
+ ASSERT_GE(resize_down_count, 1) + << "Resizing down should occur at lease once."; + EXPECT_EQ(static_cast(0), GetMismatchFrames()); #else printf("Warning: AV1 decoder unavailable, unable to check resize count!\n"); #endif diff -Nru aom-3.8.2/test/sad_test.cc aom-3.9.0/test/sad_test.cc --- aom-3.8.2/test/sad_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/sad_test.cc 2024-05-07 19:57:03.707000000 +0000 @@ -3202,6 +3202,7 @@ make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1), make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1), + make_tuple(16, 4, &aom_sad_skip_16x4x4d_avx2, -1), #endif }; @@ -3294,6 +3295,7 @@ #if !CONFIG_REALTIME_ONLY make_tuple(32, 8, &aom_sad32x8x3d_avx2, -1), make_tuple(64, 16, &aom_sad64x16x3d_avx2, -1), + make_tuple(16, 4, &aom_sad16x4x3d_avx2, -1), #endif // !CONFIG_REALTIME_ONLY #if CONFIG_AV1_HIGHBITDEPTH diff -Nru aom-3.8.2/test/segment_binarization_sync.cc aom-3.9.0/test/segment_binarization_sync.cc --- aom-3.8.2/test/segment_binarization_sync.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/segment_binarization_sync.cc 2024-05-07 19:57:03.712000000 +0000 @@ -10,15 +10,14 @@ */ #include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "av1/common/seg_common.h" +#include "av1/decoder/decodemv.h" +#include "av1/encoder/bitstream.h" #include "test/acm_random.h" using libaom_test::ACMRandom; -extern "C" { -int av1_neg_interleave(int x, int ref, int max); -int av1_neg_deinterleave(int diff, int ref, int max); -} - namespace { struct Segment { @@ -28,8 +27,6 @@ }; Segment GenerateSegment(int seed) { - static const int MAX_SEGMENTS = 8; - ACMRandom rnd_(seed); Segment segment; diff -Nru aom-3.8.2/test/sharpness_test.cc aom-3.9.0/test/sharpness_test.cc --- aom-3.8.2/test/sharpness_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/sharpness_test.cc 2024-05-07 19:57:03.713000000 +0000 @@ -30,7 +30,7 @@ kPsnrThreshold = { { static_cast(::libaom_test::kTwoPassGood), { { 2, { { 2, 37.6 }, { 5, 37.6 } } }, { 4, { { 2, 37.5 }, { 5, 37.5 } } }, - { 6, { { 2, 37.5 }, { 5, 37.5 } } } } }, + { 6, { { 2, 37.4 }, { 5, 37.4 } } } } }, { static_cast(::libaom_test::kAllIntra), { { 3, { { 2, 42.2 }, { 5, 42.2 } } }, { 6, { { 2, 41.8 }, { 4, 41.9 }, { 5, 41.9 } } }, diff -Nru aom-3.8.2/test/simd_cmp_neon.cc aom-3.9.0/test/simd_cmp_neon.cc --- aom-3.8.2/test/simd_cmp_neon.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/simd_cmp_neon.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ -#define ARCH NEON -#define ARCH_POSTFIX(name) name##_neon -#define SIMD_NAMESPACE simd_test_neon -#include "test/simd_cmp_impl.h" -#endif diff -Nru aom-3.8.2/test/simd_neon_test.cc aom-3.9.0/test/simd_neon_test.cc --- aom-3.8.2/test/simd_neon_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/simd_neon_test.cc 1970-01-01 00:00:00.000000000 +0000 @@ -1,17 +0,0 @@ -/* - * Copyright (c) 2016, Alliance for Open Media. 
All rights reserved - * - * This source code is subject to the terms of the BSD 2 Clause License and - * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License - * was not distributed with this source code in the LICENSE file, you can - * obtain it at www.aomedia.org/license/software. If the Alliance for Open - * Media Patent License 1.0 was not distributed with this source code in the - * PATENTS file, you can obtain it at www.aomedia.org/license/patent. - */ - -#if defined(__OPTIMIZE__) && __OPTIMIZE__ -#define ARCH NEON -#define ARCH_POSTFIX(name) name##_neon -#define SIMD_NAMESPACE simd_test_neon -#include "test/simd_impl.h" -#endif diff -Nru aom-3.8.2/test/sse_sum_test.cc aom-3.9.0/test/sse_sum_test.cc --- aom-3.8.2/test/sse_sum_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/sse_sum_test.cc 2024-05-07 19:57:03.717000000 +0000 @@ -173,4 +173,10 @@ &aom_sum_sse_2d_i16_c, &aom_sum_sse_2d_i16_avx2))); #endif // HAVE_AVX2 +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, SumSSETest, + ::testing::Values(TestFuncs(&aom_sum_sse_2d_i16_c, + &aom_sum_sse_2d_i16_sve))); +#endif // HAVE_SVE + } // namespace diff -Nru aom-3.8.2/test/sum_squares_test.cc aom-3.9.0/test/sum_squares_test.cc --- aom-3.8.2/test/sum_squares_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/sum_squares_test.cc 2024-05-07 19:57:03.718000000 +0000 @@ -172,6 +172,14 @@ #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, SumSquaresTest, + ::testing::Values(TestFuncs(&aom_sum_squares_2d_i16_c, + &aom_sum_squares_2d_i16_sve))); + +#endif // HAVE_SVE + #if HAVE_AVX2 INSTANTIATE_TEST_SUITE_P( AVX2, SumSquaresTest, @@ -200,8 +208,8 @@ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max; - const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize - : rng_(kMaxSize) + 1; + // Block size is between 64 and 128 * 128 and is always a multiple of 64. + const int n = (rng_(255) + 1) * 64; const uint64_t ref_res = params_.ref_func(src, n); uint64_t tst_res; @@ -221,8 +229,8 @@ for (int i = 0; i < kMaxSize * kMaxSize; ++i) src[i] = -kInt13Max; } - const int n = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize - : rng_(kMaxSize) + 1; + // Block size is between 64 and 128 * 128 and is always a multiple of 64. 
+ const int n = (rng_(255) + 1) * 64; const uint64_t ref_res = params_.ref_func(src, n); uint64_t tst_res; @@ -246,6 +254,13 @@ #endif // HAVE_NEON +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P(SVE, SumSquares1DTest, + ::testing::Values(TestFuncs1D( + aom_sum_squares_i16_c, aom_sum_squares_i16_sve))); + +#endif // HAVE_SVE + typedef int64_t (*SSEFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int width, int height); typedef libaom_test::FuncParam TestSSEFuncs; @@ -443,6 +458,15 @@ Combine(ValuesIn(sse_avx2), Range(4, 129, 4))); #endif // HAVE_AVX2 +#if HAVE_SVE +#if CONFIG_AV1_HIGHBITDEPTH +TestSSEFuncs sse_sve[] = { TestSSEFuncs(&aom_highbd_sse_c, + &aom_highbd_sse_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, SSETest, + Combine(ValuesIn(sse_sve), Range(4, 129, 4))); +#endif +#endif // HAVE_SVE + ////////////////////////////////////////////////////////////////////////////// // get_blk sum squares test functions ////////////////////////////////////////////////////////////////////////////// @@ -595,6 +619,14 @@ ValuesIn(kValidBlockSize))); #endif // HAVE_NEON +#if HAVE_SVE +TestSSE_SumFuncs sse_sum_sve[] = { TestSSE_SumFuncs(&aom_get_blk_sse_sum_c, + &aom_get_blk_sse_sum_sve) }; +INSTANTIATE_TEST_SUITE_P(SVE, SSE_Sum_Test, + Combine(ValuesIn(sse_sum_sve), + ValuesIn(kValidBlockSize))); +#endif // HAVE_SVE + ////////////////////////////////////////////////////////////////////////////// // 2D Variance test functions ////////////////////////////////////////////////////////////////////////////// @@ -885,4 +917,12 @@ ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, &aom_var_2d_u16_neon))); #endif // HAVE_NEON + +#if HAVE_SVE + +INSTANTIATE_TEST_SUITE_P(SVE, Highbd2dVarTest, + ::testing::Values(TestFuncVar2D(&aom_var_2d_u16_c, + &aom_var_2d_u16_sve))); + +#endif // HAVE_SVE } // namespace diff -Nru aom-3.8.2/test/test.cmake aom-3.9.0/test/test.cmake --- aom-3.8.2/test/test.cmake 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/test.cmake 2024-05-07 19:57:03.726000000 +0000 @@ -157,12 +157,6 @@ "${AOM_ROOT}/test/simd_cmp_impl.h" "${AOM_ROOT}/test/simd_impl.h") - if(HAVE_NEON) - list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_NEON - "${AOM_ROOT}/test/simd_cmp_neon.cc") - add_to_libaom_test_srcs(AOM_UNIT_TEST_COMMON_INTRIN_NEON) - endif() - if(HAVE_SSE2) list(APPEND AOM_UNIT_TEST_COMMON_INTRIN_SSE2 "${AOM_ROOT}/test/simd_cmp_sse2.cc") @@ -283,29 +277,24 @@ list(APPEND AOM_UNIT_TEST_COMMON_SOURCES "${AOM_ROOT}/test/coding_path_sync.cc") endif() - if(CONFIG_REALTIME_ONLY) - list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES - "${AOM_ROOT}/test/altref_test.cc" - "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc" - "${AOM_ROOT}/test/av1_ext_tile_test.cc" - "${AOM_ROOT}/test/cnn_test.cc" - "${AOM_ROOT}/test/decode_multithreaded_test.cc" - "${AOM_ROOT}/test/error_resilience_test.cc" - "${AOM_ROOT}/test/kf_test.cc" - "${AOM_ROOT}/test/lossless_test.cc" - "${AOM_ROOT}/test/sb_multipass_test.cc" - "${AOM_ROOT}/test/sb_qp_sweep_test.cc" - "${AOM_ROOT}/test/selfguided_filter_test.cc" - "${AOM_ROOT}/test/screen_content_test.cc" - "${AOM_ROOT}/test/still_picture_test.cc" - "${AOM_ROOT}/test/tile_independence_test.cc" - "${AOM_ROOT}/test/tpl_model_test.cc") - endif() endif() - - if(HAVE_NEON) - list(APPEND AOM_UNIT_TEST_COMMON_SOURCES - "${AOM_ROOT}/test/simd_neon_test.cc") + if(CONFIG_REALTIME_ONLY) + list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES + "${AOM_ROOT}/test/altref_test.cc" + "${AOM_ROOT}/test/av1_encoder_parms_get_to_decoder.cc" + "${AOM_ROOT}/test/av1_ext_tile_test.cc" + 
"${AOM_ROOT}/test/cnn_test.cc" + "${AOM_ROOT}/test/decode_multithreaded_test.cc" + "${AOM_ROOT}/test/error_resilience_test.cc" + "${AOM_ROOT}/test/kf_test.cc" + "${AOM_ROOT}/test/lossless_test.cc" + "${AOM_ROOT}/test/sb_multipass_test.cc" + "${AOM_ROOT}/test/sb_qp_sweep_test.cc" + "${AOM_ROOT}/test/selfguided_filter_test.cc" + "${AOM_ROOT}/test/screen_content_test.cc" + "${AOM_ROOT}/test/still_picture_test.cc" + "${AOM_ROOT}/test/tile_independence_test.cc" + "${AOM_ROOT}/test/tpl_model_test.cc") endif() if(CONFIG_FPMT_TEST AND (NOT CONFIG_REALTIME_ONLY)) diff -Nru aom-3.8.2/test/test_libaom.cc aom-3.9.0/test/test_libaom.cc --- aom-3.8.2/test/test_libaom.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/test_libaom.cc 2024-05-07 19:57:03.730000000 +0000 @@ -62,6 +62,7 @@ if (!(caps & HAS_NEON_DOTPROD)) append_negative_gtest_filter("NEON_DOTPROD"); if (!(caps & HAS_NEON_I8MM)) append_negative_gtest_filter("NEON_I8MM"); if (!(caps & HAS_SVE)) append_negative_gtest_filter("SVE"); + if (!(caps & HAS_SVE2)) append_negative_gtest_filter("SVE2"); #elif AOM_ARCH_ARM const int caps = aom_arm_cpu_caps(); if (!(caps & HAS_NEON)) append_negative_gtest_filter("NEON"); diff -Nru aom-3.8.2/test/tools_common.sh aom-3.9.0/test/tools_common.sh --- aom-3.8.2/test/tools_common.sh 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/tools_common.sh 2024-05-07 19:57:03.734000000 +0000 @@ -312,7 +312,11 @@ # Combine environment and actual tests. local tests_to_run="${env_tests} ${tests_to_filter}" - check_version_strings + # av1_c_vs_simd_encode is a standalone test, and it doesn't need to check the + # version string. + if [ "${test_name}" != "av1_c_vs_simd_encode" ]; then + check_version_strings + fi # Run tests. for test in ${tests_to_run}; do @@ -464,6 +468,8 @@ AOM_TEST_PRESERVE_OUTPUT=${AOM_TEST_PRESERVE_OUTPUT:-no} +# This checking requires config/aom_config.c that is available in Jenkins +# testing. 
if [ "$(is_windows_target)" = "yes" ]; then AOM_TEST_EXE_SUFFIX=".exe" fi diff -Nru aom-3.8.2/test/variance_test.cc aom-3.9.0/test/variance_test.cc --- aom-3.8.2/test/variance_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/variance_test.cc 2024-05-07 19:57:03.737000000 +0000 @@ -2147,6 +2147,27 @@ MseParams(3, 3, &aom_highbd_8_mse8x8_neon_dotprod, 8))); #endif // HAVE_NEON_DOTPROD +#if HAVE_SVE +INSTANTIATE_TEST_SUITE_P( + SVE, MseHBDWxHTest, + ::testing::Values(MseHBDWxHParams(3, 3, &aom_mse_wxh_16bit_highbd_sve, 10), + MseHBDWxHParams(3, 2, &aom_mse_wxh_16bit_highbd_sve, 10), + MseHBDWxHParams(2, 3, &aom_mse_wxh_16bit_highbd_sve, 10), + MseHBDWxHParams(2, 2, &aom_mse_wxh_16bit_highbd_sve, + 10))); + +INSTANTIATE_TEST_SUITE_P( + SVE, AvxHBDMseTest, + ::testing::Values(MseParams(4, 4, &aom_highbd_12_mse16x16_sve, 12), + MseParams(4, 3, &aom_highbd_12_mse16x8_sve, 12), + MseParams(3, 4, &aom_highbd_12_mse8x16_sve, 12), + MseParams(3, 3, &aom_highbd_12_mse8x8_sve, 12), + MseParams(4, 4, &aom_highbd_10_mse16x16_sve, 10), + MseParams(4, 3, &aom_highbd_10_mse16x8_sve, 10), + MseParams(3, 4, &aom_highbd_10_mse8x16_sve, 10), + MseParams(3, 3, &aom_highbd_10_mse8x8_sve, 10))); +#endif // HAVE_SVE + const VarianceParams kArrayHBDVariance_c[] = { VarianceParams(7, 7, &aom_highbd_12_variance128x128_c, 12), VarianceParams(7, 6, &aom_highbd_12_variance128x64_c, 12), @@ -2764,64 +2785,6 @@ INSTANTIATE_TEST_SUITE_P(SSE2, GetSseSum16x16DualTest, ::testing::ValuesIn(kArrayGetSseSum16x16Dual_sse2)); -const SubpelVarianceParams kArraySubpelVariance_sse2[] = { - SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_sse2, 0), - SubpelVarianceParams(7, 6, &aom_sub_pixel_variance128x64_sse2, 0), - SubpelVarianceParams(6, 7, &aom_sub_pixel_variance64x128_sse2, 0), - SubpelVarianceParams(6, 6, &aom_sub_pixel_variance64x64_sse2, 0), - SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_sse2, 0), - SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_sse2, 0), - SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_sse2, 0), - SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_sse2, 0), - SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_sse2, 0), - SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_sse2, 0), - SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_sse2, 0), - SubpelVarianceParams(3, 4, &aom_sub_pixel_variance8x16_sse2, 0), - SubpelVarianceParams(3, 3, &aom_sub_pixel_variance8x8_sse2, 0), - SubpelVarianceParams(3, 2, &aom_sub_pixel_variance8x4_sse2, 0), - SubpelVarianceParams(2, 3, &aom_sub_pixel_variance4x8_sse2, 0), - SubpelVarianceParams(2, 2, &aom_sub_pixel_variance4x4_sse2, 0), -#if !CONFIG_REALTIME_ONLY - SubpelVarianceParams(6, 4, &aom_sub_pixel_variance64x16_sse2, 0), - SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_sse2, 0), - SubpelVarianceParams(5, 3, &aom_sub_pixel_variance32x8_sse2, 0), - SubpelVarianceParams(3, 5, &aom_sub_pixel_variance8x32_sse2, 0), - SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_sse2, 0), - SubpelVarianceParams(2, 4, &aom_sub_pixel_variance4x16_sse2, 0), -#endif -}; -INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelVarianceTest, - ::testing::ValuesIn(kArraySubpelVariance_sse2)); - -const SubpelAvgVarianceParams kArraySubpelAvgVariance_sse2[] = { - SubpelAvgVarianceParams(7, 7, &aom_sub_pixel_avg_variance128x128_sse2, 0), - SubpelAvgVarianceParams(7, 6, &aom_sub_pixel_avg_variance128x64_sse2, 0), - SubpelAvgVarianceParams(6, 7, &aom_sub_pixel_avg_variance64x128_sse2, 0), - SubpelAvgVarianceParams(6, 6, 
&aom_sub_pixel_avg_variance64x64_sse2, 0), - SubpelAvgVarianceParams(6, 5, &aom_sub_pixel_avg_variance64x32_sse2, 0), - SubpelAvgVarianceParams(5, 6, &aom_sub_pixel_avg_variance32x64_sse2, 0), - SubpelAvgVarianceParams(5, 5, &aom_sub_pixel_avg_variance32x32_sse2, 0), - SubpelAvgVarianceParams(5, 4, &aom_sub_pixel_avg_variance32x16_sse2, 0), - SubpelAvgVarianceParams(4, 5, &aom_sub_pixel_avg_variance16x32_sse2, 0), - SubpelAvgVarianceParams(4, 4, &aom_sub_pixel_avg_variance16x16_sse2, 0), - SubpelAvgVarianceParams(4, 3, &aom_sub_pixel_avg_variance16x8_sse2, 0), - SubpelAvgVarianceParams(3, 4, &aom_sub_pixel_avg_variance8x16_sse2, 0), - SubpelAvgVarianceParams(3, 3, &aom_sub_pixel_avg_variance8x8_sse2, 0), - SubpelAvgVarianceParams(3, 2, &aom_sub_pixel_avg_variance8x4_sse2, 0), - SubpelAvgVarianceParams(2, 3, &aom_sub_pixel_avg_variance4x8_sse2, 0), - SubpelAvgVarianceParams(2, 2, &aom_sub_pixel_avg_variance4x4_sse2, 0), -#if !CONFIG_REALTIME_ONLY - SubpelAvgVarianceParams(6, 4, &aom_sub_pixel_avg_variance64x16_sse2, 0), - SubpelAvgVarianceParams(4, 6, &aom_sub_pixel_avg_variance16x64_sse2, 0), - SubpelAvgVarianceParams(5, 3, &aom_sub_pixel_avg_variance32x8_sse2, 0), - SubpelAvgVarianceParams(3, 5, &aom_sub_pixel_avg_variance8x32_sse2, 0), - SubpelAvgVarianceParams(4, 2, &aom_sub_pixel_avg_variance16x4_sse2, 0), - SubpelAvgVarianceParams(2, 4, &aom_sub_pixel_avg_variance4x16_sse2, 0), -#endif -}; -INSTANTIATE_TEST_SUITE_P(SSE2, AvxSubpelAvgVarianceTest, - ::testing::ValuesIn(kArraySubpelAvgVariance_sse2)); - #if CONFIG_AV1_HIGHBITDEPTH #if HAVE_SSE2 INSTANTIATE_TEST_SUITE_P( @@ -4262,4 +4225,84 @@ #endif // HAVE_NEON_DOTPROD +#if HAVE_SVE + +#if CONFIG_AV1_HIGHBITDEPTH +const VarianceParams kArrayHBDVariance_sve[] = { + VarianceParams(7, 7, &aom_highbd_12_variance128x128_sve, 12), + VarianceParams(7, 6, &aom_highbd_12_variance128x64_sve, 12), + VarianceParams(6, 7, &aom_highbd_12_variance64x128_sve, 12), + VarianceParams(6, 6, &aom_highbd_12_variance64x64_sve, 12), + VarianceParams(6, 5, &aom_highbd_12_variance64x32_sve, 12), + VarianceParams(5, 6, &aom_highbd_12_variance32x64_sve, 12), + VarianceParams(5, 5, &aom_highbd_12_variance32x32_sve, 12), + VarianceParams(5, 4, &aom_highbd_12_variance32x16_sve, 12), + VarianceParams(4, 5, &aom_highbd_12_variance16x32_sve, 12), + VarianceParams(4, 4, &aom_highbd_12_variance16x16_sve, 12), + VarianceParams(4, 3, &aom_highbd_12_variance16x8_sve, 12), + VarianceParams(3, 4, &aom_highbd_12_variance8x16_sve, 12), + VarianceParams(3, 3, &aom_highbd_12_variance8x8_sve, 12), + VarianceParams(3, 2, &aom_highbd_12_variance8x4_sve, 12), + VarianceParams(2, 3, &aom_highbd_12_variance4x8_sve, 12), + VarianceParams(2, 2, &aom_highbd_12_variance4x4_sve, 12), + VarianceParams(7, 7, &aom_highbd_10_variance128x128_sve, 10), + VarianceParams(7, 6, &aom_highbd_10_variance128x64_sve, 10), + VarianceParams(6, 7, &aom_highbd_10_variance64x128_sve, 10), + VarianceParams(6, 6, &aom_highbd_10_variance64x64_sve, 10), + VarianceParams(6, 5, &aom_highbd_10_variance64x32_sve, 10), + VarianceParams(5, 6, &aom_highbd_10_variance32x64_sve, 10), + VarianceParams(5, 5, &aom_highbd_10_variance32x32_sve, 10), + VarianceParams(5, 4, &aom_highbd_10_variance32x16_sve, 10), + VarianceParams(4, 5, &aom_highbd_10_variance16x32_sve, 10), + VarianceParams(4, 4, &aom_highbd_10_variance16x16_sve, 10), + VarianceParams(4, 3, &aom_highbd_10_variance16x8_sve, 10), + VarianceParams(3, 4, &aom_highbd_10_variance8x16_sve, 10), + VarianceParams(3, 3, &aom_highbd_10_variance8x8_sve, 10), + 
VarianceParams(3, 2, &aom_highbd_10_variance8x4_sve, 10), + VarianceParams(2, 3, &aom_highbd_10_variance4x8_sve, 10), + VarianceParams(2, 2, &aom_highbd_10_variance4x4_sve, 10), + VarianceParams(7, 7, &aom_highbd_8_variance128x128_sve, 8), + VarianceParams(7, 6, &aom_highbd_8_variance128x64_sve, 8), + VarianceParams(6, 7, &aom_highbd_8_variance64x128_sve, 8), + VarianceParams(6, 6, &aom_highbd_8_variance64x64_sve, 8), + VarianceParams(6, 5, &aom_highbd_8_variance64x32_sve, 8), + VarianceParams(5, 6, &aom_highbd_8_variance32x64_sve, 8), + VarianceParams(5, 5, &aom_highbd_8_variance32x32_sve, 8), + VarianceParams(5, 4, &aom_highbd_8_variance32x16_sve, 8), + VarianceParams(4, 5, &aom_highbd_8_variance16x32_sve, 8), + VarianceParams(4, 4, &aom_highbd_8_variance16x16_sve, 8), + VarianceParams(4, 3, &aom_highbd_8_variance16x8_sve, 8), + VarianceParams(3, 4, &aom_highbd_8_variance8x16_sve, 8), + VarianceParams(3, 3, &aom_highbd_8_variance8x8_sve, 8), + VarianceParams(3, 2, &aom_highbd_8_variance8x4_sve, 8), + VarianceParams(2, 3, &aom_highbd_8_variance4x8_sve, 8), + VarianceParams(2, 2, &aom_highbd_8_variance4x4_sve, 8), +#if !CONFIG_REALTIME_ONLY + VarianceParams(6, 4, &aom_highbd_12_variance64x16_sve, 12), + VarianceParams(4, 6, &aom_highbd_12_variance16x64_sve, 12), + VarianceParams(5, 3, &aom_highbd_12_variance32x8_sve, 12), + VarianceParams(3, 5, &aom_highbd_12_variance8x32_sve, 12), + VarianceParams(4, 2, &aom_highbd_12_variance16x4_sve, 12), + VarianceParams(2, 4, &aom_highbd_12_variance4x16_sve, 12), + VarianceParams(6, 4, &aom_highbd_10_variance64x16_sve, 10), + VarianceParams(4, 6, &aom_highbd_10_variance16x64_sve, 10), + VarianceParams(5, 3, &aom_highbd_10_variance32x8_sve, 10), + VarianceParams(3, 5, &aom_highbd_10_variance8x32_sve, 10), + VarianceParams(4, 2, &aom_highbd_10_variance16x4_sve, 10), + VarianceParams(2, 4, &aom_highbd_10_variance4x16_sve, 10), + VarianceParams(6, 4, &aom_highbd_8_variance64x16_sve, 8), + VarianceParams(4, 6, &aom_highbd_8_variance16x64_sve, 8), + VarianceParams(5, 3, &aom_highbd_8_variance32x8_sve, 8), + VarianceParams(3, 5, &aom_highbd_8_variance8x32_sve, 8), + VarianceParams(4, 2, &aom_highbd_8_variance16x4_sve, 8), + VarianceParams(2, 4, &aom_highbd_8_variance4x16_sve, 8), +#endif +}; + +INSTANTIATE_TEST_SUITE_P(SVE, AvxHBDVarianceTest, + ::testing::ValuesIn(kArrayHBDVariance_sve)); + +#endif // CONFIG_AV1_HIGHBITDEPTH +#endif // HAVE_SVE + } // namespace diff -Nru aom-3.8.2/test/warp_filter_test.cc aom-3.9.0/test/warp_filter_test.cc --- aom-3.8.2/test/warp_filter_test.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/test/warp_filter_test.cc 2024-05-07 19:57:03.744000000 +0000 @@ -88,6 +88,12 @@ INSTANTIATE_TEST_SUITE_P( SVE, AV1WarpFilterTest, libaom_test::AV1WarpFilter::BuildParams(av1_warp_affine_sve)); + +#if CONFIG_AV1_HIGHBITDEPTH +INSTANTIATE_TEST_SUITE_P( + SVE, AV1HighbdWarpFilterTest, + libaom_test::AV1HighbdWarpFilter::BuildParams(av1_highbd_warp_affine_sve)); +#endif // CONFIG_AV1_HIGHBITDEPTH #endif // HAVE_SVE } // namespace diff -Nru aom-3.8.2/third_party/libwebm/README.libaom aom-3.9.0/third_party/libwebm/README.libaom --- aom-3.8.2/third_party/libwebm/README.libaom 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/third_party/libwebm/README.libaom 2024-05-07 19:57:03.883000000 +0000 @@ -1,5 +1,5 @@ URL: https://chromium.googlesource.com/webm/libwebm -Version: 1930e3ca23b007f3ff11d98a570077be6201957e +Version: affd7f4d9644aa2b65981fa6c7616400be760e6e License: BSD License File: LICENSE.TXT diff -Nru 
aom-3.8.2/third_party/libwebm/mkvmuxer/mkvmuxer.cc aom-3.9.0/third_party/libwebm/mkvmuxer/mkvmuxer.cc --- aom-3.8.2/third_party/libwebm/mkvmuxer/mkvmuxer.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/third_party/libwebm/mkvmuxer/mkvmuxer.cc 2024-05-07 19:57:03.885000000 +0000 @@ -65,7 +65,8 @@ if (dst == NULL) return false; - strcpy(dst, src); // NOLINT + memcpy(dst, src, size - 1); + dst[size - 1] = '\0'; return true; } @@ -919,11 +920,8 @@ const size_t length = strlen(codec_id) + 1; codec_id_ = new (std::nothrow) char[length]; // NOLINT if (codec_id_) { -#ifdef _MSC_VER - strcpy_s(codec_id_, length, codec_id); -#else - strcpy(codec_id_, codec_id); -#endif + memcpy(codec_id_, codec_id, length - 1); + codec_id_[length - 1] = '\0'; } } } @@ -936,11 +934,8 @@ const size_t length = strlen(language) + 1; language_ = new (std::nothrow) char[length]; // NOLINT if (language_) { -#ifdef _MSC_VER - strcpy_s(language_, length, language); -#else - strcpy(language_, language); -#endif + memcpy(language_, language, length - 1); + language_[length - 1] = '\0'; } } } @@ -952,11 +947,8 @@ const size_t length = strlen(name) + 1; name_ = new (std::nothrow) char[length]; // NOLINT if (name_) { -#ifdef _MSC_VER - strcpy_s(name_, length, name); -#else - strcpy(name_, name); -#endif + memcpy(name_, name, length - 1); + name_[length - 1] = '\0'; } } } @@ -1559,11 +1551,8 @@ const size_t length = strlen(colour_space) + 1; colour_space_ = new (std::nothrow) char[length]; // NOLINT if (colour_space_) { -#ifdef _MSC_VER - strcpy_s(colour_space_, length, colour_space); -#else - strcpy(colour_space_, colour_space); -#endif + memcpy(colour_space_, colour_space, length - 1); + colour_space_[length - 1] = '\0'; } } } @@ -2856,13 +2845,13 @@ uint32_t SeekHead::GetId(int index) const { if (index < 0 || index >= kSeekEntryCount) - return UINT_MAX; + return UINT32_MAX; return seek_entry_id_[index]; } uint64_t SeekHead::GetPosition(int index) const { if (index < 0 || index >= kSeekEntryCount) - return ULLONG_MAX; + return UINT64_MAX; return seek_entry_pos_[index]; } @@ -2896,7 +2885,7 @@ muxing_app_(NULL), timecode_scale_(1000000ULL), writing_app_(NULL), - date_utc_(LLONG_MIN), + date_utc_(INT64_MIN), duration_pos_(-1) {} SegmentInfo::~SegmentInfo() { @@ -2927,11 +2916,8 @@ if (!muxing_app_) return false; -#ifdef _MSC_VER - strcpy_s(muxing_app_, app_len, temp); -#else - strcpy(muxing_app_, temp); -#endif + memcpy(muxing_app_, temp, app_len - 1); + muxing_app_[app_len - 1] = '\0'; set_writing_app(temp); if (!writing_app_) @@ -2974,7 +2960,7 @@ if (duration_ > 0.0) size += EbmlElementSize(libwebm::kMkvDuration, static_cast(duration_)); - if (date_utc_ != LLONG_MIN) + if (date_utc_ != INT64_MIN) size += EbmlDateElementSize(libwebm::kMkvDateUTC); size += EbmlElementSize(libwebm::kMkvMuxingApp, muxing_app_); size += EbmlElementSize(libwebm::kMkvWritingApp, writing_app_); @@ -2999,7 +2985,7 @@ return false; } - if (date_utc_ != LLONG_MIN) + if (date_utc_ != INT64_MIN) WriteEbmlDateElement(writer, libwebm::kMkvDateUTC, date_utc_); if (!WriteEbmlElement(writer, libwebm::kMkvMuxingApp, muxing_app_)) @@ -3022,11 +3008,8 @@ if (!temp_str) return; -#ifdef _MSC_VER - strcpy_s(temp_str, length, app); -#else - strcpy(temp_str, app); -#endif + memcpy(temp_str, app, length - 1); + temp_str[length - 1] = '\0'; delete[] muxing_app_; muxing_app_ = temp_str; @@ -3040,11 +3023,8 @@ if (!temp_str) return; -#ifdef _MSC_VER - strcpy_s(temp_str, length, app); -#else - strcpy(temp_str, app); -#endif + memcpy(temp_str, app, length - 1); + 
temp_str[length - 1] = '\0'; delete[] writing_app_; writing_app_ = temp_str; @@ -3628,19 +3608,17 @@ if (chunking_ && !strcmp(filename, chunking_base_name_)) return true; - const size_t name_length = strlen(filename) + 1; - char* const temp = new (std::nothrow) char[name_length]; // NOLINT + const size_t filename_length = strlen(filename); + char* const temp = new (std::nothrow) char[filename_length + 1]; // NOLINT if (!temp) return false; -#ifdef _MSC_VER - strcpy_s(temp, name_length, filename); -#else - strcpy(temp, filename); -#endif + memcpy(temp, filename, filename_length); + temp[filename_length] = '\0'; delete[] chunking_base_name_; chunking_base_name_ = temp; + // From this point, strlen(chunking_base_name_) == filename_length if (!UpdateChunkName("chk", &chunk_name_)) return false; @@ -3666,18 +3644,16 @@ if (!chunk_writer_cluster_->Open(chunk_name_)) return false; - const size_t header_length = strlen(filename) + strlen(".hdr") + 1; + const size_t hdr_length = strlen(".hdr"); + const size_t header_length = filename_length + hdr_length + 1; char* const header = new (std::nothrow) char[header_length]; // NOLINT if (!header) return false; -#ifdef _MSC_VER - strcpy_s(header, header_length - strlen(".hdr"), chunking_base_name_); - strcat_s(header, header_length, ".hdr"); -#else - strcpy(header, chunking_base_name_); - strcat(header, ".hdr"); -#endif + memcpy(header, chunking_base_name_, filename_length); + memcpy(&header[filename_length], ".hdr", hdr_length); + header[filename_length + hdr_length] = '\0'; + if (!chunk_writer_header_->Open(header)) { delete[] header; return false; @@ -4022,18 +3998,16 @@ snprintf(ext_chk, sizeof(ext_chk), "_%06d.%s", chunk_count_, ext); #endif - const size_t length = strlen(chunking_base_name_) + strlen(ext_chk) + 1; + const size_t chunking_base_name_length = strlen(chunking_base_name_); + const size_t ext_chk_length = strlen(ext_chk); + const size_t length = chunking_base_name_length + ext_chk_length + 1; char* const str = new (std::nothrow) char[length]; // NOLINT if (!str) return false; -#ifdef _MSC_VER - strcpy_s(str, length - strlen(ext_chk), chunking_base_name_); - strcat_s(str, length, ext_chk); -#else - strcpy(str, chunking_base_name_); - strcat(str, ext_chk); -#endif + memcpy(str, chunking_base_name_, chunking_base_name_length); + memcpy(&str[chunking_base_name_length], ext_chk, ext_chk_length); + str[chunking_base_name_length + ext_chk_length] = '\0'; delete[] * name; *name = str; diff -Nru aom-3.8.2/third_party/libwebm/mkvmuxer/mkvmuxer.h aom-3.9.0/third_party/libwebm/mkvmuxer/mkvmuxer.h --- aom-3.8.2/third_party/libwebm/mkvmuxer/mkvmuxer.h 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/third_party/libwebm/mkvmuxer/mkvmuxer.h 2024-05-07 19:57:03.890000000 +0000 @@ -1481,7 +1481,7 @@ uint64_t timecode_scale_; // Initially set to libwebm-%d.%d.%d.%d, major, minor, build, revision. char* writing_app_; - // LLONG_MIN when DateUTC is not set. + // INT64_MIN when DateUTC is not set. int64_t date_utc_; // The file position of the duration element. 
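The mkvmuxer.cc hunks above consistently replace strcpy/strcat (and the MSVC-only strcpy_s/strcat_s branches) with a memcpy of the payload into a buffer whose size was already computed from strlen, followed by explicit NUL termination. A minimal sketch of that idiom in plain C follows; the helper name copy_bounded is hypothetical, and the real libwebm code allocates with new (std::nothrow) char[length] rather than malloc.

    #include <stdlib.h>
    #include <string.h>

    /* Hypothetical helper illustrating the bounded-copy idiom used in the
     * mkvmuxer.cc hunks: size the buffer from strlen once, copy exactly that
     * many bytes, and terminate explicitly instead of calling strcpy. */
    static char *copy_bounded(const char *src) {
      if (src == NULL) return NULL;
      const size_t length = strlen(src) + 1;
      char *const dst = (char *)malloc(length);
      if (dst == NULL) return NULL;
      memcpy(dst, src, length - 1); /* copy the characters */
      dst[length - 1] = '\0';       /* terminate explicitly */
      return dst;
    }

For a correctly sized buffer this behaves like strcpy, but it keeps the copy length visibly tied to the allocation size, which is why the same pattern also replaces the strcat-based name building in the chunking hunks above.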
diff -Nru aom-3.8.2/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc aom-3.9.0/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc --- aom-3.8.2/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc 2024-05-07 19:57:03.894000000 +0000 @@ -607,22 +607,18 @@ void GetVersion(int32* major, int32* minor, int32* build, int32* revision) { *major = 0; *minor = 3; - *build = 1; + *build = 3; *revision = 0; } uint64 MakeUID(unsigned int* seed) { uint64 uid = 0; -#ifdef __MINGW32__ - srand(*seed); -#endif - for (int i = 0; i < 7; ++i) { // avoid problems with 8-byte values uid <<= 8; // TODO(fgalligan): Move random number generation to platform specific code. -#ifdef _MSC_VER +#ifdef _WIN32 (void)seed; const int32 nn = rand(); #elif __ANDROID__ @@ -634,8 +630,6 @@ close(fd); } const int32 nn = temp_num; -#elif defined __MINGW32__ - const int32 nn = rand(); #else const int32 nn = rand_r(seed); #endif diff -Nru aom-3.8.2/third_party/libwebm/mkvparser/mkvparser.cc aom-3.9.0/third_party/libwebm/mkvparser/mkvparser.cc --- aom-3.8.2/third_party/libwebm/mkvparser/mkvparser.cc 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/third_party/libwebm/mkvparser/mkvparser.cc 2024-05-07 19:57:03.896000000 +0000 @@ -55,7 +55,7 @@ void GetVersion(int& major, int& minor, int& build, int& revision) { major = 1; minor = 1; - build = 1; + build = 3; revision = 0; } @@ -246,7 +246,8 @@ if (size == 4) { union { float f; - unsigned long ff; + uint32_t ff; + static_assert(sizeof(float) == sizeof(uint32_t), ""); }; ff = 0; @@ -264,7 +265,8 @@ } else { union { double d; - unsigned long long dd; + uint64_t dd; + static_assert(sizeof(double) == sizeof(uint64_t), ""); }; dd = 0; @@ -4569,7 +4571,8 @@ if (dst == NULL) return -1; - strcpy(dst, src); + memcpy(dst, src, len); + dst[len] = '\0'; return 0; } diff -Nru aom-3.8.2/tools/auto_refactor/c_files/decl_status_code.c aom-3.9.0/tools/auto_refactor/c_files/decl_status_code.c --- aom-3.8.2/tools/auto_refactor/c_files/decl_status_code.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/tools/auto_refactor/c_files/decl_status_code.c 2024-05-07 19:57:04.000000000 +0000 @@ -13,17 +13,17 @@ int x; } T1; -int parse_decl_node_2() { int arr[3]; } +int parse_decl_node_2(void) { int arr[3]; } -int parse_decl_node_3() { int *a; } +int parse_decl_node_3(void) { int *a; } -int parse_decl_node_4() { T1 t1[3]; } +int parse_decl_node_4(void) { T1 t1[3]; } -int parse_decl_node_5() { T1 *t2[3]; } +int parse_decl_node_5(void) { T1 *t2[3]; } -int parse_decl_node_6() { T1 t3[3][3]; } +int parse_decl_node_6(void) { T1 t3[3][3]; } -int main() { +int main(void) { int a; T1 t1; struct S1 s1; diff -Nru aom-3.8.2/tools/auto_refactor/c_files/func_in_out.c aom-3.9.0/tools/auto_refactor/c_files/func_in_out.c --- aom-3.8.2/tools/auto_refactor/c_files/func_in_out.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/tools/auto_refactor/c_files/func_in_out.c 2024-05-07 19:57:04.000000000 +0000 @@ -199,7 +199,7 @@ for (int i = 0; i < 10; ++i) cpi->y--; } -int main() { +int main(void) { int x; VP9_COMP cpi; RD rd; diff -Nru aom-3.8.2/tools/auto_refactor/c_files/parse_lvalue.c aom-3.9.0/tools/auto_refactor/c_files/parse_lvalue.c --- aom-3.8.2/tools/auto_refactor/c_files/parse_lvalue.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/tools/auto_refactor/c_files/parse_lvalue.c 2024-05-07 19:57:04.000000000 +0000 @@ -39,7 +39,7 @@ return 0; } -int main() { +int main(void) { int x = 0; VP9_COMP cpi; func(&cpi, x); diff -Nru 
aom-3.8.2/tools/auto_refactor/c_files/simple_code.c aom-3.9.0/tools/auto_refactor/c_files/simple_code.c --- aom-3.8.2/tools/auto_refactor/c_files/simple_code.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/tools/auto_refactor/c_files/simple_code.c 2024-05-07 19:57:04.000000000 +0000 @@ -48,11 +48,11 @@ c(1); return 0; } -int e() { +int e(void) { c(0); return 0; } -int main() { +int main(void) { int p = 3; S s; s.x = p + 1; diff -Nru aom-3.8.2/tools/auto_refactor/c_files/struct_code.c aom-3.9.0/tools/auto_refactor/c_files/struct_code.c --- aom-3.8.2/tools/auto_refactor/c_files/struct_code.c 2024-03-09 00:11:52.000000000 +0000 +++ aom-3.9.0/tools/auto_refactor/c_files/struct_code.c 2024-05-07 19:57:04.000000000 +0000 @@ -46,4 +46,4 @@ } z; } T7; -int main() {} +int main(void) {}
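The closing hunks in tools/auto_refactor/c_files change empty C parameter lists to (void). Before C23, a C definition such as int parse_decl_node_2() has an unspecified parameter list rather than an empty one, so compilers can warn about it (for example GCC and Clang with -Wstrict-prototypes), whereas (void) states that the function takes no arguments. A minimal illustration with hypothetical function names:

    /* Pre-C23 semantics of empty parameter lists. */
    int old_style();     /* unspecified parameters: call arguments are not checked */
    int new_style(void); /* no parameters: passing an argument is a compile error */

    int new_style(void) { return 0; }

    int main(void) { return new_style(); }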