|
@@ -22,6 +22,10 @@
|
|
|
|
|
|
|
|
#include <wasm_simd128.h>
|
|
#include <wasm_simd128.h>
|
|
|
|
|
|
|
|
|
|
+#include "libavcodec/defs.h"
|
|
|
|
|
+
|
|
|
|
|
+#define HEVC_MAX_PB_SIZE 64
|
|
|
|
|
+
|
|
|
void ff_hevc_sao_band_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
|
|
void ff_hevc_sao_band_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
|
|
|
ptrdiff_t stride_dst,
|
|
ptrdiff_t stride_dst,
|
|
|
ptrdiff_t stride_src,
|
|
ptrdiff_t stride_src,
|
|
@@ -112,3 +116,139 @@ void ff_hevc_sao_band_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
|
|
|
}
|
|
}
|
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
+void ff_hevc_sao_edge_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
|
|
|
|
|
+ ptrdiff_t stride_dst,
|
|
|
|
|
+ const int16_t *sao_offset_val,
|
|
|
|
|
+ int eo, int width, int height)
|
|
|
|
|
+{ /*
+ * Edge filter for an 8-sample-wide column of 8-bit samples, written with
+ * WebAssembly SIMD128 intrinsics. Each loop iteration handles two rows:
+ * one row per 64-bit half of a vector.
+ *
+ * For every sample the code compares it with two neighbours (chosen by
+ * eo), turns the two comparison signs into a class 0..4, remaps that
+ * class through edge_idx, picks the matching int16_t from sao_offset_val,
+ * adds it to the sample and clamps the result to 0..255.
+ *
+ * dst            - output samples; rows are stride_dst bytes apart
+ * src            - input samples; rows are a fixed
+ *                  2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE
+ *                  bytes apart (stride_src below), not a parameter
+ * stride_dst     - distance in bytes between output rows
+ * sao_offset_val - int16_t offset table; 16 bytes are loaded from it but
+ *                  only entries 0..4 can be selected by the lookups
+ *                  NOTE(review): confirm the caller's table is readable
+ *                  for the full 16 bytes.
+ * eo             - first index into pos[] (0..3); no range check is done
+ * width          - not used by this function
+ * height         - number of rows; the loop steps by 2
+ *                  NOTE(review): an odd height would process one extra
+ *                  row - presumably callers pass even heights; confirm.
+ */
|
|
|
|
|
+ static const int8_t pos[4][2][2] = { // per eo: two {dx, dy} neighbour positions
|
|
|
|
|
+ { { -1, 0 }, { 1, 0 } }, // horizontal
|
|
|
|
|
+ { { 0, -1 }, { 0, 1 } }, // vertical
|
|
|
|
|
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
|
|
|
|
|
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
|
|
|
|
|
+ };
|
|
|
|
|
+ int a_stride, b_stride; // offsets from the current sample to neighbour a / neighbour b
|
|
|
|
|
+ ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE); // fixed source row pitch
|
|
|
|
|
+ const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3, // byte table: class 0..4 -> {1, 2, 0, 3, 4}
|
|
|
|
|
+ 4, 0, 0, 0,
|
|
|
|
|
+ 0, 0, 0, 0,
|
|
|
|
|
+ 0, 0, 0, 0);
|
|
|
|
|
+ v128_t sao_offset = wasm_v128_load(sao_offset_val); // 16 bytes = 8 int16_t offsets
|
|
|
|
|
+ v128_t one = wasm_i8x16_const_splat(1);
|
|
|
|
|
+ v128_t two = wasm_i8x16_const_splat(2);
|
|
|
|
|
+
|
|
|
|
|
+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; // dx + dy * row pitch
|
|
|
|
|
+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
|
|
|
|
|
+ for (int y = height; y > 0; y -= 2) { // two rows per iteration
|
|
|
|
|
+ v128_t v0, v1, v2;
|
|
|
|
|
+ v128_t diff0, diff1;
|
|
|
|
|
+
|
|
|
|
|
+ v0 = wasm_v128_load64_zero(src); // first row: current samples -> low 64 bits
|
|
|
|
|
+ v1 = wasm_v128_load64_zero(src + a_stride); // first row: neighbour a
|
|
|
|
|
+ v2 = wasm_v128_load64_zero(src + b_stride); // first row: neighbour b
|
|
|
|
|
+ src += stride_src;
|
|
|
|
|
+ v0 = wasm_v128_load64_lane(src, v0, 1); // second row -> high 64 bits
|
|
|
|
|
+ v1 = wasm_v128_load64_lane(src + a_stride, v1, 1);
|
|
|
|
|
+ v2 = wasm_v128_load64_lane(src + b_stride, v2, 1);
|
|
|
|
|
+ src += stride_src;
|
|
|
|
|
+
|
|
|
|
|
+ diff0 = wasm_u8x16_gt(v0, v1); // all-ones (-1) where current > neighbour a
|
|
|
|
|
+ v1 = wasm_u8x16_lt(v0, v1); // all-ones (-1) where current < neighbour a
|
|
|
|
|
+ diff0 = wasm_i8x16_sub(v1, diff0); // lt - gt: +1 if current > a, -1 if current < a, 0 if equal
|
|
|
|
|
+
|
|
|
|
|
+ diff1 = wasm_u8x16_gt(v0, v2); // same three steps against neighbour b
|
|
|
|
|
+ v2 = wasm_u8x16_lt(v0, v2);
|
|
|
|
|
+ diff1 = wasm_i8x16_sub(v2, diff1);
|
|
|
|
|
+
|
|
|
|
|
+ v1 = wasm_i8x16_add(diff0, two);
|
|
|
|
|
+ v1 = wasm_i8x16_add(v1, diff1); // class = 2 + diff0 + diff1, in 0..4
|
|
|
|
|
+
|
|
|
|
|
+ v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val
|
|
|
|
|
+ v1 = wasm_i8x16_shl(v2, 1); // Access int16_t
|
|
|
|
|
+ v2 = wasm_i8x16_add(v1, one); // Access upper half of int16_t
|
|
|
|
|
+ diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4, // interleave (v1, v2) byte indices for samples 0..7
|
|
|
|
|
+ 20, 5, 21, 6, 22, 7, 23);
|
|
|
|
|
+ diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27, // same for samples 8..15
|
|
|
|
|
+ 12, 28, 13, 29, 14, 30, 15, 31);
|
|
|
|
|
+ v1 = wasm_u16x8_extend_high_u8x16(v0); // samples 8..15 zero-extended to 16 bits
|
|
|
|
|
+ v0 = wasm_u16x8_extend_low_u8x16(v0); // samples 0..7 zero-extended to 16 bits
|
|
|
|
|
+ diff0 = wasm_i8x16_swizzle(sao_offset, diff0); // gather the two bytes of each selected offset
|
|
|
|
|
+ diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
|
|
|
|
|
+
|
|
|
|
|
+ v0 = wasm_i16x8_add_sat(v0, diff0); // sample + offset, signed 16-bit saturating
|
|
|
|
|
+ v1 = wasm_i16x8_add_sat(v1, diff1);
|
|
|
|
|
+ v0 = wasm_u8x16_narrow_i16x8(v0, v1); // back to 8 bits with unsigned saturation
|
|
|
|
|
+
|
|
|
|
|
+ wasm_v128_store64_lane(dst, v0, 0); // low 64 bits -> first output row
|
|
|
|
|
+ dst += stride_dst;
|
|
|
|
|
+ wasm_v128_store64_lane(dst, v0, 1); // high 64 bits -> second output row
|
|
|
|
|
+ dst += stride_dst;
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|
|
|
|
|
+void ff_hevc_sao_edge_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
|
|
|
|
|
+ ptrdiff_t stride_dst,
|
|
|
|
|
+ const int16_t *sao_offset_val,
|
|
|
|
|
+ int eo, int width, int height)
|
|
|
|
|
+{ /*
+ * Edge filter for 8-bit samples, written with WebAssembly SIMD128
+ * intrinsics. Rows are processed one at a time, 16 samples per inner
+ * loop iteration.
+ *
+ * For every sample the code compares it with two neighbours (chosen by
+ * eo), turns the two comparison signs into a class 0..4, remaps that
+ * class through edge_idx, picks the matching int16_t from sao_offset_val,
+ * adds it to the sample and clamps the result to 0..255.
+ *
+ * dst            - output samples; rows are stride_dst bytes apart
+ * src            - input samples; rows are a fixed
+ *                  2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE
+ *                  bytes apart (stride_src below), not a parameter
+ * stride_dst     - distance in bytes between output rows
+ * sao_offset_val - int16_t offset table; 16 bytes are loaded from it but
+ *                  only entries 0..4 can be selected by the lookups
+ *                  NOTE(review): confirm the caller's table is readable
+ *                  for the full 16 bytes.
+ * eo             - first index into pos[] (0..3); no range check is done
+ * width          - samples per row; covered in steps of 16
+ *                  NOTE(review): if width is not a multiple of 16 the
+ *                  last step still loads and stores a full 16 bytes -
+ *                  presumably callers guarantee this is safe; confirm.
+ * height         - number of rows to process
+ */
|
|
|
|
|
+ static const int8_t pos[4][2][2] = { // per eo: two {dx, dy} neighbour positions
|
|
|
|
|
+ { { -1, 0 }, { 1, 0 } }, // horizontal
|
|
|
|
|
+ { { 0, -1 }, { 0, 1 } }, // vertical
|
|
|
|
|
+ { { -1, -1 }, { 1, 1 } }, // 45 degree
|
|
|
|
|
+ { { 1, -1 }, { -1, 1 } }, // 135 degree
|
|
|
|
|
+ };
|
|
|
|
|
+ int a_stride, b_stride; // offsets from the current sample to neighbour a / neighbour b
|
|
|
|
|
+ ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE); // fixed source row pitch
|
|
|
|
|
+ const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3, // byte table: class 0..4 -> {1, 2, 0, 3, 4}
|
|
|
|
|
+ 4, 0, 0, 0,
|
|
|
|
|
+ 0, 0, 0, 0,
|
|
|
|
|
+ 0, 0, 0, 0);
|
|
|
|
|
+ v128_t sao_offset = wasm_v128_load(sao_offset_val); // 16 bytes = 8 int16_t offsets
|
|
|
|
|
+ v128_t one = wasm_i8x16_const_splat(1);
|
|
|
|
|
+ v128_t two = wasm_i8x16_const_splat(2);
|
|
|
|
|
+
|
|
|
|
|
+ a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src; // dx + dy * row pitch
|
|
|
|
|
+ b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;
|
|
|
|
|
+ for (int y = height; y > 0; y--) { // one row per iteration
|
|
|
|
|
+ for (int x = 0; x < width; x += 16) { // 16 samples per iteration
|
|
|
|
|
+ v128_t v0, v1, v2;
|
|
|
|
|
+ v128_t diff0, diff1;
|
|
|
|
|
+
|
|
|
|
|
+ v0 = wasm_v128_load(&src[x]); // current samples
|
|
|
|
|
+ v1 = wasm_v128_load(&src[x + a_stride]); // neighbour a
|
|
|
|
|
+ v2 = wasm_v128_load(&src[x + b_stride]); // neighbour b
|
|
|
|
|
+
|
|
|
|
|
+ diff0 = wasm_u8x16_gt(v0, v1); // all-ones (-1) where current > neighbour a
|
|
|
|
|
+ v1 = wasm_u8x16_lt(v0, v1); // all-ones (-1) where current < neighbour a
|
|
|
|
|
+ diff0 = wasm_i8x16_sub(v1, diff0); // lt - gt: +1 if current > a, -1 if current < a, 0 if equal
|
|
|
|
|
+
|
|
|
|
|
+ diff1 = wasm_u8x16_gt(v0, v2); // same three steps against neighbour b
|
|
|
|
|
+ v2 = wasm_u8x16_lt(v0, v2);
|
|
|
|
|
+ diff1 = wasm_i8x16_sub(v2, diff1);
|
|
|
|
|
+
|
|
|
|
|
+ v1 = wasm_i8x16_add(diff0, two);
|
|
|
|
|
+ v1 = wasm_i8x16_add(v1, diff1); // class = 2 + diff0 + diff1, in 0..4
|
|
|
|
|
+
|
|
|
|
|
+ v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val
|
|
|
|
|
+ v1 = wasm_i8x16_shl(v2, 1); // Access int16_t
|
|
|
|
|
+ v2 = wasm_i8x16_add(v1, one); // Access upper half of int16_t
|
|
|
|
|
+ diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4, // interleave (v1, v2) byte indices for samples 0..7
|
|
|
|
|
+ 20, 5, 21, 6, 22, 7, 23);
|
|
|
|
|
+ diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27, // same for samples 8..15
|
|
|
|
|
+ 12, 28, 13, 29, 14, 30, 15, 31);
|
|
|
|
|
+ v1 = wasm_u16x8_extend_high_u8x16(v0); // samples 8..15 zero-extended to 16 bits
|
|
|
|
|
+ v0 = wasm_u16x8_extend_low_u8x16(v0); // samples 0..7 zero-extended to 16 bits
|
|
|
|
|
+ diff0 = wasm_i8x16_swizzle(sao_offset, diff0); // gather the two bytes of each selected offset
|
|
|
|
|
+ diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
|
|
|
|
|
+
|
|
|
|
|
+ v0 = wasm_i16x8_add_sat(v0, diff0); // sample + offset, signed 16-bit saturating
|
|
|
|
|
+ v1 = wasm_i16x8_add_sat(v1, diff1);
|
|
|
|
|
+ v0 = wasm_u8x16_narrow_i16x8(v0, v1); // back to 8 bits with unsigned saturation
|
|
|
|
|
+ wasm_v128_store(&dst[x], v0); // write 16 filtered samples
|
|
|
|
|
+ }
|
|
|
|
|
+
|
|
|
|
|
+ src += stride_src; // advance both pointers to the next row
|
|
|
|
|
+ dst += stride_dst;
|
|
|
|
|
+ }
|
|
|
|
|
+}
|
|
|
|
|
+
|