/*
 * Copyright (c) 2025 Zhao Zhili
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "sao.h"

#include <wasm_simd128.h>

#include "libavcodec/defs.h"

#define HEVC_MAX_PB_SIZE 64
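
/*
 * SAO band offset filter, 8-bit samples, 8-pixel-wide blocks.
 *
 * Each pixel is classified into one of 32 bands by its five most
 * significant bits; the four consecutive bands starting at sao_left_class
 * get the signalled offsets added, all other bands pass through unchanged.
 */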
void ff_hevc_sao_band_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride_dst,
                                           ptrdiff_t stride_src,
                                           const int16_t *sao_offset_val,
                                           int sao_left_class, int width,
                                           int height)
{
    int8_t offset_table[32] = {0};
    v128_t offset_low, offset_high;

    /* Scatter the four signalled offsets into the 32-entry band table. */
    for (int k = 0; k < 4; k++)
        offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];
    offset_low  = wasm_v128_load(offset_table);
    offset_high = wasm_v128_load(&offset_table[16]);

    /* Process two 8-pixel rows per iteration in one 128-bit vector. */
    for (int y = height; y > 0; y -= 2) {
        v128_t src_v, src_high;
        v128_t v0, v1;

        src_v = wasm_v128_load64_zero(src);
        src += stride_src;
        src_v = wasm_v128_load64_lane(src, src_v, 1);
        src += stride_src;

        /* Band index = pixel >> 3; look it up in both table halves and
         * merge. Swizzle yields 0 for out-of-range indices, so each pixel
         * takes its offset from exactly one half. */
        v0 = wasm_u8x16_shr(src_v, 3);
        v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
        v0 = wasm_i8x16_swizzle(offset_low, v0);
        v1 = wasm_i8x16_swizzle(offset_high, v1);
        v0 = wasm_v128_or(v0, v1);

        /* Widen to 16 bits, add the offsets, then narrow back with
         * unsigned saturation to clip to [0, 255]. */
        src_high = wasm_u16x8_extend_high_u8x16(src_v);
        v1 = wasm_i16x8_extend_high_i8x16(v0);
        src_v = wasm_u16x8_extend_low_u8x16(src_v);
        v0 = wasm_i16x8_extend_low_i8x16(v0);
        v0 = wasm_i16x8_add_sat(src_v, v0);
        v1 = wasm_i16x8_add_sat(src_high, v1);
        v0 = wasm_u8x16_narrow_i16x8(v0, v1);

        wasm_v128_store64_lane(dst, v0, 0);
        dst += stride_dst;
        wasm_v128_store64_lane(dst, v0, 1);
        dst += stride_dst;
    }
}
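
/*
 * Same band offset filter for blocks whose width is a multiple of 16,
 * processing one full 16-pixel vector per step along each row.
 */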
void ff_hevc_sao_band_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride_dst,
                                             ptrdiff_t stride_src,
                                             const int16_t *sao_offset_val,
                                             int sao_left_class, int width,
                                             int height)
{
    int8_t offset_table[32] = {0};
    v128_t offset_low, offset_high;

    /* Scatter the four signalled offsets into the 32-entry band table. */
    for (int k = 0; k < 4; k++)
        offset_table[(k + sao_left_class) & 31] = (int8_t)sao_offset_val[k + 1];
    offset_low  = wasm_v128_load(offset_table);
    offset_high = wasm_v128_load(&offset_table[16]);

    for (int y = height; y > 0; y--) {
        for (int x = 0; x < width; x += 16) {
            v128_t src_v, src_high;
            v128_t v0, v1;

            src_v = wasm_v128_load(&src[x]);

            /* Band index = pixel >> 3; look up both table halves and merge. */
            v0 = wasm_u8x16_shr(src_v, 3);
            v1 = wasm_i8x16_sub(v0, wasm_i8x16_const_splat(16));
            v0 = wasm_i8x16_swizzle(offset_low, v0);
            v1 = wasm_i8x16_swizzle(offset_high, v1);
            v0 = wasm_v128_or(v0, v1);

            /* Widen, add the offsets, narrow back with unsigned saturation. */
            src_high = wasm_u16x8_extend_high_u8x16(src_v);
            v1 = wasm_i16x8_extend_high_i8x16(v0);
            src_v = wasm_u16x8_extend_low_u8x16(src_v);
            v0 = wasm_i16x8_extend_low_i8x16(v0);
            v0 = wasm_i16x8_add_sat(src_v, v0);
            v1 = wasm_i16x8_add_sat(src_high, v1);
            v0 = wasm_u8x16_narrow_i16x8(v0, v1);

            wasm_v128_store(&dst[x], v0);
        }
        dst += stride_dst;
        src += stride_src;
    }
}
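
/*
 * SAO edge offset filter, 8-bit samples, 8-pixel-wide blocks.
 *
 * Each pixel is compared with its two neighbours along the direction
 * selected by eo (horizontal, vertical, 45 or 135 degrees). The two
 * comparison signs select one of five offsets, which is added with
 * clipping to [0, 255]. The source is read from a fixed-stride
 * intermediate buffer, hence the hard-coded stride_src.
 */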
void ff_hevc_sao_edge_filter_8x8_8_simd128(uint8_t *dst, const uint8_t *src,
                                           ptrdiff_t stride_dst,
                                           const int16_t *sao_offset_val,
                                           int eo, int width, int height)
{
    /* Neighbour offsets (x, y) for the two samples compared against the
     * current one, indexed by the edge-offset class. */
    static const int8_t pos[4][2][2] = {
        { { -1,  0 }, {  1,  0 } }, // horizontal
        { {  0, -1 }, {  0,  1 } }, // vertical
        { { -1, -1 }, {  1,  1 } }, // 45 degree
        { {  1, -1 }, { -1,  1 } }, // 135 degree
    };
    int a_stride, b_stride;
    ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
    /* Maps 2 + sign(cur - a) + sign(cur - b), in [0, 4], to the offset index. */
    const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
                                            4, 0, 0, 0,
                                            0, 0, 0, 0,
                                            0, 0, 0, 0);
    v128_t sao_offset = wasm_v128_load(sao_offset_val);
    v128_t one = wasm_i8x16_const_splat(1);
    v128_t two = wasm_i8x16_const_splat(2);

    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;

    for (int y = height; y > 0; y -= 2) {
        v128_t v0, v1, v2;
        v128_t diff0, diff1;

        /* Load two rows of the current sample and its two neighbours. */
        v0 = wasm_v128_load64_zero(src);
        v1 = wasm_v128_load64_zero(src + a_stride);
        v2 = wasm_v128_load64_zero(src + b_stride);
        src += stride_src;
        v0 = wasm_v128_load64_lane(src, v0, 1);
        v1 = wasm_v128_load64_lane(src + a_stride, v1, 1);
        v2 = wasm_v128_load64_lane(src + b_stride, v2, 1);
        src += stride_src;

        /* diff = sign(cur - neighbour): +1, 0 or -1 per pixel. */
        diff0 = wasm_u8x16_gt(v0, v1);
        v1 = wasm_u8x16_lt(v0, v1);
        diff0 = wasm_i8x16_sub(v1, diff0);
        diff1 = wasm_u8x16_gt(v0, v2);
        v2 = wasm_u8x16_lt(v0, v2);
        diff1 = wasm_i8x16_sub(v2, diff1);

        v1 = wasm_i8x16_add(diff0, two);
        v1 = wasm_i8x16_add(v1, diff1);
        v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val index
        v1 = wasm_i8x16_shl(v2, 1);            // Access int16_t
        v2 = wasm_i8x16_add(v1, one);          // Access upper half of int16_t
        /* Interleave low/high byte indices so a byte swizzle can gather the
         * 16-bit offset for each pixel. */
        diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
                                   20, 5, 21, 6, 22, 7, 23);
        diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
                                   12, 28, 13, 29, 14, 30, 15, 31);

        /* Widen, add the gathered offsets, narrow with unsigned saturation. */
        v1 = wasm_u16x8_extend_high_u8x16(v0);
        v0 = wasm_u16x8_extend_low_u8x16(v0);
        diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
        diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
        v0 = wasm_i16x8_add_sat(v0, diff0);
        v1 = wasm_i16x8_add_sat(v1, diff1);
        v0 = wasm_u8x16_narrow_i16x8(v0, v1);

        wasm_v128_store64_lane(dst, v0, 0);
        dst += stride_dst;
        wasm_v128_store64_lane(dst, v0, 1);
        dst += stride_dst;
    }
}
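
/*
 * Same edge offset filter for blocks whose width is a multiple of 16,
 * processing one full 16-pixel vector per step along each row.
 */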
void ff_hevc_sao_edge_filter_16x16_8_simd128(uint8_t *dst, const uint8_t *src,
                                             ptrdiff_t stride_dst,
                                             const int16_t *sao_offset_val,
                                             int eo, int width, int height)
{
    static const int8_t pos[4][2][2] = {
        { { -1,  0 }, {  1,  0 } }, // horizontal
        { {  0, -1 }, {  0,  1 } }, // vertical
        { { -1, -1 }, {  1,  1 } }, // 45 degree
        { {  1, -1 }, { -1,  1 } }, // 135 degree
    };
    int a_stride, b_stride;
    ptrdiff_t stride_src = (2 * HEVC_MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE);
    /* Maps 2 + sign(cur - a) + sign(cur - b), in [0, 4], to the offset index. */
    const v128_t edge_idx = wasm_u8x16_make(1, 2, 0, 3,
                                            4, 0, 0, 0,
                                            0, 0, 0, 0,
                                            0, 0, 0, 0);
    v128_t sao_offset = wasm_v128_load(sao_offset_val);
    v128_t one = wasm_i8x16_const_splat(1);
    v128_t two = wasm_i8x16_const_splat(2);

    a_stride = pos[eo][0][0] + pos[eo][0][1] * stride_src;
    b_stride = pos[eo][1][0] + pos[eo][1][1] * stride_src;

    for (int y = height; y > 0; y--) {
        for (int x = 0; x < width; x += 16) {
            v128_t v0, v1, v2;
            v128_t diff0, diff1;

            v0 = wasm_v128_load(&src[x]);
            v1 = wasm_v128_load(&src[x + a_stride]);
            v2 = wasm_v128_load(&src[x + b_stride]);

            /* diff = sign(cur - neighbour): +1, 0 or -1 per pixel. */
            diff0 = wasm_u8x16_gt(v0, v1);
            v1 = wasm_u8x16_lt(v0, v1);
            diff0 = wasm_i8x16_sub(v1, diff0);
            diff1 = wasm_u8x16_gt(v0, v2);
            v2 = wasm_u8x16_lt(v0, v2);
            diff1 = wasm_i8x16_sub(v2, diff1);

            v1 = wasm_i8x16_add(diff0, two);
            v1 = wasm_i8x16_add(v1, diff1);
            v2 = wasm_i8x16_swizzle(edge_idx, v1); // offset_val index
            v1 = wasm_i8x16_shl(v2, 1);            // Access int16_t
            v2 = wasm_i8x16_add(v1, one);          // Access upper half of int16_t
            diff0 = wasm_i8x16_shuffle(v1, v2, 0, 16, 1, 17, 2, 18, 3, 19, 4,
                                       20, 5, 21, 6, 22, 7, 23);
            diff1 = wasm_i8x16_shuffle(v1, v2, 8, 24, 9, 25, 10, 26, 11, 27,
                                       12, 28, 13, 29, 14, 30, 15, 31);

            v1 = wasm_u16x8_extend_high_u8x16(v0);
            v0 = wasm_u16x8_extend_low_u8x16(v0);
            diff0 = wasm_i8x16_swizzle(sao_offset, diff0);
            diff1 = wasm_i8x16_swizzle(sao_offset, diff1);
            v0 = wasm_i16x8_add_sat(v0, diff0);
            v1 = wasm_i16x8_add_sat(v1, diff1);
            v0 = wasm_u8x16_narrow_i16x8(v0, v1);

            wasm_v128_store(&dst[x], v0);
        }
        src += stride_src;
        dst += stride_dst;
    }
}