|
|
@@ -57,40 +57,53 @@ SECTION .text
|
|
|
%macro SHUFFLE_BYTES 4
|
|
|
cglobal shuffle_bytes_%1%2%3%4, 3, 5, 2, src, dst, w, tmp, x
|
|
|
VBROADCASTI128 m0, [pb_shuffle%1%2%3%4]
|
|
|
- movsxdifnidn wq, wd
|
|
|
- mov xq, wq
|
|
|
-
|
|
|
- add srcq, wq
|
|
|
- add dstq, wq
|
|
|
- neg wq
|
|
|
-
|
|
|
-;calc scalar loop
|
|
|
+ movsxdifnidn wq, wd
|
|
|
+ mov xq, wq
|
|
|
+
|
|
|
+ add srcq, wq
|
|
|
+ add dstq, wq
|
|
|
+ neg wq
|
|
|
+
|
|
|
+%if mmsize == 64
|
|
|
+ and xq, mmsize - 4
|
|
|
+ shr xq, 2
|
|
|
+ mov tmpd, -1
|
|
|
+ shlx tmpd, tmpd, xd
|
|
|
+ not tmpd
|
|
|
+ kmovw k7, tmpd
|
|
|
+ vmovdqu32 m1{k7}{z}, [srcq + wq]
|
|
|
+ pshufb m1, m0
|
|
|
+ vmovdqu32 [dstq + wq]{k7}, m1
|
|
|
+ lea wq, [wq + 4 * xq]
|
|
|
+%else
|
|
|
+ ;calc scalar loop
|
|
|
and xq, mmsize-4
|
|
|
je .loop_simd
|
|
|
|
|
|
-.loop_scalar:
|
|
|
- mov tmpb, [srcq + wq + %1]
|
|
|
- mov [dstq+wq + 0], tmpb
|
|
|
- mov tmpb, [srcq + wq + %2]
|
|
|
- mov [dstq+wq + 1], tmpb
|
|
|
- mov tmpb, [srcq + wq + %3]
|
|
|
- mov [dstq+wq + 2], tmpb
|
|
|
- mov tmpb, [srcq + wq + %4]
|
|
|
- mov [dstq+wq + 3], tmpb
|
|
|
- add wq, 4
|
|
|
- sub xq, 4
|
|
|
- jg .loop_scalar
|
|
|
-
|
|
|
-;check if src_size < mmsize
|
|
|
-cmp wq, 0
|
|
|
-jge .end
|
|
|
-
|
|
|
-.loop_simd:
|
|
|
- movu m1, [srcq+wq]
|
|
|
- pshufb m1, m0
|
|
|
- movu [dstq+wq], m1
|
|
|
- add wq, mmsize
|
|
|
- jl .loop_simd
|
|
|
+ .loop_scalar:
|
|
|
+ mov tmpb, [srcq + wq + %1]
|
|
|
+ mov [dstq+wq + 0], tmpb
|
|
|
+ mov tmpb, [srcq + wq + %2]
|
|
|
+ mov [dstq+wq + 1], tmpb
|
|
|
+ mov tmpb, [srcq + wq + %3]
|
|
|
+ mov [dstq+wq + 2], tmpb
|
|
|
+ mov tmpb, [srcq + wq + %4]
|
|
|
+ mov [dstq+wq + 3], tmpb
|
|
|
+ add wq, 4
|
|
|
+ sub xq, 4
|
|
|
+ jg .loop_scalar
|
|
|
+%endif
|
|
|
+
|
|
|
+ ;check if src_size < mmsize
|
|
|
+ cmp wq, 0
|
|
|
+ jge .end
|
|
|
+
|
|
|
+ .loop_simd:
|
|
|
+ movu m1, [srcq + wq]
|
|
|
+ pshufb m1, m0
|
|
|
+ movu [dstq + wq], m1
|
|
|
+ add wq, mmsize
|
|
|
+ jl .loop_simd
|
|
|
|
|
|
.end:
|
|
|
RET
|
|
|
@@ -122,6 +135,21 @@ SHUFFLE_BYTES 1, 2, 0, 3
|
|
|
%endif
|
|
|
%endif
|
|
|
|
|
|
+%if ARCH_X86_64
|
|
|
+%if HAVE_AVX512ICL_EXTERNAL
|
|
|
+INIT_ZMM avx512icl
|
|
|
+SHUFFLE_BYTES 2, 1, 0, 3
|
|
|
+SHUFFLE_BYTES 0, 3, 2, 1
|
|
|
+SHUFFLE_BYTES 1, 2, 3, 0
|
|
|
+SHUFFLE_BYTES 3, 0, 1, 2
|
|
|
+SHUFFLE_BYTES 3, 2, 1, 0
|
|
|
+SHUFFLE_BYTES 3, 1, 0, 2
|
|
|
+SHUFFLE_BYTES 2, 0, 1, 3
|
|
|
+SHUFFLE_BYTES 2, 1, 3, 0
|
|
|
+SHUFFLE_BYTES 1, 2, 0, 3
|
|
|
+%endif
|
|
|
+%endif
|
|
|
+
|
|
|
;-----------------------------------------------------------------------------------------------
|
|
|
; uyvytoyuv422(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
|
|
|
; const uint8_t *src, int width, int height,
|