/*
 * Loongson asm helper.
 *
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
 *                Shiyou Yin(yinshiyou-hf@loongson.cn)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
- /**
- * MAJOR version: Macro usage changes.
- * MINOR version: Add new functions, or bug fixes.
- * MICRO version: Comment changes or implementation changes.
- */
- #define LML_VERSION_MAJOR 0
- #define LML_VERSION_MINOR 2
- #define LML_VERSION_MICRO 0
- /*
- *============================================================================
- * macros for specific projetc, set them as needed.
- * Following LoongML macros for your reference.
- *============================================================================
- */
- #define ASM_PREF
- #define DEFAULT_ALIGN 5
- .macro function name, align=DEFAULT_ALIGN
- .macro endfunc
- jirl $r0, $r1, 0x0
- .size ASM_PREF\name, . - ASM_PREF\name
- .purgem endfunc
- .endm
- .text ;
- .align \align ;
- .globl ASM_PREF\name ;
- .type ASM_PREF\name, @function ;
- ASM_PREF\name: ;
- .endm
- /**
- * Attention: If align is not zero, the macro will use
- * t7 until the end of function
- */
- .macro alloc_stack size, align=0
- .if \align
- .macro clean_stack
- add.d sp, sp, t7
- .endm
- addi.d sp, sp, - \size
- andi.d t7, sp, \align - 1
- sub.d sp, sp, t7
- addi.d t7, t7, \size
- .else
- .macro clean_stack
- addi.d sp, sp, \size
- .endm
- addi.d sp, sp, - \size
- .endif
- .endm
- .macro const name, align=DEFAULT_ALIGN
- .macro endconst
- .size \name, . - \name
- .purgem endconst
- .endm
- .section .rodata
- .align \align
- \name:
- .endm
- /*
- *============================================================================
- * LoongArch register alias
- *============================================================================
- */
- #define a0 $a0
- #define a1 $a1
- #define a2 $a2
- #define a3 $a3
- #define a4 $a4
- #define a5 $a5
- #define a6 $a6
- #define a7 $a7
- #define t0 $t0
- #define t1 $t1
- #define t2 $t2
- #define t3 $t3
- #define t4 $t4
- #define t5 $t5
- #define t6 $t6
- #define t7 $t7
- #define t8 $t8
- #define s0 $s0
- #define s1 $s1
- #define s2 $s2
- #define s3 $s3
- #define s4 $s4
- #define s5 $s5
- #define s6 $s6
- #define s7 $s7
- #define s8 $s8
- #define zero $zero
- #define sp $sp
- #define ra $ra
- #define f0 $f0
- #define f1 $f1
- #define f2 $f2
- #define f3 $f3
- #define f4 $f4
- #define f5 $f5
- #define f6 $f6
- #define f7 $f7
- #define f8 $f8
- #define f9 $f9
- #define f10 $f10
- #define f11 $f11
- #define f12 $f12
- #define f13 $f13
- #define f14 $f14
- #define f15 $f15
- #define f16 $f16
- #define f17 $f17
- #define f18 $f18
- #define f19 $f19
- #define f20 $f20
- #define f21 $f21
- #define f22 $f22
- #define f23 $f23
- #define f24 $f24
- #define f25 $f25
- #define f26 $f26
- #define f27 $f27
- #define f28 $f28
- #define f29 $f29
- #define f30 $f30
- #define f31 $f31
- #define vr0 $vr0
- #define vr1 $vr1
- #define vr2 $vr2
- #define vr3 $vr3
- #define vr4 $vr4
- #define vr5 $vr5
- #define vr6 $vr6
- #define vr7 $vr7
- #define vr8 $vr8
- #define vr9 $vr9
- #define vr10 $vr10
- #define vr11 $vr11
- #define vr12 $vr12
- #define vr13 $vr13
- #define vr14 $vr14
- #define vr15 $vr15
- #define vr16 $vr16
- #define vr17 $vr17
- #define vr18 $vr18
- #define vr19 $vr19
- #define vr20 $vr20
- #define vr21 $vr21
- #define vr22 $vr22
- #define vr23 $vr23
- #define vr24 $vr24
- #define vr25 $vr25
- #define vr26 $vr26
- #define vr27 $vr27
- #define vr28 $vr28
- #define vr29 $vr29
- #define vr30 $vr30
- #define vr31 $vr31
- #define xr0 $xr0
- #define xr1 $xr1
- #define xr2 $xr2
- #define xr3 $xr3
- #define xr4 $xr4
- #define xr5 $xr5
- #define xr6 $xr6
- #define xr7 $xr7
- #define xr8 $xr8
- #define xr9 $xr9
- #define xr10 $xr10
- #define xr11 $xr11
- #define xr12 $xr12
- #define xr13 $xr13
- #define xr14 $xr14
- #define xr15 $xr15
- #define xr16 $xr16
- #define xr17 $xr17
- #define xr18 $xr18
- #define xr19 $xr19
- #define xr20 $xr20
- #define xr21 $xr21
- #define xr22 $xr22
- #define xr23 $xr23
- #define xr24 $xr24
- #define xr25 $xr25
- #define xr26 $xr26
- #define xr27 $xr27
- #define xr28 $xr28
- #define xr29 $xr29
- #define xr30 $xr30
- #define xr31 $xr31
- /*
- *============================================================================
- * LSX/LASX synthesize instructions
- *============================================================================
- */
- /*
- * Description : Dot product of byte vector elements
- * Arguments : Inputs - vj, vk
- * Outputs - vd
- * Return Type - halfword
- */
- .macro vdp2.h.bu vd, vj, vk
- vmulwev.h.bu \vd, \vj, \vk
- vmaddwod.h.bu \vd, \vj, \vk
- .endm
- .macro vdp2.h.bu.b vd, vj, vk
- vmulwev.h.bu.b \vd, \vj, \vk
- vmaddwod.h.bu.b \vd, \vj, \vk
- .endm
- .macro vdp2.w.h vd, vj, vk
- vmulwev.w.h \vd, \vj, \vk
- vmaddwod.w.h \vd, \vj, \vk
- .endm
- .macro xvdp2.h.bu xd, xj, xk
- xvmulwev.h.bu \xd, \xj, \xk
- xvmaddwod.h.bu \xd, \xj, \xk
- .endm
- .macro xvdp2.h.bu.b xd, xj, xk
- xvmulwev.h.bu.b \xd, \xj, \xk
- xvmaddwod.h.bu.b \xd, \xj, \xk
- .endm
- .macro xvdp2.w.h xd, xj, xk
- xvmulwev.w.h \xd, \xj, \xk
- xvmaddwod.w.h \xd, \xj, \xk
- .endm
- /*
- * Description : Dot product & addition of halfword vector elements
- * Arguments : Inputs - vj, vk
- * Outputs - vd
- * Return Type - twice size of input
- */
- .macro vdp2add.h.bu vd, vj, vk
- vmaddwev.h.bu \vd, \vj, \vk
- vmaddwod.h.bu \vd, \vj, \vk
- .endm
- .macro vdp2add.h.bu.b vd, vj, vk
- vmaddwev.h.bu.b \vd, \vj, \vk
- vmaddwod.h.bu.b \vd, \vj, \vk
- .endm
- .macro vdp2add.w.h vd, vj, vk
- vmaddwev.w.h \vd, \vj, \vk
- vmaddwod.w.h \vd, \vj, \vk
- .endm
- .macro xvdp2add.h.bu.b xd, xj, xk
- xvmaddwev.h.bu.b \xd, \xj, \xk
- xvmaddwod.h.bu.b \xd, \xj, \xk
- .endm
- .macro xvdp2add.w.h xd, xj, xk
- xvmaddwev.w.h \xd, \xj, \xk
- xvmaddwod.w.h \xd, \xj, \xk
- .endm
- /*
- * Description : Range each element of vector
- * clip: vj > vk ? vj : vk && vj < va ? vj : va
- * clip255: vj < 255 ? vj : 255 && vj > 0 ? vj : 0
- */
- .macro vclip.h vd, vj, vk, va
- vmax.h \vd, \vj, \vk
- vmin.h \vd, \vd, \va
- .endm
- .macro vclip255.w vd, vj
- vmaxi.w \vd, \vj, 0
- vsat.wu \vd, \vd, 7
- .endm
- .macro vclip255.h vd, vj
- vmaxi.h \vd, \vj, 0
- vsat.hu \vd, \vd, 7
- .endm
- .macro xvclip.h xd, xj, xk, xa
- xvmax.h \xd, \xj, \xk
- xvmin.h \xd, \xd, \xa
- .endm
- .macro xvclip255.h xd, xj
- xvmaxi.h \xd, \xj, 0
- xvsat.hu \xd, \xd, 7
- .endm
- .macro xvclip255.w xd, xj
- xvmaxi.w \xd, \xj, 0
- xvsat.wu \xd, \xd, 7
- .endm
- /*
- * Description : Store elements of vector
- * vd : Data vector to be stored
- * rk : Address of data storage
- * ra : Offset of address
- * si : Index of data in vd
- */
- .macro vstelmx.b vd, rk, ra, si
- add.d \rk, \rk, \ra
- vstelm.b \vd, \rk, 0, \si
- .endm
- .macro vstelmx.h vd, rk, ra, si
- add.d \rk, \rk, \ra
- vstelm.h \vd, \rk, 0, \si
- .endm
- .macro vstelmx.w vd, rk, ra, si
- add.d \rk, \rk, \ra
- vstelm.w \vd, \rk, 0, \si
- .endm
- .macro vstelmx.d vd, rk, ra, si
- add.d \rk, \rk, \ra
- vstelm.d \vd, \rk, 0, \si
- .endm
- .macro vmov xd, xj
- vor.v \xd, \xj, \xj
- .endm
- .macro xmov xd, xj
- xvor.v \xd, \xj, \xj
- .endm
- .macro xvstelmx.d xd, rk, ra, si
- add.d \rk, \rk, \ra
- xvstelm.d \xd, \rk, 0, \si
- .endm
- /*
- *============================================================================
- * LSX/LASX custom macros
- *============================================================================
- */
- /*
- * Load 4 float, double, V128, v256 elements with stride.
- */
- .macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
- fld.s \out0, \src, 0
- fldx.s \out1, \src, \stride
- fldx.s \out2, \src, \stride2
- fldx.s \out3, \src, \stride3
- .endm
- .macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
- fld.d \out0, \src, 0
- fldx.d \out1, \src, \stride
- fldx.d \out2, \src, \stride2
- fldx.d \out3, \src, \stride3
- .endm
- .macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
- vld \out0, \src, 0
- vldx \out1, \src, \stride
- vldx \out2, \src, \stride2
- vldx \out3, \src, \stride3
- .endm
- .macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
- xvld \out0, \src, 0
- xvldx \out1, \src, \stride
- xvldx \out2, \src, \stride2
- xvldx \out3, \src, \stride3
- .endm
- /*
- * Description : Transpose 4x4 block with half-word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3
- * Outputs - out0, out1, out2, out3
- */
- .macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
- tmp0, tmp1
- vilvl.h \tmp0, \in1, \in0
- vilvl.h \tmp1, \in3, \in2
- vilvl.w \out0, \tmp1, \tmp0
- vilvh.w \out2, \tmp1, \tmp0
- vilvh.d \out1, \out0, \out0
- vilvh.d \out3, \out0, \out2
- .endm
- /*
- * Description : Transpose 4x4 block with word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3
- * Outputs - out0, out1, out2, out3
- * Details :
- * Example :
- * 1, 2, 3, 4 1, 5, 9,13
- * 5, 6, 7, 8 to 2, 6,10,14
- * 9,10,11,12 =====> 3, 7,11,15
- * 13,14,15,16 4, 8,12,16
- */
- .macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
- _tmp0, _tmp1
- vilvl.w \_tmp0, \_in1, \_in0
- vilvh.w \_out1, \_in1, \_in0
- vilvl.w \_tmp1, \_in3, \_in2
- vilvh.w \_out3, \_in3, \_in2
- vilvl.d \_out0, \_tmp1, \_tmp0
- vilvl.d \_out2, \_out3, \_out1
- vilvh.d \_out3, \_out3, \_out1
- vilvh.d \_out1, \_tmp1, \_tmp0
- .endm
- /*
- * Description : Transpose 8x8 block with half-word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- */
- .macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
- tmp3, tmp4, tmp5, tmp6, tmp7
- vilvl.h \tmp0, \in6, \in4
- vilvl.h \tmp1, \in7, \in5
- vilvl.h \tmp2, \in2, \in0
- vilvl.h \tmp3, \in3, \in1
- vilvl.h \tmp4, \tmp1, \tmp0
- vilvh.h \tmp5, \tmp1, \tmp0
- vilvl.h \tmp6, \tmp3, \tmp2
- vilvh.h \tmp7, \tmp3, \tmp2
- vilvh.h \tmp0, \in6, \in4
- vilvh.h \tmp1, \in7, \in5
- vilvh.h \tmp2, \in2, \in0
- vilvh.h \tmp3, \in3, \in1
- vpickev.d \out0, \tmp4, \tmp6
- vpickod.d \out1, \tmp4, \tmp6
- vpickev.d \out2, \tmp5, \tmp7
- vpickod.d \out3, \tmp5, \tmp7
- vilvl.h \tmp4, \tmp1, \tmp0
- vilvh.h \tmp5, \tmp1, \tmp0
- vilvl.h \tmp6, \tmp3, \tmp2
- vilvh.h \tmp7, \tmp3, \tmp2
- vpickev.d \out4, \tmp4, \tmp6
- vpickod.d \out5, \tmp4, \tmp6
- vpickev.d \out6, \tmp5, \tmp7
- vpickod.d \out7, \tmp5, \tmp7
- .endm
- /*
- * Description : Transpose 16x8 block with byte elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- */
- .macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7,\
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
- xvilvl.b \tmp0, \in2, \in0
- xvilvl.b \tmp1, \in3, \in1
- xvilvl.b \tmp2, \in6, \in4
- xvilvl.b \tmp3, \in7, \in5
- xvilvl.b \tmp4, \in10, \in8
- xvilvl.b \tmp5, \in11, \in9
- xvilvl.b \tmp6, \in14, \in12
- xvilvl.b \tmp7, \in15, \in13
- xvilvl.b \out0, \tmp1, \tmp0
- xvilvh.b \out1, \tmp1, \tmp0
- xvilvl.b \out2, \tmp3, \tmp2
- xvilvh.b \out3, \tmp3, \tmp2
- xvilvl.b \out4, \tmp5, \tmp4
- xvilvh.b \out5, \tmp5, \tmp4
- xvilvl.b \out6, \tmp7, \tmp6
- xvilvh.b \out7, \tmp7, \tmp6
- xvilvl.w \tmp0, \out2, \out0
- xvilvh.w \tmp2, \out2, \out0
- xvilvl.w \tmp4, \out3, \out1
- xvilvh.w \tmp6, \out3, \out1
- xvilvl.w \tmp1, \out6, \out4
- xvilvh.w \tmp3, \out6, \out4
- xvilvl.w \tmp5, \out7, \out5
- xvilvh.w \tmp7, \out7, \out5
- xvilvl.d \out0, \tmp1, \tmp0
- xvilvh.d \out1, \tmp1, \tmp0
- xvilvl.d \out2, \tmp3, \tmp2
- xvilvh.d \out3, \tmp3, \tmp2
- xvilvl.d \out4, \tmp5, \tmp4
- xvilvh.d \out5, \tmp5, \tmp4
- xvilvl.d \out6, \tmp7, \tmp6
- xvilvh.d \out7, \tmp7, \tmp6
- .endm
- /*
- * Description : Transpose 16x8 block with byte elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- */
- .macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7, \
- in8, in9, in10, in11, in12, in13, in14, in15, \
- out0, out1, out2, out3, out4, out5, out6, out7,\
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
- vilvl.b \tmp0, \in2, \in0
- vilvl.b \tmp1, \in3, \in1
- vilvl.b \tmp2, \in6, \in4
- vilvl.b \tmp3, \in7, \in5
- vilvl.b \tmp4, \in10, \in8
- vilvl.b \tmp5, \in11, \in9
- vilvl.b \tmp6, \in14, \in12
- vilvl.b \tmp7, \in15, \in13
- vilvl.b \out0, \tmp1, \tmp0
- vilvh.b \out1, \tmp1, \tmp0
- vilvl.b \out2, \tmp3, \tmp2
- vilvh.b \out3, \tmp3, \tmp2
- vilvl.b \out4, \tmp5, \tmp4
- vilvh.b \out5, \tmp5, \tmp4
- vilvl.b \out6, \tmp7, \tmp6
- vilvh.b \out7, \tmp7, \tmp6
- vilvl.w \tmp0, \out2, \out0
- vilvh.w \tmp2, \out2, \out0
- vilvl.w \tmp4, \out3, \out1
- vilvh.w \tmp6, \out3, \out1
- vilvl.w \tmp1, \out6, \out4
- vilvh.w \tmp3, \out6, \out4
- vilvl.w \tmp5, \out7, \out5
- vilvh.w \tmp7, \out7, \out5
- vilvl.d \out0, \tmp1, \tmp0
- vilvh.d \out1, \tmp1, \tmp0
- vilvl.d \out2, \tmp3, \tmp2
- vilvh.d \out3, \tmp3, \tmp2
- vilvl.d \out4, \tmp5, \tmp4
- vilvh.d \out5, \tmp5, \tmp4
- vilvl.d \out6, \tmp7, \tmp6
- vilvh.d \out7, \tmp7, \tmp6
- .endm
- /*
- * Description : Transpose 4x4 block with half-word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3
- * Outputs - out0, out1, out2, out3
- */
- .macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
- tmp0, tmp1
- xvilvl.h \tmp0, \in1, \in0
- xvilvl.h \tmp1, \in3, \in2
- xvilvl.w \out0, \tmp1, \tmp0
- xvilvh.w \out2, \tmp1, \tmp0
- xvilvh.d \out1, \out0, \out0
- xvilvh.d \out3, \out0, \out2
- .endm
- /*
- * Description : Transpose 4x8 block with half-word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3
- * Outputs - out0, out1, out2, out3
- */
- .macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
- tmp0, tmp1
- xvilvl.h \tmp0, \in2, \in0
- xvilvl.h \tmp1, \in3, \in1
- xvilvl.h \out2, \tmp1, \tmp0
- xvilvh.h \out3, \tmp1, \tmp0
- xvilvl.d \out0, \out2, \out2
- xvilvh.d \out1, \out2, \out2
- xvilvl.d \out2, \out3, \out3
- xvilvh.d \out3, \out3, \out3
- .endm
- /*
- * Description : Transpose 8x8 block with half-word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
- * Outputs - out0, out1, out2, out3, out4, out5, out6, out7
- */
- .macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, \
- out0, out1, out2, out3, out4, out5, out6, out7, \
- tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
- xvilvl.h \tmp0, \in6, \in4
- xvilvl.h \tmp1, \in7, \in5
- xvilvl.h \tmp2, \in2, \in0
- xvilvl.h \tmp3, \in3, \in1
- xvilvl.h \tmp4, \tmp1, \tmp0
- xvilvh.h \tmp5, \tmp1, \tmp0
- xvilvl.h \tmp6, \tmp3, \tmp2
- xvilvh.h \tmp7, \tmp3, \tmp2
- xvilvh.h \tmp0, \in6, \in4
- xvilvh.h \tmp1, \in7, \in5
- xvilvh.h \tmp2, \in2, \in0
- xvilvh.h \tmp3, \in3, \in1
- xvpickev.d \out0, \tmp4, \tmp6
- xvpickod.d \out1, \tmp4, \tmp6
- xvpickev.d \out2, \tmp5, \tmp7
- xvpickod.d \out3, \tmp5, \tmp7
- xvilvl.h \tmp4, \tmp1, \tmp0
- xvilvh.h \tmp5, \tmp1, \tmp0
- xvilvl.h \tmp6, \tmp3, \tmp2
- xvilvh.h \tmp7, \tmp3, \tmp2
- xvpickev.d \out4, \tmp4, \tmp6
- xvpickod.d \out5, \tmp4, \tmp6
- xvpickev.d \out6, \tmp5, \tmp7
- xvpickod.d \out7, \tmp5, \tmp7
- .endm
- /*
- * Description : Transpose 2x4x4 block with half-word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3
- * Outputs - out0, out1, out2, out3
- */
- .macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
- tmp0, tmp1, tmp2
- xvilvh.h \tmp1, \in0, \in1
- xvilvl.h \out1, \in0, \in1
- xvilvh.h \tmp0, \in2, \in3
- xvilvl.h \out3, \in2, \in3
- xvilvh.w \tmp2, \out3, \out1
- xvilvl.w \out3, \out3, \out1
- xvilvl.w \out2, \tmp0, \tmp1
- xvilvh.w \tmp1, \tmp0, \tmp1
- xvilvh.d \out0, \out2, \out3
- xvilvl.d \out2, \out2, \out3
- xvilvh.d \out1, \tmp1, \tmp2
- xvilvl.d \out3, \tmp1, \tmp2
- .endm
- /*
- * Description : Transpose 4x4 block with word elements in vectors
- * Arguments : Inputs - in0, in1, in2, in3
- * Outputs - out0, out1, out2, out3
- * Details :
- * Example :
- * 1, 2, 3, 4, 1, 2, 3, 4 1,5, 9,13, 1,5, 9,13
- * 5, 6, 7, 8, 5, 6, 7, 8 to 2,6,10,14, 2,6,10,14
- * 9,10,11,12, 9,10,11,12 =====> 3,7,11,15, 3,7,11,15
- * 13,14,15,16, 13,14,15,16 4,8,12,16, 4,8,12,16
- */
- .macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
- _tmp0, _tmp1
- xvilvl.w \_tmp0, \_in1, \_in0
- xvilvh.w \_out1, \_in1, \_in0
- xvilvl.w \_tmp1, \_in3, \_in2
- xvilvh.w \_out3, \_in3, \_in2
- xvilvl.d \_out0, \_tmp1, \_tmp0
- xvilvl.d \_out2, \_out3, \_out1
- xvilvh.d \_out3, \_out3, \_out1
- xvilvh.d \_out1, \_tmp1, \_tmp0
- .endm
- /*
- * Description : Transpose 8x8 block with word elements in vectors
- * Arguments : Inputs - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
- * Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
- * _out7
- * Example : LASX_TRANSPOSE8x8_W
- * _in0 : 1,2,3,4,5,6,7,8
- * _in1 : 2,2,3,4,5,6,7,8
- * _in2 : 3,2,3,4,5,6,7,8
- * _in3 : 4,2,3,4,5,6,7,8
- * _in4 : 5,2,3,4,5,6,7,8
- * _in5 : 6,2,3,4,5,6,7,8
- * _in6 : 7,2,3,4,5,6,7,8
- * _in7 : 8,2,3,4,5,6,7,8
- *
- * _out0 : 1,2,3,4,5,6,7,8
- * _out1 : 2,2,2,2,2,2,2,2
- * _out2 : 3,3,3,3,3,3,3,3
- * _out3 : 4,4,4,4,4,4,4,4
- * _out4 : 5,5,5,5,5,5,5,5
- * _out5 : 6,6,6,6,6,6,6,6
- * _out6 : 7,7,7,7,7,7,7,7
- * _out7 : 8,8,8,8,8,8,8,8
- */
- .macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,\
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7,\
- _tmp0, _tmp1, _tmp2, _tmp3
- xvilvl.w \_tmp0, \_in2, \_in0
- xvilvl.w \_tmp1, \_in3, \_in1
- xvilvh.w \_tmp2, \_in2, \_in0
- xvilvh.w \_tmp3, \_in3, \_in1
- xvilvl.w \_out0, \_tmp1, \_tmp0
- xvilvh.w \_out1, \_tmp1, \_tmp0
- xvilvl.w \_out2, \_tmp3, \_tmp2
- xvilvh.w \_out3, \_tmp3, \_tmp2
- xvilvl.w \_tmp0, \_in6, \_in4
- xvilvl.w \_tmp1, \_in7, \_in5
- xvilvh.w \_tmp2, \_in6, \_in4
- xvilvh.w \_tmp3, \_in7, \_in5
- xvilvl.w \_out4, \_tmp1, \_tmp0
- xvilvh.w \_out5, \_tmp1, \_tmp0
- xvilvl.w \_out6, \_tmp3, \_tmp2
- xvilvh.w \_out7, \_tmp3, \_tmp2
- xmov \_tmp0, \_out0
- xmov \_tmp1, \_out1
- xmov \_tmp2, \_out2
- xmov \_tmp3, \_out3
- xvpermi.q \_out0, \_out4, 0x02
- xvpermi.q \_out1, \_out5, 0x02
- xvpermi.q \_out2, \_out6, 0x02
- xvpermi.q \_out3, \_out7, 0x02
- xvpermi.q \_out4, \_tmp0, 0x31
- xvpermi.q \_out5, \_tmp1, 0x31
- xvpermi.q \_out6, \_tmp2, 0x31
- xvpermi.q \_out7, \_tmp3, 0x31
- .endm
- /*
- * Description : Transpose 4x4 block with double-word elements in vectors
- * Arguments : Inputs - _in0, _in1, _in2, _in3
- * Outputs - _out0, _out1, _out2, _out3
- * Example : LASX_TRANSPOSE4x4_D
- * _in0 : 1,2,3,4
- * _in1 : 1,2,3,4
- * _in2 : 1,2,3,4
- * _in3 : 1,2,3,4
- *
- * _out0 : 1,1,1,1
- * _out1 : 2,2,2,2
- * _out2 : 3,3,3,3
- * _out3 : 4,4,4,4
- */
- .macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
- _tmp0, _tmp1
- xvilvl.d \_tmp0, \_in1, \_in0
- xvilvh.d \_out1, \_in1, \_in0
- xvilvh.d \_tmp1, \_in3, \_in2
- xvilvl.d \_out2, \_in3, \_in2
- xvor.v \_out0, \_tmp0, \_tmp0
- xvor.v \_out3, \_tmp1, \_tmp1
- xvpermi.q \_out0, \_out2, 0x02
- xvpermi.q \_out2, \_tmp0, 0x31
- xvpermi.q \_out3, \_out1, 0x31
- xvpermi.q \_out1, \_tmp1, 0x02
- .endm
- /*
- * Description : Butterfly of 4 input vectors
- * Arguments : Inputs - _in0, _in1, _in2, _in3
- * Outputs - _out0, _out1, _out2, _out3
- * Details : Butterfly operation
- * Example : LSX_BUTTERFLY_4
- * _out0 = _in0 + _in3;
- * _out1 = _in1 + _in2;
- * _out2 = _in1 - _in2;
- * _out3 = _in0 - _in3;
- */
- .macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- vadd.b \_out0, \_in0, \_in3
- vadd.b \_out1, \_in1, \_in2
- vsub.b \_out2, \_in1, \_in2
- vsub.b \_out3, \_in0, \_in3
- .endm
- .macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- vadd.h \_out0, \_in0, \_in3
- vadd.h \_out1, \_in1, \_in2
- vsub.h \_out2, \_in1, \_in2
- vsub.h \_out3, \_in0, \_in3
- .endm
- .macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- vadd.w \_out0, \_in0, \_in3
- vadd.w \_out1, \_in1, \_in2
- vsub.w \_out2, \_in1, \_in2
- vsub.w \_out3, \_in0, \_in3
- .endm
- .macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- vadd.d \_out0, \_in0, \_in3
- vadd.d \_out1, \_in1, \_in2
- vsub.d \_out2, \_in1, \_in2
- vsub.d \_out3, \_in0, \_in3
- .endm
- .macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- xvadd.b \_out0, \_in0, \_in3
- xvadd.b \_out1, \_in1, \_in2
- xvsub.b \_out2, \_in1, \_in2
- xvsub.b \_out3, \_in0, \_in3
- .endm
- .macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- xvadd.h \_out0, \_in0, \_in3
- xvadd.h \_out1, \_in1, \_in2
- xvsub.h \_out2, \_in1, \_in2
- xvsub.h \_out3, \_in0, \_in3
- .endm
- .macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- xvadd.w \_out0, \_in0, \_in3
- xvadd.w \_out1, \_in1, \_in2
- xvsub.w \_out2, \_in1, \_in2
- xvsub.w \_out3, \_in0, \_in3
- .endm
- .macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
- xvadd.d \_out0, \_in0, \_in3
- xvadd.d \_out1, \_in1, \_in2
- xvsub.d \_out2, \_in1, \_in2
- xvsub.d \_out3, \_in0, \_in3
- .endm
- /*
- * Description : Butterfly of 8 input vectors
- * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
- * Outputs - _out0, _out1, _out2, _out3, ~
- * Details : Butterfly operation
- * Example : LASX_BUTTERFLY_8
- * _out0 = _in0 + _in7;
- * _out1 = _in1 + _in6;
- * _out2 = _in2 + _in5;
- * _out3 = _in3 + _in4;
- * _out4 = _in3 - _in4;
- * _out5 = _in2 - _in5;
- * _out6 = _in1 - _in6;
- * _out7 = _in0 - _in7;
- */
- .macro LSX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- vadd.b \_out0, \_in0, \_in7
- vadd.b \_out1, \_in1, \_in6
- vadd.b \_out2, \_in2, \_in5
- vadd.b \_out3, \_in3, \_in4
- vsub.b \_out4, \_in3, \_in4
- vsub.b \_out5, \_in2, \_in5
- vsub.b \_out6, \_in1, \_in6
- vsub.b \_out7, \_in0, \_in7
- .endm
- .macro LSX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- vadd.h \_out0, \_in0, \_in7
- vadd.h \_out1, \_in1, \_in6
- vadd.h \_out2, \_in2, \_in5
- vadd.h \_out3, \_in3, \_in4
- vsub.h \_out4, \_in3, \_in4
- vsub.h \_out5, \_in2, \_in5
- vsub.h \_out6, \_in1, \_in6
- vsub.h \_out7, \_in0, \_in7
- .endm
- .macro LSX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- vadd.w \_out0, \_in0, \_in7
- vadd.w \_out1, \_in1, \_in6
- vadd.w \_out2, \_in2, \_in5
- vadd.w \_out3, \_in3, \_in4
- vsub.w \_out4, \_in3, \_in4
- vsub.w \_out5, \_in2, \_in5
- vsub.w \_out6, \_in1, \_in6
- vsub.w \_out7, \_in0, \_in7
- .endm
- .macro LSX_BUTTERFLY_8_D _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- vadd.d \_out0, \_in0, \_in7
- vadd.d \_out1, \_in1, \_in6
- vadd.d \_out2, \_in2, \_in5
- vadd.d \_out3, \_in3, \_in4
- vsub.d \_out4, \_in3, \_in4
- vsub.d \_out5, \_in2, \_in5
- vsub.d \_out6, \_in1, \_in6
- vsub.d \_out7, \_in0, \_in7
- .endm
- .macro LASX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- xvadd.b \_out0, \_in0, \_in7
- xvadd.b \_out1, \_in1, \_in6
- xvadd.b \_out2, \_in2, \_in5
- xvadd.b \_out3, \_in3, \_in4
- xvsub.b \_out4, \_in3, \_in4
- xvsub.b \_out5, \_in2, \_in5
- xvsub.b \_out6, \_in1, \_in6
- xvsub.b \_out7, \_in0, \_in7
- .endm
- .macro LASX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- xvadd.h \_out0, \_in0, \_in7
- xvadd.h \_out1, \_in1, \_in6
- xvadd.h \_out2, \_in2, \_in5
- xvadd.h \_out3, \_in3, \_in4
- xvsub.h \_out4, \_in3, \_in4
- xvsub.h \_out5, \_in2, \_in5
- xvsub.h \_out6, \_in1, \_in6
- xvsub.h \_out7, \_in0, \_in7
- .endm
- .macro LASX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
- _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
- xvadd.w \_out0, \_in0, \_in7
- xvadd.w \_out1, \_in1, \_in6
- xvadd.w \_out2, \_in2, \_in5
- xvadd.w \_out3, \_in3, \_in4
- xvsub.w \_out4, \_in3, \_in4
- xvsub.w \_out5, \_in2, \_in5
- xvsub.w \_out6, \_in1, \_in6
- xvsub.w \_out7, \_in0, \_in7
- .endm
|