/*
 * Loongson asm helper.
 *
 * Copyright (c) 2022 Loongson Technology Corporation Limited
 * Contributed by Gu Xiwei(guxiwei-hf@loongson.cn)
 *                Shiyou Yin(yinshiyou-hf@loongson.cn)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * MAJOR version: Macro usage changes.
 * MINOR version: Add new functions, or bug fixes.
 * MICRO version: Comment changes or implementation changes.
 */
#define LML_VERSION_MAJOR 0
#define LML_VERSION_MINOR 2
#define LML_VERSION_MICRO 0

/*
 *============================================================================
 * Macros for a specific project; set them as needed.
 * The following LoongML macros are provided for reference.
 *============================================================================
 */
#define ASM_PREF
#define DEFAULT_ALIGN 5

.macro function name, align=DEFAULT_ALIGN
.macro endfunc
    jirl    $r0, $r1, 0x0
    .size ASM_PREF\name, . - ASM_PREF\name
    .purgem endfunc
.endm
.text ;
.align \align ;
.globl ASM_PREF\name ;
.type  ASM_PREF\name, @function ;
ASM_PREF\name: ;
.endm
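
/*
 * Usage sketch (the function name and body are hypothetical): declare an
 * exported function and let endfunc emit the return and size directive.
 *
 *     function ff_example_lsx
 *         vld    vr0, a0, 0
 *         vst    vr0, a1, 0
 *     endfunc
 */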

/**
 * Attention: if align is not zero, the macro keeps the stack adjustment
 * in t7 until clean_stack restores sp at the end of the function.
 */
.macro alloc_stack size, align=0
.if \align
    .macro clean_stack
        add.d   sp, sp, t7
    .endm
    addi.d  sp, sp, - \size
    andi.d  t7, sp, \align - 1
    sub.d   sp, sp, t7
    addi.d  t7, t7, \size
.else
    .macro clean_stack
        addi.d  sp, sp, \size
    .endm
    addi.d  sp, sp, - \size
.endif
.endm
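
/*
 * Usage sketch (the size and alignment are chosen only for illustration):
 * reserve 64 bytes of stack aligned to 16 bytes, then release it before
 * returning.
 *
 *     alloc_stack 64, 16
 *     ...
 *     clean_stack
 */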

.macro const name, align=DEFAULT_ALIGN
    .macro endconst
    .size  \name, . - \name
    .purgem endconst
    .endm
.section .rodata
.align   \align
\name:
.endm
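
/*
 * Usage sketch (the label and data are hypothetical): place a constant
 * table in .rodata and mark its size with endconst.
 *
 *     const example_shuf
 *         .byte 0, 2, 4, 6, 8, 10, 12, 14
 *     endconst
 */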

/*
 *============================================================================
 * LoongArch register alias
 *============================================================================
 */
#define a0   $a0
#define a1   $a1
#define a2   $a2
#define a3   $a3
#define a4   $a4
#define a5   $a5
#define a6   $a6
#define a7   $a7
#define t0   $t0
#define t1   $t1
#define t2   $t2
#define t3   $t3
#define t4   $t4
#define t5   $t5
#define t6   $t6
#define t7   $t7
#define t8   $t8
#define s0   $s0
#define s1   $s1
#define s2   $s2
#define s3   $s3
#define s4   $s4
#define s5   $s5
#define s6   $s6
#define s7   $s7
#define s8   $s8
#define zero $zero
#define sp   $sp
#define ra   $ra
#define f0   $f0
#define f1   $f1
#define f2   $f2
#define f3   $f3
#define f4   $f4
#define f5   $f5
#define f6   $f6
#define f7   $f7
#define f8   $f8
#define f9   $f9
#define f10  $f10
#define f11  $f11
#define f12  $f12
#define f13  $f13
#define f14  $f14
#define f15  $f15
#define f16  $f16
#define f17  $f17
#define f18  $f18
#define f19  $f19
#define f20  $f20
#define f21  $f21
#define f22  $f22
#define f23  $f23
#define f24  $f24
#define f25  $f25
#define f26  $f26
#define f27  $f27
#define f28  $f28
#define f29  $f29
#define f30  $f30
#define f31  $f31
#define vr0  $vr0
#define vr1  $vr1
#define vr2  $vr2
#define vr3  $vr3
#define vr4  $vr4
#define vr5  $vr5
#define vr6  $vr6
#define vr7  $vr7
#define vr8  $vr8
#define vr9  $vr9
#define vr10 $vr10
#define vr11 $vr11
#define vr12 $vr12
#define vr13 $vr13
#define vr14 $vr14
#define vr15 $vr15
#define vr16 $vr16
#define vr17 $vr17
#define vr18 $vr18
#define vr19 $vr19
#define vr20 $vr20
#define vr21 $vr21
#define vr22 $vr22
#define vr23 $vr23
#define vr24 $vr24
#define vr25 $vr25
#define vr26 $vr26
#define vr27 $vr27
#define vr28 $vr28
#define vr29 $vr29
#define vr30 $vr30
#define vr31 $vr31
#define xr0  $xr0
#define xr1  $xr1
#define xr2  $xr2
#define xr3  $xr3
#define xr4  $xr4
#define xr5  $xr5
#define xr6  $xr6
#define xr7  $xr7
#define xr8  $xr8
#define xr9  $xr9
#define xr10 $xr10
#define xr11 $xr11
#define xr12 $xr12
#define xr13 $xr13
#define xr14 $xr14
#define xr15 $xr15
#define xr16 $xr16
#define xr17 $xr17
#define xr18 $xr18
#define xr19 $xr19
#define xr20 $xr20
#define xr21 $xr21
#define xr22 $xr22
#define xr23 $xr23
#define xr24 $xr24
#define xr25 $xr25
#define xr26 $xr26
#define xr27 $xr27
#define xr28 $xr28
#define xr29 $xr29
#define xr30 $xr30
#define xr31 $xr31

/*
 *============================================================================
 * LSX/LASX synthesized instructions
 *============================================================================
 */

/*
 * Description : Dot product of vector elements
 *               (vd[i] = vj[2i] * vk[2i] + vj[2i+1] * vk[2i+1])
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - elements of twice the input width
 */
.macro vdp2.h.bu vd, vj, vk
    vmulwev.h.bu   \vd, \vj, \vk
    vmaddwod.h.bu  \vd, \vj, \vk
.endm

.macro vdp2.h.bu.b vd, vj, vk
    vmulwev.h.bu.b  \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2.w.h vd, vj, vk
    vmulwev.w.h    \vd, \vj, \vk
    vmaddwod.w.h   \vd, \vj, \vk
.endm

.macro xvdp2.h.bu xd, xj, xk
    xvmulwev.h.bu  \xd, \xj, \xk
    xvmaddwod.h.bu \xd, \xj, \xk
.endm

.macro xvdp2.h.bu.b xd, xj, xk
    xvmulwev.h.bu.b  \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2.w.h xd, xj, xk
    xvmulwev.w.h   \xd, \xj, \xk
    xvmaddwod.w.h  \xd, \xj, \xk
.endm
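
/*
 * Usage sketch (registers chosen only for illustration): compute the
 * unsigned-byte dot product of adjacent element pairs from vr1 and vr2,
 * producing halfword lanes in vr0.
 *
 *     vdp2.h.bu vr0, vr1, vr2
 */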

/*
 * Description : Dot product and accumulate of vector elements
 *               (vd[i] += vj[2i] * vk[2i] + vj[2i+1] * vk[2i+1])
 * Arguments   : Inputs  - vj, vk
 *               Outputs - vd
 *               Return Type - elements of twice the input width
 */
.macro vdp2add.h.bu vd, vj, vk
    vmaddwev.h.bu  \vd, \vj, \vk
    vmaddwod.h.bu  \vd, \vj, \vk
.endm

.macro vdp2add.h.bu.b vd, vj, vk
    vmaddwev.h.bu.b \vd, \vj, \vk
    vmaddwod.h.bu.b \vd, \vj, \vk
.endm

.macro vdp2add.w.h vd, vj, vk
    vmaddwev.w.h   \vd, \vj, \vk
    vmaddwod.w.h   \vd, \vj, \vk
.endm

.macro xvdp2add.h.bu.b xd, xj, xk
    xvmaddwev.h.bu.b \xd, \xj, \xk
    xvmaddwod.h.bu.b \xd, \xj, \xk
.endm

.macro xvdp2add.w.h xd, xj, xk
    xvmaddwev.w.h  \xd, \xj, \xk
    xvmaddwod.w.h  \xd, \xj, \xk
.endm
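
/*
 * Usage sketch (registers chosen only for illustration): add the halfword
 * dot product of vr1 and vr2 to the word accumulator already held in vr0.
 *
 *     vdp2add.w.h vr0, vr1, vr2
 */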

/*
 * Description : Clamp each element of a vector to a range
 *               clip:    out = min(max(vj, vk), va)
 *               clip255: out = min(max(vj, 0), 255)
 */
.macro vclip.h vd, vj, vk, va
    vmax.h   \vd, \vj, \vk
    vmin.h   \vd, \vd, \va
.endm

.macro vclip255.w vd, vj
    vmaxi.w  \vd, \vj, 0
    vsat.wu  \vd, \vd, 7
.endm

.macro vclip255.h vd, vj
    vmaxi.h  \vd, \vj, 0
    vsat.hu  \vd, \vd, 7
.endm

.macro xvclip.h xd, xj, xk, xa
    xvmax.h  \xd, \xj, \xk
    xvmin.h  \xd, \xd, \xa
.endm

.macro xvclip255.h xd, xj
    xvmaxi.h \xd, \xj, 0
    xvsat.hu \xd, \xd, 7
.endm

.macro xvclip255.w xd, xj
    xvmaxi.w \xd, \xj, 0
    xvsat.wu \xd, \xd, 7
.endm
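
/*
 * Usage sketch (registers chosen only for illustration): clamp the word
 * elements of vr1 to the range [0, 255] and place the result in vr0.
 *
 *     vclip255.w vr0, vr1
 */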

/*
 * Description : Store elements of vector
 *               vd : Data vector to be stored
 *               rk : Address of data storage
 *               ra : Offset of address
 *               si : Index of data in vd
 */
.macro vstelmx.b vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.b  \vd, \rk, 0, \si
.endm

.macro vstelmx.h vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.h  \vd, \rk, 0, \si
.endm

.macro vstelmx.w vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.w  \vd, \rk, 0, \si
.endm

.macro vstelmx.d vd, rk, ra, si
    add.d     \rk, \rk, \ra
    vstelm.d  \vd, \rk, 0, \si
.endm

.macro vmov xd, xj
    vor.v     \xd, \xj, \xj
.endm

.macro xmov xd, xj
    xvor.v    \xd, \xj, \xj
.endm

.macro xvstelmx.d xd, rk, ra, si
    add.d     \rk, \rk, \ra
    xvstelm.d \xd, \rk, 0, \si
.endm
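
/*
 * Usage sketch (registers chosen only for illustration): advance the
 * destination pointer in a0 by the stride held in a2, then store element 0
 * of vr0 as a double-word at the new address.
 *
 *     vstelmx.d vr0, a0, a2, 0
 */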

/*
 *============================================================================
 * LSX/LASX custom macros
 *============================================================================
 */

/*
 * Load 4 float, double, 128-bit (LSX) or 256-bit (LASX) elements with a
 * stride between consecutive loads.
 */
.macro FLDS_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.s  \out0, \src, 0
    fldx.s \out1, \src, \stride
    fldx.s \out2, \src, \stride2
    fldx.s \out3, \src, \stride3
.endm

.macro FLDD_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    fld.d  \out0, \src, 0
    fldx.d \out1, \src, \stride
    fldx.d \out2, \src, \stride2
    fldx.d \out3, \src, \stride3
.endm

.macro LSX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    vld    \out0, \src, 0
    vldx   \out1, \src, \stride
    vldx   \out2, \src, \stride2
    vldx   \out3, \src, \stride3
.endm

.macro LASX_LOADX_4 src, stride, stride2, stride3, out0, out1, out2, out3
    xvld   \out0, \src, 0
    xvldx  \out1, \src, \stride
    xvldx  \out2, \src, \stride2
    xvldx  \out3, \src, \stride3
.endm
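
/*
 * Usage sketch (registers chosen only for illustration): load four LSX
 * rows starting at a0, with the row stride in a2, t0 = 2 * stride and
 * t1 = 3 * stride precomputed by the caller.
 *
 *     LSX_LOADX_4 a0, a2, t0, t1, vr0, vr1, vr2, vr3
 */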

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LSX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                          tmp0, tmp1
    vilvl.h  \tmp0, \in1, \in0
    vilvl.h  \tmp1, \in3, \in2
    vilvl.w  \out0, \tmp1, \tmp0
    vilvh.w  \out2, \tmp1, \tmp0
    vilvh.d  \out1, \out0, \out0
    vilvh.d  \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *                1, 2, 3, 4            1, 5, 9,13
 *                5, 6, 7, 8    to      2, 6,10,14
 *                9,10,11,12  =====>    3, 7,11,15
 *               13,14,15,16            4, 8,12,16
 */
.macro LSX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
                          _tmp0, _tmp1
    vilvl.w  \_tmp0, \_in1, \_in0
    vilvh.w  \_out1, \_in1, \_in0
    vilvl.w  \_tmp1, \_in3, \_in2
    vilvh.w  \_out3, \_in3, \_in2
    vilvl.d  \_out0, \_tmp1, \_tmp0
    vilvl.d  \_out2, \_out3, \_out1
    vilvh.d  \_out3, \_out3, \_out1
    vilvh.d  \_out1, \_tmp1, \_tmp0
.endm
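
/*
 * Usage sketch (registers chosen only for illustration): transpose the
 * 4x4 word matrix held in vr0-vr3 into vr4-vr7, using vr8 and vr9 as
 * scratch.
 *
 *     LSX_TRANSPOSE4x4_W vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7, vr8, vr9
 */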

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                          out2, out3, out4, out5, out6, out7, tmp0, tmp1, tmp2, \
                          tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.h   \tmp0, \in6, \in4
    vilvl.h   \tmp1, \in7, \in5
    vilvl.h   \tmp2, \in2, \in0
    vilvl.h   \tmp3, \in3, \in1
    vilvl.h   \tmp4, \tmp1, \tmp0
    vilvh.h   \tmp5, \tmp1, \tmp0
    vilvl.h   \tmp6, \tmp3, \tmp2
    vilvh.h   \tmp7, \tmp3, \tmp2
    vilvh.h   \tmp0, \in6, \in4
    vilvh.h   \tmp1, \in7, \in5
    vilvh.h   \tmp2, \in2, \in0
    vilvh.h   \tmp3, \in3, \in1
    vpickev.d \out0, \tmp4, \tmp6
    vpickod.d \out1, \tmp4, \tmp6
    vpickev.d \out2, \tmp5, \tmp7
    vpickod.d \out3, \tmp5, \tmp7
    vilvl.h   \tmp4, \tmp1, \tmp0
    vilvh.h   \tmp5, \tmp1, \tmp0
    vilvl.h   \tmp6, \tmp3, \tmp2
    vilvh.h   \tmp7, \tmp3, \tmp2
    vpickev.d \out4, \tmp4, \tmp6
    vpickod.d \out5, \tmp4, \tmp6
    vpickev.d \out6, \tmp5, \tmp7
    vpickod.d \out7, \tmp5, \tmp7
.endm
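
/*
 * Usage sketch (registers chosen only for illustration): transpose the
 * 8x8 half-word matrix held in vr0-vr7 into vr8-vr15, with vr16-vr23 as
 * scratch.
 *
 *     LSX_TRANSPOSE8x8_H vr0,  vr1,  vr2,  vr3,  vr4,  vr5,  vr6,  vr7,  \
 *                        vr8,  vr9,  vr10, vr11, vr12, vr13, vr14, vr15, \
 *                        vr16, vr17, vr18, vr19, vr20, vr21, vr22, vr23
 */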

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5, out6, out7,\
                            tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.b \tmp0, \in2, \in0
    xvilvl.b \tmp1, \in3, \in1
    xvilvl.b \tmp2, \in6, \in4
    xvilvl.b \tmp3, \in7, \in5
    xvilvl.b \tmp4, \in10, \in8
    xvilvl.b \tmp5, \in11, \in9
    xvilvl.b \tmp6, \in14, \in12
    xvilvl.b \tmp7, \in15, \in13
    xvilvl.b \out0, \tmp1, \tmp0
    xvilvh.b \out1, \tmp1, \tmp0
    xvilvl.b \out2, \tmp3, \tmp2
    xvilvh.b \out3, \tmp3, \tmp2
    xvilvl.b \out4, \tmp5, \tmp4
    xvilvh.b \out5, \tmp5, \tmp4
    xvilvl.b \out6, \tmp7, \tmp6
    xvilvh.b \out7, \tmp7, \tmp6
    xvilvl.w \tmp0, \out2, \out0
    xvilvh.w \tmp2, \out2, \out0
    xvilvl.w \tmp4, \out3, \out1
    xvilvh.w \tmp6, \out3, \out1
    xvilvl.w \tmp1, \out6, \out4
    xvilvh.w \tmp3, \out6, \out4
    xvilvl.w \tmp5, \out7, \out5
    xvilvh.w \tmp7, \out7, \out5
    xvilvl.d \out0, \tmp1, \tmp0
    xvilvh.d \out1, \tmp1, \tmp0
    xvilvl.d \out2, \tmp3, \tmp2
    xvilvh.d \out3, \tmp3, \tmp2
    xvilvl.d \out4, \tmp5, \tmp4
    xvilvh.d \out5, \tmp5, \tmp4
    xvilvl.d \out6, \tmp7, \tmp6
    xvilvh.d \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LSX_TRANSPOSE16X8_B in0, in1, in2, in3, in4, in5, in6, in7,        \
                           in8, in9, in10, in11, in12, in13, in14, in15,  \
                           out0, out1, out2, out3, out4, out5, out6, out7,\
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    vilvl.b \tmp0, \in2, \in0
    vilvl.b \tmp1, \in3, \in1
    vilvl.b \tmp2, \in6, \in4
    vilvl.b \tmp3, \in7, \in5
    vilvl.b \tmp4, \in10, \in8
    vilvl.b \tmp5, \in11, \in9
    vilvl.b \tmp6, \in14, \in12
    vilvl.b \tmp7, \in15, \in13
    vilvl.b \out0, \tmp1, \tmp0
    vilvh.b \out1, \tmp1, \tmp0
    vilvl.b \out2, \tmp3, \tmp2
    vilvh.b \out3, \tmp3, \tmp2
    vilvl.b \out4, \tmp5, \tmp4
    vilvh.b \out5, \tmp5, \tmp4
    vilvl.b \out6, \tmp7, \tmp6
    vilvh.b \out7, \tmp7, \tmp6
    vilvl.w \tmp0, \out2, \out0
    vilvh.w \tmp2, \out2, \out0
    vilvl.w \tmp4, \out3, \out1
    vilvh.w \tmp6, \out3, \out1
    vilvl.w \tmp1, \out6, \out4
    vilvh.w \tmp3, \out6, \out4
    vilvl.w \tmp5, \out7, \out5
    vilvh.w \tmp7, \out7, \out5
    vilvl.d \out0, \tmp1, \tmp0
    vilvh.d \out1, \tmp1, \tmp0
    vilvl.d \out2, \tmp3, \tmp2
    vilvh.d \out3, \tmp3, \tmp2
    vilvl.d \out4, \tmp5, \tmp4
    vilvh.d \out5, \tmp5, \tmp4
    vilvl.d \out6, \tmp7, \tmp6
    vilvh.d \out7, \tmp7, \tmp6
.endm

/*
 * Description : Transpose 4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in1, \in0
    xvilvl.h \tmp1, \in3, \in2
    xvilvl.w \out0, \tmp1, \tmp0
    xvilvh.w \out2, \tmp1, \tmp0
    xvilvh.d \out1, \out0, \out0
    xvilvh.d \out3, \out0, \out2
.endm

/*
 * Description : Transpose 4x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE4x8_H in0, in1, in2, in3, out0, out1, out2, out3, \
                           tmp0, tmp1
    xvilvl.h \tmp0, \in2, \in0
    xvilvl.h \tmp1, \in3, \in1
    xvilvl.h \out2, \tmp1, \tmp0
    xvilvh.h \out3, \tmp1, \tmp0
    xvilvl.d \out0, \out2, \out2
    xvilvh.d \out1, \out2, \out2
    xvilvl.d \out2, \out3, \out3
    xvilvh.d \out3, \out3, \out3
.endm

/*
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
 *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
 */
.macro LASX_TRANSPOSE8x8_H in0, in1, in2, in3, in4, in5, in6, in7,         \
                           out0, out1, out2, out3, out4, out5, out6, out7, \
                           tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7
    xvilvl.h   \tmp0, \in6, \in4
    xvilvl.h   \tmp1, \in7, \in5
    xvilvl.h   \tmp2, \in2, \in0
    xvilvl.h   \tmp3, \in3, \in1
    xvilvl.h   \tmp4, \tmp1, \tmp0
    xvilvh.h   \tmp5, \tmp1, \tmp0
    xvilvl.h   \tmp6, \tmp3, \tmp2
    xvilvh.h   \tmp7, \tmp3, \tmp2
    xvilvh.h   \tmp0, \in6, \in4
    xvilvh.h   \tmp1, \in7, \in5
    xvilvh.h   \tmp2, \in2, \in0
    xvilvh.h   \tmp3, \in3, \in1
    xvpickev.d \out0, \tmp4, \tmp6
    xvpickod.d \out1, \tmp4, \tmp6
    xvpickev.d \out2, \tmp5, \tmp7
    xvpickod.d \out3, \tmp5, \tmp7
    xvilvl.h   \tmp4, \tmp1, \tmp0
    xvilvh.h   \tmp5, \tmp1, \tmp0
    xvilvl.h   \tmp6, \tmp3, \tmp2
    xvilvh.h   \tmp7, \tmp3, \tmp2
    xvpickev.d \out4, \tmp4, \tmp6
    xvpickod.d \out5, \tmp4, \tmp6
    xvpickev.d \out6, \tmp5, \tmp7
    xvpickod.d \out7, \tmp5, \tmp7
.endm

/*
 * Description : Transpose 2x4x4 block with half-word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 */
.macro LASX_TRANSPOSE2x4x4_H in0, in1, in2, in3, out0, out1, out2, out3, \
                             tmp0, tmp1, tmp2
    xvilvh.h \tmp1, \in0, \in1
    xvilvl.h \out1, \in0, \in1
    xvilvh.h \tmp0, \in2, \in3
    xvilvl.h \out3, \in2, \in3
    xvilvh.w \tmp2, \out3, \out1
    xvilvl.w \out3, \out3, \out1
    xvilvl.w \out2, \tmp0, \tmp1
    xvilvh.w \tmp1, \tmp0, \tmp1
    xvilvh.d \out0, \out2, \out3
    xvilvl.d \out2, \out2, \out3
    xvilvh.d \out1, \tmp1, \tmp2
    xvilvl.d \out3, \tmp1, \tmp2
.endm

/*
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - in0, in1, in2, in3
 *               Outputs - out0, out1, out2, out3
 * Details     :
 * Example     :
 *                1, 2, 3, 4,  1, 2, 3, 4            1,5, 9,13,  1,5, 9,13
 *                5, 6, 7, 8,  5, 6, 7, 8    to      2,6,10,14,  2,6,10,14
 *                9,10,11,12,  9,10,11,12  =====>    3,7,11,15,  3,7,11,15
 *               13,14,15,16, 13,14,15,16            4,8,12,16,  4,8,12,16
 */
.macro LASX_TRANSPOSE4x4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
                           _tmp0, _tmp1
    xvilvl.w \_tmp0, \_in1, \_in0
    xvilvh.w \_out1, \_in1, \_in0
    xvilvl.w \_tmp1, \_in3, \_in2
    xvilvh.w \_out3, \_in3, \_in2
    xvilvl.d \_out0, \_tmp1, \_tmp0
    xvilvl.d \_out2, \_out3, \_out1
    xvilvh.d \_out3, \_out3, \_out1
    xvilvh.d \_out1, \_tmp1, \_tmp0
.endm

/*
 * Description : Transpose 8x8 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Example     : LASX_TRANSPOSE8x8_W
 *               _in0 : 1,2,3,4,5,6,7,8
 *               _in1 : 2,2,3,4,5,6,7,8
 *               _in2 : 3,2,3,4,5,6,7,8
 *               _in3 : 4,2,3,4,5,6,7,8
 *               _in4 : 5,2,3,4,5,6,7,8
 *               _in5 : 6,2,3,4,5,6,7,8
 *               _in6 : 7,2,3,4,5,6,7,8
 *               _in7 : 8,2,3,4,5,6,7,8
 *
 *               _out0 : 1,2,3,4,5,6,7,8
 *               _out1 : 2,2,2,2,2,2,2,2
 *               _out2 : 3,3,3,3,3,3,3,3
 *               _out3 : 4,4,4,4,4,4,4,4
 *               _out4 : 5,5,5,5,5,5,5,5
 *               _out5 : 6,6,6,6,6,6,6,6
 *               _out6 : 7,7,7,7,7,7,7,7
 *               _out7 : 8,8,8,8,8,8,8,8
 */
.macro LASX_TRANSPOSE8x8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,         \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7, \
                           _tmp0, _tmp1, _tmp2, _tmp3
    xvilvl.w  \_tmp0, \_in2, \_in0
    xvilvl.w  \_tmp1, \_in3, \_in1
    xvilvh.w  \_tmp2, \_in2, \_in0
    xvilvh.w  \_tmp3, \_in3, \_in1
    xvilvl.w  \_out0, \_tmp1, \_tmp0
    xvilvh.w  \_out1, \_tmp1, \_tmp0
    xvilvl.w  \_out2, \_tmp3, \_tmp2
    xvilvh.w  \_out3, \_tmp3, \_tmp2
    xvilvl.w  \_tmp0, \_in6, \_in4
    xvilvl.w  \_tmp1, \_in7, \_in5
    xvilvh.w  \_tmp2, \_in6, \_in4
    xvilvh.w  \_tmp3, \_in7, \_in5
    xvilvl.w  \_out4, \_tmp1, \_tmp0
    xvilvh.w  \_out5, \_tmp1, \_tmp0
    xvilvl.w  \_out6, \_tmp3, \_tmp2
    xvilvh.w  \_out7, \_tmp3, \_tmp2
    xmov      \_tmp0, \_out0
    xmov      \_tmp1, \_out1
    xmov      \_tmp2, \_out2
    xmov      \_tmp3, \_out3
    xvpermi.q \_out0, \_out4, 0x02
    xvpermi.q \_out1, \_out5, 0x02
    xvpermi.q \_out2, \_out6, 0x02
    xvpermi.q \_out3, \_out7, 0x02
    xvpermi.q \_out4, \_tmp0, 0x31
    xvpermi.q \_out5, \_tmp1, 0x31
    xvpermi.q \_out6, \_tmp2, 0x31
    xvpermi.q \_out7, \_tmp3, 0x31
.endm

/*
 * Description : Transpose 4x4 block with double-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Example     : LASX_TRANSPOSE4x4_D
 *               _in0 : 1,2,3,4
 *               _in1 : 1,2,3,4
 *               _in2 : 1,2,3,4
 *               _in3 : 1,2,3,4
 *
 *               _out0 : 1,1,1,1
 *               _out1 : 2,2,2,2
 *               _out2 : 3,3,3,3
 *               _out3 : 4,4,4,4
 */
.macro LASX_TRANSPOSE4x4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3, \
                           _tmp0, _tmp1
    xvilvl.d  \_tmp0, \_in1, \_in0
    xvilvh.d  \_out1, \_in1, \_in0
    xvilvh.d  \_tmp1, \_in3, \_in2
    xvilvl.d  \_out2, \_in3, \_in2
    xvor.v    \_out0, \_tmp0, \_tmp0
    xvor.v    \_out3, \_tmp1, \_tmp1
    xvpermi.q \_out0, \_out2, 0x02
    xvpermi.q \_out2, \_tmp0, 0x31
    xvpermi.q \_out3, \_out1, 0x31
    xvpermi.q \_out1, \_tmp1, 0x02
.endm

/*
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation
 * Example     : LSX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 */
.macro LSX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.b  \_out0, \_in0, \_in3
    vadd.b  \_out1, \_in1, \_in2
    vsub.b  \_out2, \_in1, \_in2
    vsub.b  \_out3, \_in0, \_in3
.endm

.macro LSX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.h  \_out0, \_in0, \_in3
    vadd.h  \_out1, \_in1, \_in2
    vsub.h  \_out2, \_in1, \_in2
    vsub.h  \_out3, \_in0, \_in3
.endm

.macro LSX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.w  \_out0, \_in0, \_in3
    vadd.w  \_out1, \_in1, \_in2
    vsub.w  \_out2, \_in1, \_in2
    vsub.w  \_out3, \_in0, \_in3
.endm

.macro LSX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    vadd.d  \_out0, \_in0, \_in3
    vadd.d  \_out1, \_in1, \_in2
    vsub.d  \_out2, \_in1, \_in2
    vsub.d  \_out3, \_in0, \_in3
.endm

.macro LASX_BUTTERFLY_4_B _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.b \_out0, \_in0, \_in3
    xvadd.b \_out1, \_in1, \_in2
    xvsub.b \_out2, \_in1, \_in2
    xvsub.b \_out3, \_in0, \_in3
.endm

.macro LASX_BUTTERFLY_4_H _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.h \_out0, \_in0, \_in3
    xvadd.h \_out1, \_in1, \_in2
    xvsub.h \_out2, \_in1, \_in2
    xvsub.h \_out3, \_in0, \_in3
.endm

.macro LASX_BUTTERFLY_4_W _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.w \_out0, \_in0, \_in3
    xvadd.w \_out1, \_in1, \_in2
    xvsub.w \_out2, \_in1, \_in2
    xvsub.w \_out3, \_in0, \_in3
.endm

.macro LASX_BUTTERFLY_4_D _in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3
    xvadd.d \_out0, \_in0, \_in3
    xvadd.d \_out1, \_in1, \_in2
    xvsub.d \_out2, \_in1, \_in2
    xvsub.d \_out3, \_in0, \_in3
.endm
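
/*
 * Usage sketch (registers chosen only for illustration): compute the
 * half-word butterfly of vr0-vr3 into vr4-vr7, i.e. vr4 = vr0 + vr3,
 * vr5 = vr1 + vr2, vr6 = vr1 - vr2, vr7 = vr0 - vr3.
 *
 *     LSX_BUTTERFLY_4_H vr0, vr1, vr2, vr3, vr4, vr5, vr6, vr7
 */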

/*
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 */
.macro LSX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.b  \_out0, \_in0, \_in7
    vadd.b  \_out1, \_in1, \_in6
    vadd.b  \_out2, \_in2, \_in5
    vadd.b  \_out3, \_in3, \_in4
    vsub.b  \_out4, \_in3, \_in4
    vsub.b  \_out5, \_in2, \_in5
    vsub.b  \_out6, \_in1, \_in6
    vsub.b  \_out7, \_in0, \_in7
.endm

.macro LSX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.h  \_out0, \_in0, \_in7
    vadd.h  \_out1, \_in1, \_in6
    vadd.h  \_out2, \_in2, \_in5
    vadd.h  \_out3, \_in3, \_in4
    vsub.h  \_out4, \_in3, \_in4
    vsub.h  \_out5, \_in2, \_in5
    vsub.h  \_out6, \_in1, \_in6
    vsub.h  \_out7, \_in0, \_in7
.endm

.macro LSX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.w  \_out0, \_in0, \_in7
    vadd.w  \_out1, \_in1, \_in6
    vadd.w  \_out2, \_in2, \_in5
    vadd.w  \_out3, \_in3, \_in4
    vsub.w  \_out4, \_in3, \_in4
    vsub.w  \_out5, \_in2, \_in5
    vsub.w  \_out6, \_in1, \_in6
    vsub.w  \_out7, \_in0, \_in7
.endm

.macro LSX_BUTTERFLY_8_D _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                         _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    vadd.d  \_out0, \_in0, \_in7
    vadd.d  \_out1, \_in1, \_in6
    vadd.d  \_out2, \_in2, \_in5
    vadd.d  \_out3, \_in3, \_in4
    vsub.d  \_out4, \_in3, \_in4
    vsub.d  \_out5, \_in2, \_in5
    vsub.d  \_out6, \_in1, \_in6
    vsub.d  \_out7, \_in0, \_in7
.endm

.macro LASX_BUTTERFLY_8_B _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    xvadd.b \_out0, \_in0, \_in7
    xvadd.b \_out1, \_in1, \_in6
    xvadd.b \_out2, \_in2, \_in5
    xvadd.b \_out3, \_in3, \_in4
    xvsub.b \_out4, \_in3, \_in4
    xvsub.b \_out5, \_in2, \_in5
    xvsub.b \_out6, \_in1, \_in6
    xvsub.b \_out7, \_in0, \_in7
.endm

.macro LASX_BUTTERFLY_8_H _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    xvadd.h \_out0, \_in0, \_in7
    xvadd.h \_out1, \_in1, \_in6
    xvadd.h \_out2, \_in2, \_in5
    xvadd.h \_out3, \_in3, \_in4
    xvsub.h \_out4, \_in3, \_in4
    xvsub.h \_out5, \_in2, \_in5
    xvsub.h \_out6, \_in1, \_in6
    xvsub.h \_out7, \_in0, \_in7
.endm

.macro LASX_BUTTERFLY_8_W _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, _out7
    xvadd.w \_out0, \_in0, \_in7
    xvadd.w \_out1, \_in1, \_in6
    xvadd.w \_out2, \_in2, \_in5
    xvadd.w \_out3, \_in3, \_in4
    xvsub.w \_out4, \_in3, \_in4
    xvsub.w \_out5, \_in2, \_in5
    xvsub.w \_out6, \_in1, \_in6
    xvsub.w \_out7, \_in0, \_in7
.endm