  1. /*
  2. * Copyright (c) 2022 Jonathan Swinney <jswinney@amazon.com>
  3. *
  4. * This file is part of FFmpeg.
  5. *
  6. * FFmpeg is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2.1 of the License, or (at your option) any later version.
  10. *
  11. * FFmpeg is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with FFmpeg; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "libavutil/aarch64/asm.S"
  21. function ff_pix_abs16_neon, export=1
  22. // x0 unused
  23. // x1 uint8_t *pix1
  24. // x2 uint8_t *pix2
  25. // x3 ptrdiff_t stride
  26. // w4 int h
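// A scalar sketch of the sum of absolute differences this routine vectorizes.
// Illustrative only: the helper name and exact prototype are ours, assuming
// <stdint.h>, <stddef.h> and <stdlib.h>.
//
//   static int pix_abs16_ref(const uint8_t *pix1, const uint8_t *pix2,
//                            ptrdiff_t stride, int h)
//   {
//       int sum = 0;
//       for (int i = 0; i < h; i++) {
//           for (int j = 0; j < 16; j++)
//               sum += abs(pix1[j] - pix2[j]);   // per-byte absolute difference
//           pix1 += stride;                      // step both blocks down one row
//           pix2 += stride;
//       }
//       return sum;
//   }
//
// The uabal/uabal2 pairs below accumulate these per-byte differences into
// 16-bit lanes; uaddlv folds the lanes into the final scalar.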
  27. cmp w4, #4 // if h < 4, jump to completion section
  28. movi v16.8h, #0 // clear result accumulator
  29. movi v17.8h, #0 // clear result accumulator
  30. b.lt 2f
  31. 1:
  32. ld1 {v0.16b}, [x1], x3 // load pix1
  33. ld1 {v4.16b}, [x2], x3 // load pix2
  34. ld1 {v1.16b}, [x1], x3 // load pix1
  35. ld1 {v5.16b}, [x2], x3 // load pix2
  36. uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
  37. uabal2 v17.8h, v0.16b, v4.16b
  38. ld1 {v2.16b}, [x1], x3 // load pix1
  39. ld1 {v6.16b}, [x2], x3 // load pix2
  40. uabal v16.8h, v1.8b, v5.8b // absolute difference accumulate
  41. uabal2 v17.8h, v1.16b, v5.16b
  42. ld1 {v3.16b}, [x1], x3
  43. ld1 {v7.16b}, [x2], x3
  44. uabal v16.8h, v2.8b, v6.8b
  45. uabal2 v17.8h, v2.16b, v6.16b
  46. sub w4, w4, #4 // h -= 4
  47. uabal v16.8h, v3.8b, v7.8b
  48. uabal2 v17.8h, v3.16b, v7.16b
  49. cmp w4, #4 // if h >= 4, loop
  50. b.ge 1b
  51. cbnz w4, 2f // if iterations remain, jump to completion section
  52. add v16.8h, v16.8h, v17.8h
  53. uaddlv s16, v16.8h // add up everything in v16 accumulator
  54. fmov w0, s16 // copy result to general purpose register
  55. ret
  56. 2:
  57. ld1 {v0.16b}, [x1], x3 // load pix1
  58. ld1 {v4.16b}, [x2], x3 // load pix2
  59. subs w4, w4, #1 // h -= 1
  60. uabal v16.8h, v0.8b, v4.8b // absolute difference accumulate
  61. uabal2 v17.8h, v0.16b, v4.16b
  62. b.ne 2b
  63. add v16.8h, v16.8h, v17.8h
  64. uaddlv s16, v16.8h // add up everything in v16 accumulator
  65. fmov w0, s16 // copy result to general purpose register
  66. ret
  67. endfunc
  68. function ff_pix_abs8_neon, export=1
  69. // x0 unused
  70. // x1 uint8_t *pix1
  71. // x2 uint8_t *pix2
  72. // x3 ptrdiff_t stride
  73. // w4 int h
  74. movi v30.8h, #0
  75. cmp w4, #4
  76. b.lt 2f
  77. // make 4 iterations at once
  78. 1:
  79. ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
  80. ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration
  81. ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
  82. uabal v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
  83. ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
  84. ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
  85. uabal v30.8h, v2.8b, v3.8b // Absolute difference, second iteration
  86. ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
  87. sub w4, w4, #4 // h -= 4
ld1 {v6.8b}, [x1], x3 // Load pix1 for fourth iteration
  89. ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
  90. uabal v30.8h, v4.8b, v5.8b // Absolute difference, third iteration
  91. cmp w4, #4
uabal v30.8h, v6.8b, v7.8b // Absolute difference, fourth iteration
  93. b.ge 1b
  94. cbz w4, 3f
  95. // iterate by one
  96. 2:
  97. ld1 {v0.8b}, [x1], x3 // Load pix1
  98. ld1 {v1.8b}, [x2], x3 // Load pix2
  99. subs w4, w4, #1
  100. uabal v30.8h, v0.8b, v1.8b
  101. b.ne 2b
  102. 3:
  103. uaddlv s20, v30.8h // Add up vector
  104. fmov w0, s20
  105. ret
  106. endfunc
  107. function ff_pix_abs8_x2_neon, export=1
  108. // x0 unused
  109. // x1 uint8_t *pix1
  110. // x2 uint8_t *pix2
  111. // x3 ptrdiff_t stride
  112. // w4 int h
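// Scalar sketch of the horizontal half-pel ("_x2") score: each pix2 byte is
// averaged with its right-hand neighbour before the SAD. Illustrative C only;
// pix1, pix2, stride and h are the arguments documented above.
//
//   #define avg2(a, b) (((a) + (b) + 1) >> 1)
//   int sum = 0;
//   for (int i = 0; i < h; i++) {
//       for (int j = 0; j < 8; j++)
//           sum += abs(pix1[j] - avg2(pix2[j], pix2[j + 1]));
//       pix1 += stride;
//       pix2 += stride;
//   }
//   return sum;
//
// x5 = pix2 + 1 below supplies the shifted row and urhadd implements avg2().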
  113. cmp w4, #4
  114. movi v26.8h, #0
  115. add x5, x2, #1 // pix2 + 1
  116. b.lt 2f
  117. // make 4 iterations at once
  118. 1:
  119. ld1 {v1.8b}, [x2], x3
  120. ld1 {v2.8b}, [x5], x3
  121. ld1 {v0.8b}, [x1], x3
  122. ld1 {v4.8b}, [x2], x3
  123. urhadd v30.8b, v1.8b, v2.8b
  124. ld1 {v5.8b}, [x5], x3
  125. uabal v26.8h, v0.8b, v30.8b
  126. ld1 {v6.8b}, [x1], x3
  127. urhadd v29.8b, v4.8b, v5.8b
  128. ld1 {v7.8b}, [x2], x3
  129. ld1 {v20.8b}, [x5], x3
  130. uabal v26.8h, v6.8b, v29.8b
  131. ld1 {v21.8b}, [x1], x3
  132. urhadd v28.8b, v7.8b, v20.8b
  133. ld1 {v22.8b}, [x2], x3
  134. ld1 {v23.8b}, [x5], x3
  135. uabal v26.8h, v21.8b, v28.8b
  136. sub w4, w4, #4
  137. ld1 {v24.8b}, [x1], x3
  138. urhadd v27.8b, v22.8b, v23.8b
  139. cmp w4, #4
  140. uabal v26.8h, v24.8b, v27.8b
  141. b.ge 1b
  142. cbz w4, 3f
  143. // iterate by one
  144. 2:
  145. ld1 {v1.8b}, [x2], x3
  146. ld1 {v2.8b}, [x5], x3
  147. ld1 {v0.8b}, [x1], x3
  148. urhadd v30.8b, v1.8b, v2.8b
  149. subs w4, w4, #1
  150. uabal v26.8h, v0.8b, v30.8b
  151. b.ne 2b
  152. 3:
  153. uaddlv s20, v26.8h
  154. fmov w0, s20
  155. ret
  156. endfunc
  157. function ff_pix_abs8_y2_neon, export=1
  158. // x0 unused
  159. // x1 uint8_t *pix1
  160. // x2 uint8_t *pix2
  161. // x3 ptrdiff_t stride
  162. // w4 int h
  163. cmp w4, #4
  164. movi v26.8h, #0
  165. ld1 {v1.8b}, [x2], x3
  166. b.lt 2f
  167. // make 4 iterations at once
  168. 1:
  169. ld1 {v2.8b}, [x2], x3
  170. ld1 {v0.8b}, [x1], x3
  171. urhadd v30.8b, v1.8b, v2.8b
  172. ld1 {v5.8b}, [x2], x3
  173. ld1 {v6.8b}, [x1], x3
  174. uabal v26.8h, v0.8b, v30.8b
  175. urhadd v29.8b, v2.8b, v5.8b
  176. ld1 {v20.8b}, [x2], x3
  177. ld1 {v21.8b}, [x1], x3
  178. uabal v26.8h, v6.8b, v29.8b
  179. urhadd v28.8b, v5.8b, v20.8b
  180. ld1 {v1.8b}, [x2], x3
  181. ld1 {v24.8b}, [x1], x3
  182. urhadd v27.8b, v20.8b, v1.8b
  183. sub w4, w4, #4
  184. uabal v26.8h, v21.8b, v28.8b
  185. cmp w4, #4
  186. uabal v26.8h, v24.8b, v27.8b
  187. b.ge 1b
  188. cbz w4, 3f
  189. // iterate by one
  190. 2:
  191. ld1 {v0.8b}, [x1], x3
  192. ld1 {v2.8b}, [x2], x3
  193. urhadd v30.8b, v1.8b, v2.8b
  194. subs w4, w4, #1
  195. uabal v26.8h, v0.8b, v30.8b
  196. mov v1.8b, v2.8b
  197. b.ne 2b
  198. 3:
  199. uaddlv s20, v26.8h
  200. fmov w0, s20
  201. ret
  202. endfunc
  203. function ff_pix_abs8_xy2_neon, export=1
  204. // x0 unused
  205. // x1 uint8_t *pix1
  206. // x2 uint8_t *pix2
  207. // x3 ptrdiff_t stride
  208. // w4 int h
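// Scalar sketch of the two-dimensional half-pel ("_xy2") score: pix2 is the
// rounded average of a 2x2 neighbourhood. Illustrative C only.
//
//   #define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
//   int sum = 0;
//   for (int i = 0; i < h; i++) {
//       for (int j = 0; j < 8; j++)
//           sum += abs(pix1[j] - avg4(pix2[j],          pix2[j + 1],
//                                     pix2[j + stride], pix2[j + stride + 1]));
//       pix1 += stride;
//       pix2 += stride;
//   }
//   return sum;
//
// The widened sum pix2[j] + pix2[j + 1] of the current row is kept in v2, so
// each new row needs only the pix1 and pix3/pix3+1 loads; rshrn #2 performs
// the rounded division by 4.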
  209. movi v31.8h, #0
add x0, x2, #1 // pix2 + 1
add x5, x2, x3 // pix2 + stride = pix3
cmp w4, #4
add x6, x5, #1 // pix3 + 1
  214. ld1 {v0.8b}, [x2], x3
  215. ld1 {v1.8b}, [x0], x3
  216. uaddl v2.8h, v0.8b, v1.8b
  217. b.lt 2f
  218. // make 4 iterations at once
  219. 1:
  220. ld1 {v4.8b}, [x5], x3
  221. ld1 {v5.8b}, [x6], x3
  222. ld1 {v7.8b}, [x5], x3
  223. uaddl v0.8h, v4.8b, v5.8b
  224. ld1 {v16.8b}, [x6], x3
  225. add v4.8h, v0.8h, v2.8h
  226. ld1 {v5.8b}, [x1], x3
  227. rshrn v4.8b, v4.8h, #2
  228. uaddl v7.8h, v7.8b, v16.8b
  229. uabal v31.8h, v5.8b, v4.8b
  230. add v2.8h, v0.8h, v7.8h
  231. ld1 {v17.8b}, [x1], x3
  232. rshrn v2.8b, v2.8h, #2
  233. ld1 {v20.8b}, [x5], x3
  234. uabal v31.8h, v17.8b, v2.8b
  235. ld1 {v21.8b}, [x6], x3
  236. ld1 {v25.8b}, [x5], x3
  237. uaddl v20.8h, v20.8b, v21.8b
  238. ld1 {v26.8b}, [x6], x3
  239. add v7.8h, v7.8h, v20.8h
  240. uaddl v25.8h, v25.8b, v26.8b
  241. rshrn v7.8b, v7.8h, #2
  242. ld1 {v22.8b}, [x1], x3
  243. mov v2.16b, v25.16b
  244. uabal v31.8h, v22.8b, v7.8b
  245. add v20.8h, v20.8h, v25.8h
  246. ld1 {v27.8b}, [x1], x3
  247. sub w4, w4, #4
  248. rshrn v20.8b, v20.8h, #2
  249. cmp w4, #4
  250. uabal v31.8h, v27.8b, v20.8b
  251. b.ge 1b
  252. cbz w4, 3f
  253. // iterate by one
  254. 2:
  255. ld1 {v0.8b}, [x5], x3
  256. ld1 {v1.8b}, [x6], x3
  257. ld1 {v4.8b}, [x1], x3
  258. uaddl v21.8h, v0.8b, v1.8b
  259. subs w4, w4, #1
  260. add v3.8h, v2.8h, v21.8h
  261. mov v2.16b, v21.16b
  262. rshrn v3.8b, v3.8h, #2
  263. uabal v31.8h, v4.8b, v3.8b
  264. b.ne 2b
  265. 3:
  266. uaddlv s18, v31.8h
  267. fmov w0, s18
  268. ret
  269. endfunc
  270. function ff_pix_abs16_xy2_neon, export=1
  271. // x0 unused
  272. // x1 uint8_t *pix1
  273. // x2 uint8_t *pix2
  274. // x3 ptrdiff_t stride
  275. // w4 int h
  276. add x5, x2, x3 // use x5 to hold uint8_t *pix3
  277. movi v21.8h, #0 // initialize the result register
  278. movi v22.8h, #0 // initialize the result register
  279. // Load initial pix2 values for either the unrolled version or completion version.
  280. ldur q4, [x2, #1] // load pix2+1
  281. ldr q3, [x2] // load pix2
  282. uaddl v2.8h, v4.8b, v3.8b // pix2 + pix2+1 0..7
  283. uaddl2 v3.8h, v4.16b, v3.16b // pix2 + pix2+1 8..15
  284. cmp w4, #4 // if h < 4 jump to the completion version
  285. b.lt 2f
  286. 1:
// This is an unrolled implementation. It completes 4 iterations of the C loop in each pass.
// The pix2 row of one iteration is the pix3 row of the previous one, so only three new loads
// are needed per iteration, plus the two pix2 loads above to start.
  290. ldur q5, [x5, #1] // load pix3+1
  291. ld1 {v4.16b}, [x5], x3 // load pix3
  292. ld1 {v1.16b}, [x1], x3 // load pix1
  293. ldur q7, [x5, #1] // load pix3+1
  294. ld1 {v6.16b}, [x5], x3 // load pix3
  295. ld1 {v16.16b}, [x1], x3 // load pix1
  296. // These blocks compute the average: avg(pix2[n], pix2[n+1], pix3[n], pix3[n+1])
  297. uaddl v30.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
  298. uaddl2 v31.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
  299. ldur q19, [x5, #1] // load pix3+1
  300. add v23.8h, v2.8h, v30.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
  301. add v24.8h, v3.8h, v31.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
  302. ld1 {v18.16b}, [x5], x3 // load pix3
  303. ld1 {v17.16b}, [x1], x3 // load pix1
  304. rshrn v23.8b, v23.8h, #2 // shift right 2 0..7 (rounding shift right)
  305. rshrn2 v23.16b, v24.8h, #2 // shift right 2 8..15
  306. uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
  307. uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
  308. ldur q7, [x5, #1] // load pix3+1
  309. add v26.8h, v30.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
  310. add v27.8h, v31.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
  311. uabal v21.8h, v1.8b, v23.8b // absolute difference 0..7, i=0
  312. uabal2 v22.8h, v1.16b, v23.16b // absolute difference 8..15, i=0
  313. ld1 {v6.16b}, [x5], x3 // load pix3
  314. ld1 {v20.16b}, [x1], x3 // load pix1
  315. rshrn v26.8b, v26.8h, #2 // shift right 2 0..7 (rounding shift right)
  316. rshrn2 v26.16b, v27.8h, #2 // shift right 2 8..15
  317. uaddl v4.8h, v18.8b, v19.8b // pix3 + pix3+1 0..7
  318. uaddl2 v5.8h, v18.16b, v19.16b // pix3 + pix3+1 8..15
  319. add v28.8h, v2.8h, v4.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
  320. add v29.8h, v3.8h, v5.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
  321. rshrn v28.8b, v28.8h, #2 // shift right 2 0..7 (rounding shift right)
  322. rshrn2 v28.16b, v29.8h, #2 // shift right 2 8..15
  323. uabal v21.8h, v16.8b, v26.8b // absolute difference 0..7, i=1
  324. uabal2 v22.8h, v16.16b, v26.16b // absolute difference 8..15, i=1
  325. uaddl v2.8h, v6.8b, v7.8b // pix3 + pix3+1 0..7
  326. uaddl2 v3.8h, v6.16b, v7.16b // pix3 + pix3+1 8..15
  327. add v30.8h, v4.8h, v2.8h // add up 0..7, using pix2 + pix2+1 values from pix3 above
  328. add v31.8h, v5.8h, v3.8h // add up 8..15, using pix2 + pix2+1 values from pix3 above
  329. rshrn v30.8b, v30.8h, #2 // shift right 2 0..7 (rounding shift right)
  330. rshrn2 v30.16b, v31.8h, #2 // shift right 2 8..15
  331. sub w4, w4, #4 // h -= 4
  332. uabal v21.8h, v17.8b, v28.8b // absolute difference 0..7, i=2
  333. uabal2 v22.8h, v17.16b, v28.16b // absolute difference 8..15, i=2
  334. cmp w4, #4 // loop if h >= 4
  335. uabal v21.8h, v20.8b, v30.8b // absolute difference 0..7, i=3
  336. uabal2 v22.8h, v20.16b, v30.16b // absolute difference 8..15, i=3
  337. b.ge 1b
  338. cbnz w4, 2f // if iterations remain jump to completion section
  339. add v4.8h, v21.8h, v22.8h
  340. uaddlv s0, v4.8h // finish adding up accumulated values
  341. fmov w0, s0 // copy result to general purpose register
  342. ret
  343. 2:
// v2 and v3 are set either at the end of this loop or from the unrolled version
  345. // which branches here to complete iterations when h % 4 != 0.
  346. ldur q5, [x5, #1] // load pix3+1
  347. ld1 {v4.16b}, [x5], x3 // load pix3
  348. ld1 {v1.16b}, [x1], x3 // load pix1
  349. subs w4, w4, #1 // decrement h
  350. uaddl v18.8h, v4.8b, v5.8b // pix3 + pix3+1 0..7
  351. uaddl2 v19.8h, v4.16b, v5.16b // pix3 + pix3+1 8..15
  352. add v16.8h, v2.8h, v18.8h // add up 0..7, using pix2 + pix2+1 values from previous iteration
  353. add v17.8h, v3.8h, v19.8h // add up 8..15, using pix2 + pix2+1 values from previous iteration
  354. // divide by 4 to compute the average of values summed above
  355. rshrn v16.8b, v16.8h, #2 // shift right by 2 0..7 (rounding shift right)
  356. rshrn2 v16.16b, v17.8h, #2 // shift right by 2 8..15
  357. uabal v21.8h, v1.8b, v16.8b // absolute difference 0..7
  358. uabal2 v22.8h, v1.16b, v16.16b // absolute difference accumulate 8..15
  359. mov v2.16b, v18.16b // pix3 -> pix2
  360. mov v3.16b, v19.16b // pix3+1 -> pix2+1
  361. b.ne 2b // loop if h > 0
  362. add v4.8h, v21.8h, v22.8h
  363. uaddlv s0, v4.8h // finish adding up accumulated values
  364. fmov w0, s0 // copy result to general purpose register
  365. ret
  366. endfunc
  367. function ff_pix_abs16_x2_neon, export=1
  368. // x0 unused
  369. // x1 uint8_t *pix1
  370. // x2 uint8_t *pix2
  371. // x3 ptrdiff_t stride
  372. // w4 int h
  373. cmp w4, #4
  374. // initialize buffers
  375. movi v16.8h, #0
  376. movi v17.8h, #0
  377. add x5, x2, #1 // pix2 + 1
  378. b.lt 2f
  379. // make 4 iterations at once
  380. 1:
  381. // abs(pix1[0] - avg2(pix2[0], pix2[1]))
  382. // avg2(a,b) = (((a) + (b) + 1) >> 1)
  383. // abs(x) = (x < 0 ? -x : x)
  384. ld1 {v1.16b}, [x2], x3
  385. ld1 {v2.16b}, [x5], x3
  386. urhadd v30.16b, v1.16b, v2.16b
  387. ld1 {v0.16b}, [x1], x3
  388. uabal v16.8h, v0.8b, v30.8b
  389. ld1 {v4.16b}, [x2], x3
  390. uabal2 v17.8h, v0.16b, v30.16b
  391. ld1 {v5.16b}, [x5], x3
  392. urhadd v29.16b, v4.16b, v5.16b
  393. ld1 {v3.16b}, [x1], x3
  394. uabal v16.8h, v3.8b, v29.8b
  395. ld1 {v7.16b}, [x2], x3
  396. uabal2 v17.8h, v3.16b, v29.16b
  397. ld1 {v22.16b}, [x5], x3
  398. urhadd v28.16b, v7.16b, v22.16b
  399. ld1 {v6.16b}, [x1], x3
  400. uabal v16.8h, v6.8b, v28.8b
  401. ld1 {v24.16b}, [x2], x3
  402. sub w4, w4, #4
  403. uabal2 v17.8h, v6.16b, v28.16b
  404. ld1 {v25.16b}, [x5], x3
  405. urhadd v27.16b, v24.16b, v25.16b
  406. ld1 {v23.16b}, [x1], x3
  407. cmp w4, #4
  408. uabal v16.8h, v23.8b, v27.8b
  409. uabal2 v17.8h, v23.16b, v27.16b
  410. b.ge 1b
  411. cbz w4, 3f
  412. // iterate by one
  413. 2:
  414. ld1 {v1.16b}, [x2], x3
  415. ld1 {v2.16b}, [x5], x3
  416. subs w4, w4, #1
  417. urhadd v29.16b, v1.16b, v2.16b
  418. ld1 {v0.16b}, [x1], x3
  419. uabal v16.8h, v0.8b, v29.8b
  420. uabal2 v17.8h, v0.16b, v29.16b
  421. b.ne 2b
  422. 3:
  423. add v16.8h, v16.8h, v17.8h
  424. uaddlv s16, v16.8h
  425. fmov w0, s16
  426. ret
  427. endfunc
  428. function ff_pix_abs16_y2_neon, export=1
  429. // x0 unused
  430. // x1 uint8_t *pix1
  431. // x2 uint8_t *pix2
  432. // x3 ptrdiff_t stride
  433. // w4 int h
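// Scalar sketch of the vertical half-pel ("_y2") score: pix2 is averaged with
// the row below it before the SAD. Illustrative C only.
//
//   #define avg2(a, b) (((a) + (b) + 1) >> 1)
//   int sum = 0;
//   for (int i = 0; i < h; i++) {
//       for (int j = 0; j < 16; j++)
//           sum += abs(pix1[j] - avg2(pix2[j], pix2[j + stride]));
//       pix1 += stride;
//       pix2 += stride;
//   }
//   return sum;
//
// Each loaded pix2 row is reused as the "row below" of the previous iteration,
// so only one new pix2 load is needed per row.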
  434. // initialize buffers
  435. ld1 {v1.16b}, [x2], x3 // Load pix2
  436. movi v29.8h, #0 // clear the accumulator
  437. movi v28.8h, #0 // clear the accumulator
  438. cmp w4, #4
  439. b.lt 2f
  440. // make 4 iterations at once
  441. 1:
// abs(pix1[0] - avg2(pix2[0], pix2[0 + stride]))
  443. // avg2(a, b) = (((a) + (b) + 1) >> 1)
  444. // abs(x) = (x < 0 ? (-x) : (x))
  445. ld1 {v2.16b}, [x2], x3 // Load pix3 for first iteration
  446. ld1 {v0.16b}, [x1], x3 // Load pix1 for first iteration
  447. urhadd v30.16b, v1.16b, v2.16b // Rounding halving add, first iteration
  448. ld1 {v5.16b}, [x2], x3 // Load pix3 for second iteration
  449. uabal v29.8h, v0.8b, v30.8b // Absolute difference of lower half, first iteration
  450. uabal2 v28.8h, v0.16b, v30.16b // Absolute difference of upper half, first iteration
  451. ld1 {v3.16b}, [x1], x3 // Load pix1 for second iteration
  452. urhadd v27.16b, v2.16b, v5.16b // Rounding halving add, second iteration
  453. ld1 {v20.16b}, [x2], x3 // Load pix3 for third iteration
  454. uabal v29.8h, v3.8b, v27.8b // Absolute difference of lower half for second iteration
  455. uabal2 v28.8h, v3.16b, v27.16b // Absolute difference of upper half for second iteration
  456. ld1 {v6.16b}, [x1], x3 // Load pix1 for third iteration
  457. urhadd v26.16b, v5.16b, v20.16b // Rounding halving add, third iteration
  458. ld1 {v1.16b}, [x2], x3 // Load pix3 for fourth iteration
  459. uabal v29.8h, v6.8b, v26.8b // Absolute difference of lower half for third iteration
  460. uabal2 v28.8h, v6.16b, v26.16b // Absolute difference of upper half for third iteration
  461. ld1 {v21.16b}, [x1], x3 // Load pix1 for fourth iteration
sub w4, w4, #4 // h -= 4
  463. urhadd v25.16b, v20.16b, v1.16b // Rounding halving add
  464. cmp w4, #4
  465. uabal v29.8h, v21.8b, v25.8b // Absolute difference of lower half for fourth iteration
  466. uabal2 v28.8h, v21.16b, v25.16b // Absolute difference of upper half for fourth iteration
  467. b.ge 1b
  468. cbz w4, 3f
  469. // iterate by one
  470. 2:
  471. ld1 {v2.16b}, [x2], x3 // Load pix3
  472. subs w4, w4, #1
  473. ld1 {v0.16b}, [x1], x3 // Load pix1
  474. urhadd v30.16b, v1.16b, v2.16b // Rounding halving add
  475. mov v1.16b, v2.16b // Shift pix3->pix2
  476. uabal v29.8h, v30.8b, v0.8b
  477. uabal2 v28.8h, v30.16b, v0.16b
  478. b.ne 2b
  479. 3:
  480. add v29.8h, v29.8h, v28.8h // Add vectors together
  481. uaddlv s16, v29.8h // Add up vector values
  482. fmov w0, s16
  483. ret
  484. endfunc
  485. function sse16_neon, export=1
  486. // x0 - unused
  487. // x1 - pix1
  488. // x2 - pix2
  489. // x3 - stride
  490. // w4 - h
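// Scalar sketch of the sum of squared errors computed here. Illustrative C
// only; not the exact FFmpeg prototype.
//
//   int sum = 0;
//   for (int i = 0; i < h; i++) {
//       for (int j = 0; j < 16; j++) {
//           int d = pix1[j] - pix2[j];
//           sum += d * d;                 // squared difference
//       }
//       pix1 += stride;
//       pix2 += stride;
//   }
//   return sum;
//
// uabd yields |d| per byte, umull/umull2 square it, and uadalp widens the
// 16-bit squares pairwise into the 32-bit accumulator v17.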
  491. cmp w4, #4
  492. movi v17.4s, #0
  493. b.lt 2f
  494. // Make 4 iterations at once
  495. 1:
  496. // res = abs(pix1[0] - pix2[0])
  497. // res * res
  498. ld1 {v0.16b}, [x1], x3 // Load pix1 vector for first iteration
  499. ld1 {v1.16b}, [x2], x3 // Load pix2 vector for first iteration
  500. ld1 {v2.16b}, [x1], x3 // Load pix1 vector for second iteration
  501. uabd v30.16b, v0.16b, v1.16b // Absolute difference, first iteration
  502. ld1 {v3.16b}, [x2], x3 // Load pix2 vector for second iteration
  503. umull v29.8h, v30.8b, v30.8b // Multiply lower half of vectors, first iteration
  504. umull2 v28.8h, v30.16b, v30.16b // Multiply upper half of vectors, first iteration
  505. uabd v27.16b, v2.16b, v3.16b // Absolute difference, second iteration
  506. uadalp v17.4s, v29.8h // Pairwise add, first iteration
  507. ld1 {v4.16b}, [x1], x3 // Load pix1 for third iteration
  508. umull v26.8h, v27.8b, v27.8b // Multiply lower half, second iteration
  509. umull2 v25.8h, v27.16b, v27.16b // Multiply upper half, second iteration
  510. ld1 {v5.16b}, [x2], x3 // Load pix2 for third iteration
  511. uadalp v17.4s, v26.8h // Pairwise add and accumulate, second iteration
  512. uabd v24.16b, v4.16b, v5.16b // Absolute difference, third iteration
  513. ld1 {v6.16b}, [x1], x3 // Load pix1 for fourth iteration
  514. uadalp v17.4s, v25.8h // Pairwise add and accumulate, second iteration
  515. umull v23.8h, v24.8b, v24.8b // Multiply lower half, third iteration
  516. umull2 v22.8h, v24.16b, v24.16b // Multiply upper half, third iteration
  517. uadalp v17.4s, v23.8h // Pairwise add and accumulate, third iteration
  518. ld1 {v7.16b}, [x2], x3 // Load pix2 for fourth iteration
  519. uadalp v17.4s, v22.8h // Pairwise add and accumulate, third iteration
  520. uabd v21.16b, v6.16b, v7.16b // Absolute difference, fourth iteration
  521. uadalp v17.4s, v28.8h // Pairwise add and accumulate, first iteration
  522. umull v20.8h, v21.8b, v21.8b // Multiply lower half, fourth iteration
  523. sub w4, w4, #4 // h -= 4
  524. umull2 v19.8h, v21.16b, v21.16b // Multiply upper half, fourth iteration
  525. uadalp v17.4s, v20.8h // Pairwise add and accumulate, fourth iteration
  526. cmp w4, #4
  527. uadalp v17.4s, v19.8h // Pairwise add and accumulate, fourth iteration
  528. b.ge 1b
  529. cbz w4, 3f
  530. // iterate by one
  531. 2:
  532. ld1 {v0.16b}, [x1], x3 // Load pix1
  533. ld1 {v1.16b}, [x2], x3 // Load pix2
  534. uabd v30.16b, v0.16b, v1.16b
  535. umull v29.8h, v30.8b, v30.8b
  536. umull2 v28.8h, v30.16b, v30.16b
  537. uadalp v17.4s, v29.8h
  538. subs w4, w4, #1
  539. uadalp v17.4s, v28.8h
  540. b.ne 2b
  541. 3:
  542. uaddlv d16, v17.4s // add up accumulator vector
  543. fmov w0, s16
  544. ret
  545. endfunc
  546. function sse8_neon, export=1
  547. // x0 - unused
  548. // x1 - pix1
  549. // x2 - pix2
  550. // x3 - stride
  551. // w4 - h
  552. movi v21.4s, #0
  553. movi v20.4s, #0
  554. cmp w4, #4
  555. b.lt 2f
  556. // make 4 iterations at once
  557. 1:
  558. // res = abs(pix1[0] - pix2[0])
  559. // res * res
  560. ld1 {v0.8b}, [x1], x3 // Load pix1 for first iteration
ld1 {v1.8b}, [x2], x3 // Load pix2 for first iteration
  562. ld1 {v2.8b}, [x1], x3 // Load pix1 for second iteration
  563. ld1 {v3.8b}, [x2], x3 // Load pix2 for second iteration
  564. uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
  565. ld1 {v4.8b}, [x1], x3 // Load pix1 for third iteration
  566. ld1 {v5.8b}, [x2], x3 // Load pix2 for third iteration
  567. uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
  568. umlal v21.4s, v30.4h, v30.4h // Multiply lower half, first iteration
  569. ld1 {v6.8b}, [x1], x3 // Load pix1 for fourth iteration
  570. ld1 {v7.8b}, [x2], x3 // Load pix2 for fourth iteration
  571. uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
  572. umlal v21.4s, v29.4h, v29.4h // Multiply lower half, second iteration
  573. umlal2 v20.4s, v30.8h, v30.8h // Multiply upper half, first iteration
  574. uabdl v27.8h, v6.8b, v7.8b // Absolute difference, fourth iteration
  575. umlal v21.4s, v28.4h, v28.4h // Multiply lower half, third iteration
  576. umlal2 v20.4s, v29.8h, v29.8h // Multiply upper half, second iteration
  577. sub w4, w4, #4 // h -= 4
  578. umlal2 v20.4s, v28.8h, v28.8h // Multiply upper half, third iteration
  579. umlal v21.4s, v27.4h, v27.4h // Multiply lower half, fourth iteration
  580. cmp w4, #4
  581. umlal2 v20.4s, v27.8h, v27.8h // Multiply upper half, fourth iteration
  582. b.ge 1b
  583. cbz w4, 3f
  584. // iterate by one
  585. 2:
  586. ld1 {v0.8b}, [x1], x3 // Load pix1
  587. ld1 {v1.8b}, [x2], x3 // Load pix2
  588. subs w4, w4, #1
  589. uabdl v30.8h, v0.8b, v1.8b
  590. umlal v21.4s, v30.4h, v30.4h
  591. umlal2 v20.4s, v30.8h, v30.8h
  592. b.ne 2b
  593. 3:
  594. add v21.4s, v21.4s, v20.4s // Add accumulator vectors together
  595. uaddlv d17, v21.4s // Add up vector
  596. fmov w0, s17
  597. ret
  598. endfunc
  599. function sse4_neon, export=1
  600. // x0 - unused
  601. // x1 - pix1
  602. // x2 - pix2
  603. // x3 - stride
  604. // w4 - h
  605. movi v16.4s, #0 // clear the result accumulator
  606. cmp w4, #4
  607. b.lt 2f
  608. // make 4 iterations at once
  609. 1:
  610. // res = abs(pix1[0] - pix2[0])
  611. // res * res
  612. ld1 {v0.s}[0], [x1], x3 // Load pix1, first iteration
  613. ld1 {v1.s}[0], [x2], x3 // Load pix2, first iteration
  614. ld1 {v2.s}[0], [x1], x3 // Load pix1, second iteration
  615. ld1 {v3.s}[0], [x2], x3 // Load pix2, second iteration
  616. uabdl v30.8h, v0.8b, v1.8b // Absolute difference, first iteration
  617. ld1 {v4.s}[0], [x1], x3 // Load pix1, third iteration
  618. ld1 {v5.s}[0], [x2], x3 // Load pix2, third iteration
  619. uabdl v29.8h, v2.8b, v3.8b // Absolute difference, second iteration
  620. umlal v16.4s, v30.4h, v30.4h // Multiply vectors, first iteration
  621. ld1 {v6.s}[0], [x1], x3 // Load pix1, fourth iteration
  622. ld1 {v7.s}[0], [x2], x3 // Load pix2, fourth iteration
  623. uabdl v28.8h, v4.8b, v5.8b // Absolute difference, third iteration
  624. umlal v16.4s, v29.4h, v29.4h // Multiply and accumulate, second iteration
  625. sub w4, w4, #4
  626. uabdl v27.8h, v6.8b, v7.8b // Absolute difference, fourth iteration
  627. umlal v16.4s, v28.4h, v28.4h // Multiply and accumulate, third iteration
  628. cmp w4, #4
  629. umlal v16.4s, v27.4h, v27.4h // Multiply and accumulate, fourth iteration
  630. b.ge 1b
  631. cbz w4, 3f
  632. // iterate by one
  633. 2:
  634. ld1 {v0.s}[0], [x1], x3 // Load pix1
  635. ld1 {v1.s}[0], [x2], x3 // Load pix2
  636. uabdl v30.8h, v0.8b, v1.8b
  637. subs w4, w4, #1
  638. umlal v16.4s, v30.4h, v30.4h
  639. b.ne 2b
  640. 3:
  641. uaddlv d17, v16.4s // Add vector
  642. fmov w0, s17
  643. ret
  644. endfunc
  645. function vsad16_neon, export=1
  646. // x0 unused
  647. // x1 uint8_t *pix1
  648. // x2 uint8_t *pix2
  649. // x3 ptrdiff_t stride
  650. // w4 int h
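// Scalar sketch of the vertical SAD score (h-1 row pairs). Illustrative C only.
//
//   int score = 0;
//   for (int y = 1; y < h; y++) {
//       for (int x = 0; x < 16; x++)
//           score += abs(pix1[x] - pix2[x] - pix1[x + stride] + pix2[x + stride]);
//       pix1 += stride;
//       pix2 += stride;
//   }
//   return score;
//
// The widened difference pix1 - pix2 of the previous row is kept in v31/v30
// and compared against the next row's difference with saba, so each row is
// loaded only once.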
  651. ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
  652. ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
  653. sub w4, w4, #1 // we need to make h-1 iterations
  654. movi v16.8h, #0
  655. cmp w4, #3 // check if we can make 3 iterations at once
  656. usubl v31.8h, v0.8b, v1.8b // Signed difference pix1[0] - pix2[0], first iteration
  657. usubl2 v30.8h, v0.16b, v1.16b // Signed difference pix1[0] - pix2[0], first iteration
  658. b.lt 2f
  659. 1:
  660. // abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
  661. ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration
  662. ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration
  663. ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration
  664. ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration
  665. usubl v29.8h, v0.8b, v1.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
  666. usubl2 v28.8h, v0.16b, v1.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], first iteration
  667. ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration
  668. ld1 {v5.16b}, [x2], x3 // Load pix2[0 + stride], third iteration
  669. usubl v27.8h, v2.8b, v3.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
  670. saba v16.8h, v31.8h, v29.8h // Signed absolute difference and accumulate the result. first iteration
  671. usubl2 v26.8h, v2.16b, v3.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], second iteration
  672. saba v16.8h, v30.8h, v28.8h // Signed absolute difference and accumulate the result. first iteration
  673. usubl v25.8h, v4.8b, v5.8b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
  674. usubl2 v24.8h, v4.16b, v5.16b // Signed difference pix1[0 + stride] - pix2[0 + stride], third iteration
  675. saba v16.8h, v29.8h, v27.8h // Signed absolute difference and accumulate the result. second iteration
  676. mov v31.16b, v25.16b
  677. saba v16.8h, v28.8h, v26.8h // Signed absolute difference and accumulate the result. second iteration
  678. sub w4, w4, #3 // h -= 3
  679. mov v30.16b, v24.16b
  680. saba v16.8h, v27.8h, v25.8h // Signed absolute difference and accumulate the result. third iteration
  681. cmp w4, #3
  682. saba v16.8h, v26.8h, v24.8h // Signed absolute difference and accumulate the result. third iteration
  683. b.ge 1b
  684. cbz w4, 3f
  685. 2:
  686. ld1 {v0.16b}, [x1], x3
  687. ld1 {v1.16b}, [x2], x3
  688. subs w4, w4, #1
  689. usubl v29.8h, v0.8b, v1.8b
  690. usubl2 v28.8h, v0.16b, v1.16b
  691. saba v16.8h, v31.8h, v29.8h
  692. mov v31.16b, v29.16b
  693. saba v16.8h, v30.8h, v28.8h
  694. mov v30.16b, v28.16b
  695. b.ne 2b
  696. 3:
  697. uaddlv s17, v16.8h
  698. fmov w0, s17
  699. ret
  700. endfunc
  701. function vsse8_neon, export=1
  702. // x0 unused
  703. // x1 uint8_t *pix1
  704. // x2 uint8_t *pix2
  705. // x3 ptrdiff_t stride
  706. // w4 int h
  707. ld1 {v0.8b}, [x1], x3 // Load pix1[0], first iteration
  708. ld1 {v1.8b}, [x2], x3 // Load pix2[0], first iteration
  709. sub w4, w4, #1 // we need to make h-1 iterations
  710. movi v16.4s, #0
  711. movi v17.4s, #0
  712. cmp w4, #3 // check if we can make 3 iterations at once
  713. usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
  714. b.lt 2f
  715. 1:
  716. // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
  717. // res = (x) * (x)
  718. ld1 {v0.8b}, [x1], x3 // Load pix1[0 + stride], first iteration
  719. ld1 {v1.8b}, [x2], x3 // Load pix2[0 + stride], first iteration
  720. ld1 {v2.8b}, [x1], x3 // Load pix1[0 + stride], second iteration
  721. ld1 {v3.8b}, [x2], x3 // Load pix2[0 + stride], second iteration
  722. usubl v29.8h, v0.8b, v1.8b
  723. usubl2 v28.8h, v0.16b, v1.16b
  724. ld1 {v4.8b}, [x1], x3 // Load pix1[0 + stride], third iteration
ld1 {v5.8b}, [x2], x3 // Load pix2[0 + stride], third iteration
  726. sabd v31.8h, v31.8h, v29.8h
  727. usubl v27.8h, v2.8b, v3.8b
  728. usubl v25.8h, v4.8b, v5.8b
  729. sabd v29.8h, v29.8h, v27.8h
  730. sabd v27.8h, v27.8h, v25.8h
  731. umlal v16.4s, v31.4h, v31.4h
  732. umlal2 v17.4s, v31.8h, v31.8h
  733. mov v31.16b, v25.16b
  734. umlal v16.4s, v29.4h, v29.4h
  735. umlal2 v17.4s, v29.8h, v29.8h
  736. sub w4, w4, #3
  737. umlal v16.4s, v27.4h, v27.4h
  738. umlal2 v17.4s, v27.8h, v27.8h
  739. cmp w4, #3
  740. b.ge 1b
  741. cbz w4, 3f
// iterate by one
  743. 2:
  744. ld1 {v0.8b}, [x1], x3
  745. ld1 {v1.8b}, [x2], x3
  746. subs w4, w4, #1
  747. usubl v29.8h, v0.8b, v1.8b
  748. sabd v31.8h, v31.8h, v29.8h
  749. umlal v16.4s, v31.4h, v31.4h
  750. umlal2 v17.4s, v31.8h, v31.8h
  751. mov v31.16b, v29.16b
  752. b.ne 2b
  753. 3:
  754. add v16.4s, v16.4s, v17.4s
  755. uaddlv d17, v16.4s
  756. fmov w0, s17
  757. ret
  758. endfunc
  759. function vsse16_neon, export=1
  760. // x0 unused
  761. // x1 uint8_t *pix1
  762. // x2 uint8_t *pix2
  763. // x3 ptrdiff_t stride
  764. // w4 int h
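// Scalar sketch of the vertical SSE score: the same second difference as in
// vsad16_neon, squared before accumulation. Illustrative C only.
//
//   int score = 0;
//   for (int y = 1; y < h; y++) {
//       for (int x = 0; x < 16; x++) {
//           int d = pix1[x] - pix2[x] - pix1[x + stride] + pix2[x + stride];
//           score += d * d;
//       }
//       pix1 += stride;
//       pix2 += stride;
//   }
//   return score;
//
// sabd produces |d| in 16-bit lanes and umlal/umlal2 square-accumulate it into
// the 32-bit accumulators v16/v17.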
  765. ld1 {v0.16b}, [x1], x3 // Load pix1[0], first iteration
  766. ld1 {v1.16b}, [x2], x3 // Load pix2[0], first iteration
  767. sub w4, w4, #1 // we need to make h-1 iterations
  768. movi v16.4s, #0
  769. movi v17.4s, #0
  770. cmp w4, #3 // check if we can make 3 iterations at once
  771. usubl v31.8h, v0.8b, v1.8b // Signed difference of pix1[0] - pix2[0], first iteration
  772. usubl2 v30.8h, v0.16b, v1.16b // Signed difference of pix1[0] - pix2[0], first iteration
  773. b.lt 2f
  774. 1:
  775. // x = abs(pix1[0] - pix2[0] - pix1[0 + stride] + pix2[0 + stride])
  776. // res = (x) * (x)
  777. ld1 {v0.16b}, [x1], x3 // Load pix1[0 + stride], first iteration
  778. ld1 {v1.16b}, [x2], x3 // Load pix2[0 + stride], first iteration
  779. ld1 {v2.16b}, [x1], x3 // Load pix1[0 + stride], second iteration
  780. ld1 {v3.16b}, [x2], x3 // Load pix2[0 + stride], second iteration
  781. usubl v29.8h, v0.8b, v1.8b
  782. usubl2 v28.8h, v0.16b, v1.16b
  783. ld1 {v4.16b}, [x1], x3 // Load pix1[0 + stride], third iteration
ld1 {v5.16b}, [x2], x3 // Load pix2[0 + stride], third iteration
  785. sabd v31.8h, v31.8h, v29.8h
  786. sabd v30.8h, v30.8h, v28.8h
  787. usubl v27.8h, v2.8b, v3.8b
  788. usubl2 v26.8h, v2.16b, v3.16b
  789. usubl v25.8h, v4.8b, v5.8b
  790. usubl2 v24.8h, v4.16b, v5.16b
  791. sabd v29.8h, v29.8h, v27.8h
  792. sabd v27.8h, v27.8h, v25.8h
  793. umlal v16.4s, v31.4h, v31.4h
  794. umlal2 v17.4s, v31.8h, v31.8h
  795. sabd v28.8h, v28.8h, v26.8h
  796. sabd v26.8h, v26.8h, v24.8h
  797. umlal v16.4s, v30.4h, v30.4h
  798. umlal2 v17.4s, v30.8h, v30.8h
  799. mov v31.16b, v25.16b
  800. umlal v16.4s, v29.4h, v29.4h
  801. umlal2 v17.4s, v29.8h, v29.8h
  802. mov v30.16b, v24.16b
  803. umlal v16.4s, v28.4h, v28.4h
  804. umlal2 v17.4s, v28.8h, v28.8h
  805. sub w4, w4, #3
  806. umlal v16.4s, v27.4h, v27.4h
  807. umlal2 v17.4s, v27.8h, v27.8h
  808. cmp w4, #3
  809. umlal v16.4s, v26.4h, v26.4h
  810. umlal2 v17.4s, v26.8h, v26.8h
  811. b.ge 1b
  812. cbz w4, 3f
// iterate by one
  814. 2:
  815. ld1 {v0.16b}, [x1], x3
  816. ld1 {v1.16b}, [x2], x3
  817. subs w4, w4, #1
  818. usubl v29.8h, v0.8b, v1.8b
  819. usubl2 v28.8h, v0.16b, v1.16b
  820. sabd v31.8h, v31.8h, v29.8h
  821. sabd v30.8h, v30.8h, v28.8h
  822. umlal v16.4s, v31.4h, v31.4h
  823. umlal2 v17.4s, v31.8h, v31.8h
  824. mov v31.16b, v29.16b
  825. umlal v16.4s, v30.4h, v30.4h
  826. umlal2 v17.4s, v30.8h, v30.8h
  827. mov v30.16b, v28.16b
  828. b.ne 2b
  829. 3:
  830. add v16.4s, v16.4s, v17.4s
  831. uaddlv d17, v16.4s
  832. fmov w0, s17
  833. ret
  834. endfunc
  835. function vsad_intra16_neon, export=1
  836. // x0 unused
  837. // x1 uint8_t *pix1
  838. // x2 uint8_t *dummy
  839. // x3 ptrdiff_t stride
  840. // w4 int h
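// Scalar sketch of the intra vertical SAD: only pix1 is read and each row is
// compared against the row below it. Illustrative C only.
//
//   int score = 0;
//   for (int y = 1; y < h; y++) {
//       for (int x = 0; x < 16; x++)
//           score += abs(pix1[x] - pix1[x + stride]);
//       pix1 += stride;
//   }
//   return score;
//
// The previously loaded row stays in v0, so the unrolled loop below covers
// three row pairs with only three new loads.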
  841. ld1 {v0.16b}, [x1], x3
  842. sub w4, w4, #1 // we need to make h-1 iterations
  843. cmp w4, #3
  844. movi v16.8h, #0
  845. b.lt 2f
// make 3 iterations at once
  847. 1:
  848. // v = abs( pix1[0] - pix1[0 + stride] )
  849. // score = sum(v)
  850. ld1 {v1.16b}, [x1], x3
  851. ld1 {v2.16b}, [x1], x3
  852. uabal v16.8h, v0.8b, v1.8b
  853. ld1 {v3.16b}, [x1], x3
  854. uabal2 v16.8h, v0.16b, v1.16b
  855. sub w4, w4, #3
  856. uabal v16.8h, v1.8b, v2.8b
  857. cmp w4, #3
  858. uabal2 v16.8h, v1.16b, v2.16b
  859. mov v0.16b, v3.16b
  860. uabal v16.8h, v2.8b, v3.8b
  861. uabal2 v16.8h, v2.16b, v3.16b
  862. b.ge 1b
  863. cbz w4, 3f
  864. // iterate by one
  865. 2:
  866. ld1 {v1.16b}, [x1], x3
  867. subs w4, w4, #1
  868. uabal v16.8h, v0.8b, v1.8b
  869. uabal2 v16.8h, v0.16b, v1.16b
  870. mov v0.16b, v1.16b
  871. cbnz w4, 2b
  872. 3:
  873. uaddlv s17, v16.8h
  874. fmov w0, s17
  875. ret
  876. endfunc
  877. function vsse_intra16_neon, export=1
  878. // x0 unused
  879. // x1 uint8_t *pix1
  880. // x2 uint8_t *dummy
  881. // x3 ptrdiff_t stride
  882. // w4 int h
  883. ld1 {v0.16b}, [x1], x3
  884. movi v16.4s, #0
  885. movi v17.4s, #0
  886. sub w4, w4, #1 // we need to make h-1 iterations
  887. cmp w4, #3
  888. b.lt 2f
  889. 1:
  890. // v = abs( pix1[0] - pix1[0 + stride] )
  891. // score = sum( v * v )
  892. ld1 {v1.16b}, [x1], x3
  893. ld1 {v2.16b}, [x1], x3
  894. uabd v30.16b, v0.16b, v1.16b
  895. ld1 {v3.16b}, [x1], x3
  896. umull v29.8h, v30.8b, v30.8b
  897. umull2 v28.8h, v30.16b, v30.16b
  898. uabd v27.16b, v1.16b, v2.16b
  899. uadalp v16.4s, v29.8h
  900. umull v26.8h, v27.8b, v27.8b
  901. umull2 v27.8h, v27.16b, v27.16b
  902. uadalp v17.4s, v28.8h
  903. uabd v25.16b, v2.16b, v3.16b
  904. uadalp v16.4s, v26.8h
  905. umull v24.8h, v25.8b, v25.8b
  906. umull2 v25.8h, v25.16b, v25.16b
  907. uadalp v17.4s, v27.8h
  908. sub w4, w4, #3
  909. uadalp v16.4s, v24.8h
  910. cmp w4, #3
  911. uadalp v17.4s, v25.8h
  912. mov v0.16b, v3.16b
  913. b.ge 1b
  914. cbz w4, 3f
  915. // iterate by one
  916. 2:
  917. ld1 {v1.16b}, [x1], x3
  918. subs w4, w4, #1
  919. uabd v30.16b, v0.16b, v1.16b
  920. mov v0.16b, v1.16b
  921. umull v29.8h, v30.8b, v30.8b
  922. umull2 v30.8h, v30.16b, v30.16b
  923. uadalp v16.4s, v29.8h
  924. uadalp v17.4s, v30.8h
  925. cbnz w4, 2b
  926. 3:
  927. add v16.4s, v16.4s, v17.4s
  928. uaddlv d17, v16.4s
  929. fmov w0, s17
  930. ret
  931. endfunc
  932. function vsse_intra8_neon, export=1
  933. // x0 unused
  934. // x1 uint8_t *pix1
  935. // x2 uint8_t *dummy
  936. // x3 ptrdiff_t stride
  937. // w4 int h
  938. sub w4, w4, #1 // we need to make h-1 iterations
  939. ld1 {v0.8b}, [x1], x3
  940. cmp w4, #3
  941. movi v16.4s, #0
  942. b.lt 2f
  943. 1:
  944. // v = abs( pix1[0] - pix1[0 + stride] )
  945. // score = sum( v * v )
  946. ld1 {v1.8b}, [x1], x3
  947. ld1 {v2.8b}, [x1], x3
  948. uabd v30.8b, v0.8b, v1.8b
  949. ld1 {v3.8b}, [x1], x3
  950. uabd v27.8b, v1.8b, v2.8b
  951. umull v29.8h, v30.8b, v30.8b
  952. uabd v25.8b, v2.8b, v3.8b
  953. umull v26.8h, v27.8b, v27.8b
  954. uadalp v16.4s, v29.8h
  955. umull v24.8h, v25.8b, v25.8b
  956. uadalp v16.4s, v26.8h
  957. sub w4, w4, #3
  958. uadalp v16.4s, v24.8h
  959. cmp w4, #3
  960. mov v0.8b, v3.8b
  961. b.ge 1b
  962. cbz w4, 3f
  963. // iterate by one
  964. 2:
  965. ld1 {v1.8b}, [x1], x3
  966. subs w4, w4, #1
  967. uabd v30.8b, v0.8b, v1.8b
  968. mov v0.8b, v1.8b
  969. umull v29.8h, v30.8b, v30.8b
  970. uadalp v16.4s, v29.8h
  971. cbnz w4, 2b
  972. 3:
  973. uaddlv d17, v16.4s
  974. fmov w0, s17
  975. ret
  976. endfunc
  977. function nsse16_neon, export=1
  978. // x0 multiplier
  979. // x1 uint8_t *pix1
  980. // x2 uint8_t *pix2
  981. // x3 ptrdiff_t stride
  982. // w4 int h
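// A sketch of the score computed here, as we read the code (illustrative C,
// not the FFmpeg prototype). s1/s2 stand for pix1/pix2, "multiplier" is the
// weight passed in x0, and score1 comes from the sse16_neon call below.
//
//   int score1 = 0, score2 = 0;
//   for (int y = 0; y < h; y++) {
//       for (int x = 0; x < 16; x++)
//           score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
//       if (y + 1 < h)
//           for (int x = 0; x < 15; x++)
//               score2 += abs(s1[x] - s1[x + 1] - s1[x + stride] + s1[x + stride + 1])
//                       - abs(s2[x] - s2[x + 1] - s2[x + stride] + s2[x + stride + 1]);
//       s1 += stride;
//       s2 += stride;
//   }
//   return score1 + abs(score2) * multiplier;
//
// Column 15 is excluded by zeroing the top accumulator lane (ins ... wzr)
// before the final reduction, and sqabs supplies abs(score2).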
  983. str x0, [sp, #-0x40]!
  984. stp x1, x2, [sp, #0x10]
  985. stp x3, x4, [sp, #0x20]
  986. str x30, [sp, #0x30]
  987. bl X(sse16_neon)
  988. ldr x30, [sp, #0x30]
  989. mov w9, w0 // here we store score1
  990. ldp x1, x2, [sp, #0x10]
  991. ldp x3, x4, [sp, #0x20]
  992. ldr x5, [sp], #0x40
  993. movi v16.8h, #0
  994. movi v17.8h, #0
  995. movi v18.8h, #0
  996. movi v19.8h, #0
  997. ld1 {v0.16b}, [x1], x3
  998. subs w4, w4, #1 // we need to make h-1 iterations
  999. ld1 {v2.16b}, [x2], x3
  1000. ext v1.16b, v0.16b, v0.16b, #1 // x1 + 1
  1001. cmp w4, #2
  1002. ext v3.16b, v2.16b, v2.16b, #1 // x2 + 1
  1003. b.lt 2f
  1004. // make 2 iterations at once
  1005. 1:
  1006. ld1 {v4.16b}, [x1], x3
  1007. ld1 {v6.16b}, [x2], x3
  1008. ld1 {v20.16b}, [x1], x3
  1009. ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1
  1010. usubl v31.8h, v0.8b, v4.8b
  1011. usubl2 v30.8h, v0.16b, v4.16b
  1012. ld1 {v22.16b}, [x2], x3
  1013. usubl v29.8h, v1.8b, v5.8b
  1014. usubl2 v28.8h, v1.16b, v5.16b
  1015. ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1
  1016. saba v16.8h, v31.8h, v29.8h
  1017. ext v21.16b, v20.16b, v20.16b, #1
  1018. saba v17.8h, v30.8h, v28.8h
  1019. usubl v27.8h, v2.8b, v6.8b
  1020. usubl2 v26.8h, v2.16b, v6.16b
  1021. ext v23.16b, v22.16b, v22.16b, #1
  1022. usubl v25.8h, v3.8b, v7.8b
  1023. usubl2 v24.8h, v3.16b, v7.16b
  1024. saba v18.8h, v27.8h, v25.8h
  1025. saba v19.8h, v26.8h, v24.8h
  1026. usubl v31.8h, v4.8b, v20.8b
  1027. usubl2 v30.8h, v4.16b, v20.16b
  1028. usubl v29.8h, v5.8b, v21.8b
  1029. usubl2 v28.8h, v5.16b, v21.16b
  1030. saba v16.8h, v31.8h, v29.8h
  1031. saba v17.8h, v30.8h, v28.8h
  1032. usubl v27.8h, v6.8b, v22.8b
  1033. usubl2 v26.8h, v6.16b, v22.16b
  1034. usubl v25.8h, v7.8b, v23.8b
  1035. usubl2 v24.8h, v7.16b, v23.16b
  1036. saba v18.8h, v27.8h, v25.8h
  1037. saba v19.8h, v26.8h, v24.8h
  1038. sub w4, w4, #2
  1039. mov v0.16b, v20.16b
  1040. mov v1.16b, v21.16b
  1041. cmp w4, #2
  1042. mov v2.16b, v22.16b
  1043. mov v3.16b, v23.16b
  1044. b.ge 1b
  1045. cbz w4, 3f
  1046. // iterate by one
  1047. 2:
  1048. ld1 {v4.16b}, [x1], x3
  1049. subs w4, w4, #1
  1050. ld1 {v6.16b}, [x2], x3
  1051. ext v5.16b, v4.16b, v4.16b, #1 // x1 + stride + 1
  1052. usubl v31.8h, v0.8b, v4.8b
  1053. ext v7.16b, v6.16b, v6.16b, #1 // x2 + stride + 1
  1054. usubl2 v30.8h, v0.16b, v4.16b
  1055. usubl v29.8h, v1.8b, v5.8b
  1056. usubl2 v28.8h, v1.16b, v5.16b
  1057. saba v16.8h, v31.8h, v29.8h
  1058. saba v17.8h, v30.8h, v28.8h
  1059. usubl v27.8h, v2.8b, v6.8b
  1060. usubl2 v26.8h, v2.16b, v6.16b
  1061. usubl v25.8h, v3.8b, v7.8b
  1062. usubl2 v24.8h, v3.16b, v7.16b
  1063. saba v18.8h, v27.8h, v25.8h
  1064. saba v19.8h, v26.8h, v24.8h
  1065. mov v0.16b, v4.16b
  1066. mov v1.16b, v5.16b
  1067. mov v2.16b, v6.16b
  1068. mov v3.16b, v7.16b
  1069. cbnz w4, 2b
  1070. 3:
  1071. sqsub v17.8h, v17.8h, v19.8h
  1072. sqsub v16.8h, v16.8h, v18.8h
  1073. ins v17.h[7], wzr
  1074. sqadd v16.8h, v16.8h, v17.8h
  1075. saddlv s16, v16.8h
  1076. sqabs s16, s16
  1077. fmov w0, s16
  1078. mul w0, w0, w5
  1079. add w0, w0, w9
  1080. ret
  1081. endfunc
  1082. function nsse8_neon, export=1
  1083. // x0 multiplier
  1084. // x1 uint8_t *pix1
  1085. // x2 uint8_t *pix2
  1086. // x3 ptrdiff_t stride
  1087. // w4 int h
  1088. str x0, [sp, #-0x40]!
  1089. stp x1, x2, [sp, #0x10]
  1090. stp x3, x4, [sp, #0x20]
  1091. str x30, [sp, #0x30]
  1092. bl X(sse8_neon)
  1093. ldr x30, [sp, #0x30]
  1094. mov w9, w0 // here we store score1
  1095. ldp x1, x2, [sp, #0x10]
  1096. ldp x3, x4, [sp, #0x20]
  1097. ldr x5, [sp], #0x40
  1098. movi v16.8h, #0
  1099. movi v17.8h, #0
  1100. movi v18.8h, #0
  1101. movi v19.8h, #0
  1102. ld1 {v0.8b}, [x1], x3
  1103. subs w4, w4, #1 // we need to make h-1 iterations
  1104. ext v1.8b, v0.8b, v0.8b, #1 // x1 + 1
  1105. ld1 {v2.8b}, [x2], x3
  1106. cmp w4, #2
  1107. ext v3.8b, v2.8b, v2.8b, #1 // x2 + 1
  1108. b.lt 2f
  1109. // make 2 iterations at once
  1110. 1:
  1111. ld1 {v4.8b}, [x1], x3
  1112. ld1 {v20.8b}, [x1], x3
  1113. ld1 {v6.8b}, [x2], x3
  1114. ext v5.8b, v4.8b, v4.8b, #1 // x1 + stride + 1
  1115. ext v21.8b, v20.8b, v20.8b, #1
  1116. ld1 {v22.8b}, [x2], x3
  1117. ext v7.8b, v6.8b, v6.8b, #1 // x2 + stride + 1
  1118. usubl v31.8h, v0.8b, v4.8b
  1119. ext v23.8b, v22.8b, v22.8b, #1
  1120. usubl v29.8h, v1.8b, v5.8b
  1121. usubl v27.8h, v2.8b, v6.8b
  1122. usubl v25.8h, v3.8b, v7.8b
  1123. saba v16.8h, v31.8h, v29.8h
  1124. usubl v31.8h, v4.8b, v20.8b
  1125. saba v18.8h, v27.8h, v25.8h
  1126. sub w4, w4, #2
  1127. usubl v29.8h, v5.8b, v21.8b
  1128. mov v0.16b, v20.16b
  1129. mov v1.16b, v21.16b
  1130. saba v16.8h, v31.8h, v29.8h
  1131. usubl v27.8h, v6.8b, v22.8b
  1132. usubl v25.8h, v7.8b, v23.8b
  1133. mov v2.16b, v22.16b
  1134. mov v3.16b, v23.16b
  1135. cmp w4, #2
  1136. saba v18.8h, v27.8h, v25.8h
  1137. b.ge 1b
  1138. cbz w4, 3f
  1139. // iterate by one
  1140. 2:
  1141. ld1 {v4.8b}, [x1], x3
  1142. subs w4, w4, #1
  1143. ext v5.8b, v4.8b, v4.8b, #1 // x1 + stride + 1
  1144. ld1 {v6.8b}, [x2], x3
  1145. usubl v31.8h, v0.8b, v4.8b
  1146. ext v7.8b, v6.8b, v6.8b, #1 // x2 + stride + 1
  1147. usubl v29.8h, v1.8b, v5.8b
  1148. saba v16.8h, v31.8h, v29.8h
  1149. usubl v27.8h, v2.8b, v6.8b
  1150. usubl v25.8h, v3.8b, v7.8b
  1151. saba v18.8h, v27.8h, v25.8h
  1152. mov v0.16b, v4.16b
  1153. mov v1.16b, v5.16b
  1154. mov v2.16b, v6.16b
  1155. mov v3.16b, v7.16b
  1156. cbnz w4, 2b
  1157. 3:
  1158. sqsub v16.8h, v16.8h, v18.8h
  1159. ins v16.h[7], wzr
  1160. saddlv s16, v16.8h
  1161. sqabs s16, s16
  1162. fmov w0, s16
  1163. mul w0, w0, w5
  1164. add w0, w0, w9
  1165. ret
  1166. endfunc
  1167. function pix_median_abs16_neon, export=1
  1168. // x0 unused
  1169. // x1 uint8_t *pix1
  1170. // x2 uint8_t *pix2
  1171. // x3 ptrdiff_t stride
  1172. // w4 int h
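// A sketch of the score, as we read the code below (illustrative C). V(j, i)
// denotes pix1[j] - pix2[j] on row i, and median() is the median of three:
//
//   int s = abs(V(0, 0));
//   for (int j = 1; j < 16; j++)            // first row: neighbour differences
//       s += abs(V(j, 0) - V(j - 1, 0));
//   for (int i = 1; i < h; i++) {           // remaining rows: median predictor
//       s += abs(V(0, i) - V(0, i - 1));
//       for (int j = 1; j < 16; j++)
//           s += abs(V(j, i) - median(V(j - 1, i),                       // left
//                                     V(j, i - 1),                       // top
//                                     V(j - 1, i) + V(j, i - 1) - V(j - 1, i - 1)));
//   }
//   return s;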
  1173. ld1 {v2.16b}, [x1], x3
  1174. ld1 {v3.16b}, [x2], x3
  1175. movi v31.8h, #0
  1176. movi v16.8h, #0
  1177. ext v0.16b, v2.16b, v2.16b, #1
  1178. ext v1.16b, v3.16b, v3.16b, #1
  1179. usubl v28.8h, v2.8b, v3.8b
  1180. usubl2 v27.8h, v2.16b, v3.16b
  1181. usubl v26.8h, v0.8b, v1.8b
  1182. usubl2 v25.8h, v0.16b, v1.16b
  1183. sub w4, w4, #1 // we need to make h-1 iterations
  1184. saba v31.8h, v26.8h, v28.8h
  1185. saba v16.8h, v25.8h, v27.8h
  1186. mov h18, v28.h[0]
  1187. cmp w4, #1
  1188. sqabs h18, h18
  1189. movi v0.8h, #0
  1190. b.lt 2f
  1191. 1:
  1192. ld1 {v6.16b}, [x1], x3 // pix1 vector for V(j-1)
  1193. ld1 {v7.16b}, [x2], x3 // pix2 vector for V(j-1)
  1194. subs w4, w4, #1
  1195. ext v4.16b, v6.16b, v6.16b, #1 // pix1 vector for V(j)
  1196. ext v5.16b, v7.16b, v7.16b, #1 // pix2 vector for V(j)
  1197. // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
  1198. // scratch registers: v22, v21, v20, v19, v17
// The three median candidates are V(j-1) of this row, V(j) of the previous row,
// and their sum minus the previous row's V(j-1) (the add/sub below); the median
// is then max(min(a, b), min(max(a, b), c)), evaluated with the smin/smax chain.
  1201. usubl v30.8h, v6.8b, v7.8b // V(j-1)
  1202. usubl2 v29.8h, v6.16b, v7.16b // V(j-1)
  1203. usubl v24.8h, v4.8b, v5.8b // V(j)
  1204. usubl2 v23.8h, v4.16b, v5.16b // V(j)
  1205. saba v0.8h, v30.8h, v28.8h
  1206. add v22.8h, v26.8h, v30.8h
  1207. smin v20.8h, v26.8h, v30.8h
  1208. add v21.8h, v25.8h, v29.8h
  1209. smax v19.8h, v26.8h, v30.8h
  1210. sub v22.8h, v22.8h, v28.8h
  1211. sub v21.8h, v21.8h, v27.8h
  1212. smin v17.8h, v19.8h, v22.8h
  1213. smin v22.8h, v25.8h, v29.8h
  1214. mov v28.16b, v30.16b
  1215. smax v20.8h, v20.8h, v17.8h // median values lower half
  1216. smax v19.8h, v25.8h, v29.8h
  1217. saba v31.8h, v24.8h, v20.8h
  1218. mov v27.16b, v29.16b
  1219. smin v19.8h, v19.8h, v21.8h
  1220. mov v26.16b, v24.16b
  1221. smax v17.8h, v22.8h, v19.8h // median values upper half
  1222. mov v25.16b, v23.16b
  1223. saba v16.8h, v23.8h, v17.8h
  1224. b.ne 1b
  1225. 2:
  1226. mov h17, v0.h[0]
  1227. ins v16.h[7], wzr
  1228. add d18, d18, d17
  1229. add v31.8h, v31.8h, v16.8h
  1230. uaddlv s17, v31.8h
  1231. add d18, d18, d17
  1232. fmov w0, s18
  1233. ret
  1234. endfunc
  1235. function vsad_intra8_neon, export=1
  1236. // x0 unused
  1237. // x1 uint8_t *pix1
  1238. // x2 uint8_t *dummy
  1239. // x3 ptrdiff_t stride
  1240. // w4 int h
  1241. ld1 {v0.8b}, [x1], x3
  1242. sub w4, w4, #1 // we need to make h-1 iterations
  1243. cmp w4, #3
  1244. movi v16.8h, #0
  1245. b.lt 2f
  1246. 1:
  1247. // v = abs( pix1[0] - pix1[0 + stride] )
  1248. // score = sum(v)
  1249. ld1 {v1.8b}, [x1], x3
  1250. sub w4, w4, #3
  1251. ld1 {v2.8b}, [x1], x3
  1252. uabal v16.8h, v0.8b, v1.8b
  1253. ld1 {v3.8b}, [x1], x3
  1254. uabal v16.8h, v1.8b, v2.8b
  1255. cmp w4, #3
  1256. mov v0.8b, v3.8b
  1257. uabal v16.8h, v2.8b, v3.8b
  1258. b.ge 1b
  1259. cbz w4, 3f
  1260. 2:
  1261. ld1 {v1.8b}, [x1], x3
  1262. subs w4, w4, #1
  1263. uabal v16.8h, v0.8b, v1.8b
  1264. mov v0.8b, v1.8b
  1265. cbnz w4, 2b
  1266. 3:
  1267. uaddlv s17, v16.8h
  1268. fmov w0, s17
  1269. ret
  1270. endfunc
  1271. function pix_median_abs8_neon, export=1
  1272. // x0 unused
  1273. // x1 uint8_t *pix1
  1274. // x2 uint8_t *pix2
  1275. // x3 ptrdiff_t stride
  1276. // w4 int h
  1277. ld1 {v2.8b}, [x1], x3
  1278. ld1 {v3.8b}, [x2], x3
  1279. movi v31.8h, #0
  1280. ext v0.8b, v2.8b, v2.8b, #1
  1281. ext v1.8b, v3.8b, v3.8b, #1
  1282. usubl v28.8h, v2.8b, v3.8b
  1283. usubl v26.8h, v0.8b, v1.8b
  1284. sub w4, w4, #1 // we need to make h-1 iterations
  1285. saba v31.8h, v26.8h, v28.8h
  1286. mov h18, v28.h[0]
  1287. cmp w4, #1
  1288. sqabs h18, h18
  1289. movi v0.8h, #0
  1290. b.lt 2f
  1291. 1:
  1292. ld1 {v6.8b}, [x1], x3 // pix1 vector for V(j-1)
  1293. ld1 {v7.8b}, [x2], x3 // pix2 vector for V(j-1)
  1294. subs w4, w4, #1
  1295. ext v4.8b, v6.8b, v6.8b, #1 // pix1 vector for V(j)
  1296. ext v5.8b, v7.8b, v7.8b, #1 // pix2 vector for V(j)
  1297. // protected registers: v30, v29, v28, v27, v26, v25, v24, v23
  1298. // scratch registers: v22, v21, v20, v19, v17
// The three median candidates are V(j-1) of this row, V(j) of the previous row,
// and their sum minus the previous row's V(j-1) (the add/sub below); the median
// is then max(min(a, b), min(max(a, b), c)), evaluated with the smin/smax chain.
  1301. usubl v30.8h, v6.8b, v7.8b // V(j-1)
  1302. usubl v24.8h, v4.8b, v5.8b // V(j)
  1303. saba v0.8h, v30.8h, v28.8h
  1304. add v22.8h, v26.8h, v30.8h
  1305. smin v20.8h, v26.8h, v30.8h
  1306. smax v19.8h, v26.8h, v30.8h
  1307. sub v22.8h, v22.8h, v28.8h
  1308. smin v17.8h, v19.8h, v22.8h
  1309. mov v28.16b, v30.16b
  1310. smax v20.8h, v20.8h, v17.8h // median values lower half
  1311. smax v19.8h, v25.8h, v29.8h
  1312. saba v31.8h, v24.8h, v20.8h
  1313. mov v26.16b, v24.16b
  1314. smax v17.8h, v22.8h, v19.8h // median values upper half
  1315. b.ne 1b
  1316. 2:
  1317. mov h17, v0.h[0]
  1318. ins v31.h[7], wzr
  1319. add d18, d18, d17
  1320. uaddlv s17, v31.8h
  1321. add d18, d18, d17
  1322. fmov w0, s18
  1323. ret
  1324. endfunc
  1325. #if HAVE_DOTPROD
  1326. ENABLE_DOTPROD
  1327. function sse16_neon_dotprod, export=1
  1328. // x0 - unused
  1329. // x1 - pix1
  1330. // x2 - pix2
  1331. // x3 - stride
  1332. // w4 - h
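// Same score as sse16_neon above, but each umull/umull2 + uadalp chain is
// replaced by a single udot. A sketch of what one udot does per 32-bit lane i
// (illustrative only), with d[] holding the 16 absolute differences:
//
//   acc[i] += d[4*i + 0] * d[4*i + 0] + d[4*i + 1] * d[4*i + 1]
//           + d[4*i + 2] * d[4*i + 2] + d[4*i + 3] * d[4*i + 3];
//
// Squaring |pix1 - pix2| this way accumulates the sum of squared errors
// directly into the 32-bit accumulator v17.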
  1333. cmp w4, #4
  1334. movi v17.4s, #0
  1335. b.lt 2f
  1336. // Make 4 iterations at once
  1337. 1:
  1338. // res = abs(pix1[0] - pix2[0])
  1339. // res * res
  1340. ld1 {v0.16b}, [x1], x3 // Load pix1 vector for first iteration
  1341. ld1 {v1.16b}, [x2], x3 // Load pix2 vector for first iteration
  1342. ld1 {v2.16b}, [x1], x3 // Load pix1 vector for second iteration
  1343. uabd v30.16b, v0.16b, v1.16b // Absolute difference, first iteration
  1344. ld1 {v3.16b}, [x2], x3 // Load pix2 vector for second iteration
  1345. udot v17.4s, v30.16b, v30.16b
  1346. uabd v27.16b, v2.16b, v3.16b // Absolute difference, second iteration
  1347. ld1 {v4.16b}, [x1], x3 // Load pix1 for third iteration
  1348. udot v17.4s, v27.16b, v27.16b
  1349. ld1 {v5.16b}, [x2], x3 // Load pix2 for third iteration
  1350. uabd v24.16b, v4.16b, v5.16b // Absolute difference, third iteration
  1351. ld1 {v6.16b}, [x1], x3 // Load pix1 for fourth iteration
  1352. udot v17.4s, v24.16b, v24.16b
  1353. ld1 {v7.16b}, [x2], x3 // Load pix2 for fourth iteration
  1354. uabd v21.16b, v6.16b, v7.16b // Absolute difference, fourth iteration
  1355. sub w4, w4, #4 // h -= 4
  1356. udot v17.4s, v21.16b, v21.16b
  1357. cmp w4, #4
  1358. b.ge 1b
  1359. cbz w4, 3f
  1360. // iterate by one
  1361. 2:
  1362. ld1 {v0.16b}, [x1], x3 // Load pix1
  1363. ld1 {v1.16b}, [x2], x3 // Load pix2
  1364. uabd v30.16b, v0.16b, v1.16b
  1365. subs w4, w4, #1
  1366. udot v17.4s, v30.16b, v30.16b
  1367. b.ne 2b
  1368. 3:
  1369. uaddlv d16, v17.4s // add up accumulator vector
  1370. fmov w0, s16
  1371. ret
  1372. endfunc
  1373. function vsse_intra16_neon_dotprod, export=1
  1374. // x0 unused
  1375. // x1 uint8_t *pix1
  1376. // x2 uint8_t *dummy
  1377. // x3 ptrdiff_t stride
  1378. // w4 int h
  1379. ld1 {v0.16b}, [x1], x3
  1380. movi v17.4s, #0
  1381. sub w4, w4, #1 // we need to make h-1 iterations
  1382. cmp w4, #3
  1383. b.lt 2f
  1384. 1:
  1385. // v = abs( pix1[0] - pix1[0 + stride] )
  1386. // score = sum( v * v )
  1387. ld1 {v1.16b}, [x1], x3
  1388. ld1 {v2.16b}, [x1], x3
  1389. uabd v30.16b, v0.16b, v1.16b
  1390. ld1 {v3.16b}, [x1], x3
  1391. udot v17.4s, v30.16b, v30.16b
  1392. uabd v27.16b, v1.16b, v2.16b
  1393. udot v17.4s, v27.16b, v27.16b
  1394. uabd v25.16b, v2.16b, v3.16b
  1395. sub w4, w4, #3
  1396. udot v17.4s, v25.16b, v25.16b
  1397. cmp w4, #3
  1398. mov v0.16b, v3.16b
  1399. b.ge 1b
  1400. cbz w4, 3f
  1401. // iterate by one
  1402. 2:
  1403. ld1 {v1.16b}, [x1], x3
  1404. subs w4, w4, #1
  1405. uabd v30.16b, v0.16b, v1.16b
  1406. mov v0.16b, v1.16b
  1407. udot v17.4s, v30.16b, v30.16b
  1408. cbnz w4, 2b
  1409. 3:
  1410. uaddlv d17, v17.4s
  1411. fmov w0, s17
  1412. ret
  1413. endfunc
  1414. DISABLE_DOTPROD
  1415. #endif