;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
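; The pointers are advanced past the end of the arrays and indexed with a
; negative offset (orderq) that counts up toward zero, so the loop needs only
; a single add + jl to both advance and terminate.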
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
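; horizontal reduction: fold the dword partial sums in m2 into a single
; 32-bit result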
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
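; Computes the dot product of v1 and v2 while simultaneously updating
; v1[i] += mul * v3[i]; m7 holds mul broadcast to every word lane and m6
; accumulates the dot product.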
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
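
; SCALARPRODUCT_LOOP: %1 is the byte misalignment of v2/v3 within a 16-byte
; line; aligned loads plus palignr reconstruct the unaligned data, which is
; why the ssse3 entry point below dispatches to a separate loop per
; misalignment.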
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
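; In APPLY_WINDOW_INT16 below, offset2 starts at len-mmsize and counts down
; while offset starts at len and counts up; each iteration applies one vector
; of window values directly at offset2 and, reversed (REVERSE_WORDS), at
; offset, so the window is walked over only once.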
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb %1, %2
%elif cpuflag(sse2)
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw %1, %1, 0x1B
%endif
%endmacro

%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
    mova %3, %1
    pmulhw %1, %2
    pmullw %3, %2
    psrlw %3, 15
    psllw %1, 1
    por %1, %3
%endif
%endmacro

%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova m5, [pb_revwords]
    ALIGN 16
%elif %1
    mova m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize
    sub offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1

; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
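; Each output byte is predicted as median(left, top, left + top - topleft),
; then the residual from diffq is added; the byte-wise median is built from
; pmaxub/pminub in the unrolled block below.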
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
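; A running left prediction is a prefix sum of bytes: each vector is summed
; with shifted copies of itself in log2(mmsize) steps (psllw plus the pshufb
; masks in m3/m4/m6), then the carried-in left value, broadcast in m0, is
; added; m0 keeps the last result so its final byte seeds the next iteration.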
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
%if %2
    mova m1, [srcq+wq]
%else
    movu m1, [srcq+wq]
%endif
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz .src_unaligned
    test dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
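; len is converted to a negative byte offset so each iteration needs only one
; add + js; on x86_32 the scalar result is additionally placed on the x87
; stack (st0) via a spill, as the 32-bit ABI returns floats there.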
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%if ARCH_X86_64 == 0
    movss r0m, xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4
    SPLATD m5
.loop:
%assign %%i 1
%rep %2
    mova m0, [srcq+mmsize*0*%%i]
    mova m1, [srcq+mmsize*1*%%i]
    mova m2, [srcq+mmsize*2*%%i]
    mova m3, [srcq+mmsize*3*%%i]
%if %3
    mova m7, [srcq+mmsize*4*%%i]
    mova m8, [srcq+mmsize*5*%%i]
    mova m9, [srcq+mmsize*6*%%i]
    mova m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD m0, m4, m5, m6
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7, m4, m5, m6
    CLIPD m8, m4, m5, m6
    CLIPD m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*0*%%i], m0
    mova [dstq+mmsize*1*%%i], m1
    mova [dstq+mmsize*2*%%i], m2
    mova [dstq+mmsize*3*%%i], m3
%if %3
    mova [dstq+mmsize*4*%%i], m7
    mova [dstq+mmsize*5*%%i], m8
    mova [dstq+mmsize*6*%%i], m9
    mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
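; m8 and above only exist when 16 XMM registers are available (x86_64), so the
; wider 8-vectors-per-iteration variant is only assembled there.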
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
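; src1 is read forward and reversed in-register (shufps, plus vinsertf128 to
; swap 128-bit lanes on AVX), while src0 and dst are indexed backward through
; lenq, giving dst[i] = src0[i] * src1[len-1-i].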
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps m0, m0, m0, q0123
    vmovaps xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps m1, m1, m1, q0123
%else
    mova m0, [src1q]
    mova m1, [src1q + mmsize]
    shufps m0, m0, q0123
    shufps m1, m1, q0123
%endif
    mulps m0, m0, [src0q + lenq + mmsize]
    mulps m1, m1, [src0q + lenq]
    mova [dstq + lenq + mmsize], m0
    mova [dstq + lenq], m1
    add src1q, 2*mmsize
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
INIT_YMM avx
VECTOR_FMUL_REVERSE

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
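; dst[i] = src0[i] * src1[i] + src2[i]; two vectors (2*mmsize bytes) are
; processed per iteration, walking lenq down from the end of the buffers.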
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova m0, [src0q + lenq]
    mova m1, [src0q + lenq + mmsize]
    mulps m0, m0, [src1q + lenq]
    mulps m1, m1, [src1q + lenq + mmsize]
    addps m0, m0, [src2q + lenq]
    addps m1, m1, [src2q + lenq + mmsize]
    mova [dstq + lenq], m0
    mova [dstq + lenq + mmsize], m1
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
INIT_YMM avx
VECTOR_FMUL_ADD

;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
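; dst[2*i] = src0[i] + src1[i] and dst[2*i+1] = src0[i] - src1[i]; the AVX path
; stores the result as 128-bit halves with vextractf128 so the interleaving
; order matches the SSE layout.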
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%if ARCH_X86_64
    movsxd lenq, lend
%endif
    test lenq, lenq
    jz .end
    shl lenq, 2
    lea src0q, [src0q + lenq]
    lea src1q, [src1q + lenq]
    lea dstq, [ dstq + 2*lenq]
    neg lenq
.loop:
    mova m0, [src0q + lenq]
    mova m1, [src1q + lenq]
    subps m2, m0, m1
    addps m0, m0, m1
    unpcklps m1, m0, m2
    unpckhps m0, m0, m2
%if cpuflag(avx)
    vextractf128 [dstq + 2*lenq ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova [dstq + 2*lenq ], m1
    mova [dstq + 2*lenq + mmsize], m0
%endif
    add lenq, mmsize
    jl .loop
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE

; %1 = aligned/unaligned
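; Without pshufb (pre-ssse3), a 32-bit byteswap is built from a word swap
; within each dword (pshuflw/pshufhw with 0xB1) followed by a byte swap within
; each word (psllw/psrlw/por).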
%macro BSWAP_LOOPS 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
    mova [r0 + 0], m0
    mova [r0 + 16], m1
%else
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova m2, m0
    mova m3, m1
    psllw m0, 8
    psllw m1, 8
    psrlw m2, 8
    psrlw m3, 8
    por m2, m0
    por m3, m1
    mova [r0 + 0], m2
    mova [r0 + 16], m3
%endif
    add r0, 32
    add r1, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left
    mov%1 m0, [r1]
%if cpuflag(ssse3)
    pshufb m0, m2
    mova [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova m2, m0
    psllw m0, 8
    psrlw m2, 8
    por m2, m0
    mova [r0], m2
%endif
    add r1, 16
    add r0, 16
%endmacro

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov r3, r1
    mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov r3, r1
%endif
    and r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov r3, r2
    and r2, 2
    jz .left1
    movq m0, [r1]
    pshufb m0, m2
    movq [r0], m0
    add r1, 8
    add r0, 8
.left1:
    and r3, 1
    jz .end
    mov r2d, [r1]
    bswap r2d
    mov [r0], r2d
%else
    and r2, 3
    jz .end
.loop2:
    mov r3d, [r1]
    bswap r3d
    mov [r0], r3d
    add r1, 4
    add r0, 4
    dec r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF

%macro op_avgh 3
    movh %3, %2
    pavgb %1, %3
    movh %2, %1
%endmacro

%macro op_avg 2
    pavgb %1, %2
    mova %2, %1
%endmacro

%macro op_puth 2-3
    movh %2, %1
%endmacro

%macro op_put 2
    mova %2, %1
%endmacro

; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
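; Each destination pixel is the rounded average (src1[x] + src2[x] + 1) >> 1
; via pavgb; src2 is packed with a fixed 4-byte stride per row, and 'put' vs
; 'avg' differ only in the OP store macro.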
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test r5d, 1
    je .loop
    movd m0, [r1]
    movd m1, [r2]
    add r1, r4
    add r2, 4
    pavgb m0, m1
    OP m0, [r0], m3
    add r0, r3
    dec r5d
.loop:
    mova m0, [r1]
    mova m1, [r1+r4]
    lea r1, [r1+2*r4]
    pavgb m0, [r2]
    pavgb m1, [r2+4]
    OP m0, [r0], m3
    OP m1, [r0+r3], m3
    lea r0, [r0+2*r3]
    mova m0, [r1]
    mova m1, [r1+r4]
    lea r1, [r1+2*r4]
    pavgb m0, [r2+8]
    pavgb m1, [r2+12]
    OP m0, [r0], m3
    OP m1, [r0+r3], m3
    lea r0, [r0+2*r3]
    add r2, 16
    sub r5d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg

; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test r5d, 1
    je .loop
    mova m0, [r1]
    mova m1, [r2]
    add r1, r4
    add r2, 8
    pavgb m0, m1
    OP m0, [r0]
    add r0, r3
    dec r5d
.loop:
    mova m0, [r1]
    mova m1, [r1+r4]
    lea r1, [r1+2*r4]
    pavgb m0, [r2]
    pavgb m1, [r2+8]
    OP m0, [r0]
    OP m1, [r0+r3]
    lea r0, [r0+2*r3]
    mova m0, [r1]
    mova m1, [r1+r4]
    lea r1, [r1+2*r4]
    pavgb m0, [r2+16]
    pavgb m1, [r2+24]
    OP m0, [r0]
    OP m1, [r0+r3]
    lea r0, [r0+2*r3]
    add r2, 32
    sub r5d, 4
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg

; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
    movsxdifnidn r3, r3d
    movsxdifnidn r4, r4d
    test r5d, 1
    je .loop
    mova m0, [r1]
    mova m1, [r1+8]
    pavgb m0, [r2]
    pavgb m1, [r2+8]
    add r1, r4
    add r2, 16
    OP m0, [r0]
    OP m1, [r0+8]
    add r0, r3
    dec r5d
.loop:
    mova m0, [r1]
    mova m1, [r1+8]
    add r1, r4
    pavgb m0, [r2]
    pavgb m1, [r2+8]
    OP m0, [r0]
    OP m1, [r0+8]
    add r0, r3
    mova m0, [r1]
    mova m1, [r1+8]
    add r1, r4
    pavgb m0, [r2+16]
    pavgb m1, [r2+24]
    OP m0, [r0]
    OP m1, [r0+8]
    add r0, r3
    add r2, 32
    sub r5d, 2
    jne .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg

INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)
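; put copies four line_size-strided rows per iteration; avg first averages
; them with the existing destination using pavgb. %2 selects 4-byte (movh) or
; 8-byte (mova, MMX) rows.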
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
    movsxdifnidn r2, r2d
    lea r4, [r2*3]
.loop:
    OP m0, [r1]
    OP m1, [r1+r2]
    OP m2, [r1+r2*2]
    OP m3, [r1+r4]
    lea r1, [r1+r2*4]
%ifidn %1, avg
    pavgb m0, [r0]
    pavgb m1, [r0+r2]
    pavgb m2, [r0+r2*2]
    pavgb m3, [r0+r4]
%endif
    OP [r0], m0
    OP [r0+r2], m1
    OP [r0+r2*2], m2
    OP [r0+r4], m3
    sub r3d, 4
    lea r0, [r0+r2*4]
    jne .loop
    RET
%endmacro

PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8

INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
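; Source rows are loaded unaligned (movu) since 'pixels' need not be 16-byte
; aligned, while 'block' is assumed aligned for the mova stores; four rows are
; handled per iteration using r4 = 3*line_size.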
cglobal put_pixels16, 4,5,4
    movsxdifnidn r2, r2d
    lea r4, [r2*3]
.loop:
    movu m0, [r1]
    movu m1, [r1+r2]
    movu m2, [r1+r2*2]
    movu m3, [r1+r4]
    lea r1, [r1+r2*4]
    mova [r0], m0
    mova [r0+r2], m1
    mova [r0+r2*2], m2
    mova [r0+r4], m3
    sub r3d, 4
    lea r0, [r0+r2*4]
    jnz .loop
    REP_RET

; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
cglobal avg_pixels16, 4,5,4
    movsxdifnidn r2, r2d
    lea r4, [r2*3]
.loop:
    movu m0, [r1]
    movu m1, [r1+r2]
    movu m2, [r1+r2*2]
    movu m3, [r1+r4]
    lea r1, [r1+r2*4]
    pavgb m0, [r0]
    pavgb m1, [r0+r2]
    pavgb m2, [r0+r2*2]
    pavgb m3, [r0+r4]
    mova [r0], m0
    mova [r0+r2], m1
    mova [r0+r2*2], m2
    mova [r0+r4], m3
    sub r3d, 4
    lea r0, [r0+r2*4]
    jnz .loop
    REP_RET