;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0

pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
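;
; For reference, a scalar C sketch of what the loop below computes (an
; illustration, not the canonical Libav C version; order is assumed to be a
; multiple of the mmsize int16 elements consumed per iteration):
;
;   static int scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                      int order)
;   {
;       int res = 0;
;       while (order--)
;           res += *v1++ * *v2++;
;       return res;
;   }
;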
cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
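;
; For reference, a scalar C sketch of the intended behaviour (illustrative
; only; v1 is updated in place and the dot product of the *original* v1 with
; v2 is returned):
;
;   static int scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                               const int16_t *v3, int order,
;                                               int mul)
;   {
;       int res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;
;           *v1++ += mul * *v3++;
;       }
;       return res;
;   }
;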
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmxext
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
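; Roughly, the operation being vectorized is (an illustrative C sketch; the
; non-"_ba" mmxext/sse2 variants below truncate instead of rounding, see the
; per-version comments inside the macro):
;
;   for (i = 0; i < len / 2; i++) {
;       output[i]           = (input[i]           * window[i] + (1 << 14)) >> 15;
;       output[len - i - 1] = (input[len - i - 1] * window[i] + (1 << 14)) >> 15;
;   }
;
; which is why the asm walks offset2 down and offset up from the midpoint of
; the buffers, reversing the window words for the second half.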
%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
    pshufb %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
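;
; In scalar terms (a sketch; it matches the true dst*src >> 15 only for the
; operand ranges used here, where the product fits in the 16-bit result):
;
;   int16_t  a  = dst, b = src;
;   int32_t  p  = a * b;
;   uint16_t hi = (uint16_t)(p >> 16) << 1;  // pmulhw, then psllw by 1
;   uint16_t lo = (uint16_t)p >> 15;         // pmullw, then psrlw by 15
;   int16_t  r  = (int16_t)(hi | lo);        // == (int16_t)(p >> 15), truncated
;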
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova %3, %1
    pmulhw %1, %2
    pmullw %3, %2
    psrlw %3, 15
    psllw %1, 1
    por %1, %3
%endmacro

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea offset2q, [offsetq-mmsize]
%if %2
    mova m5, [pd_16384]
%elifidn %1, ssse3
    mova m5, [pb_revwords]
ALIGN 16
%endif
.loop:
%if %2
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%elif %3
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize
    sub offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1

; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
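;
; Scalar sketch of the median predictor implemented below (illustrative;
; FFMIN/FFMAX are plain min/max, and all byte arithmetic wraps mod 256 just
; like the packed-byte instructions):
;
;   uint8_t l = *left, tl = *left_top;
;   for (i = 0; i < w; i++) {
;       uint8_t t    = top[i];
;       uint8_t grad = t - tl + l;
;       uint8_t med  = FFMAX(FFMIN(FFMAX(l, t), grad), FFMIN(l, t));
;       l            = med + diff[i];
;       dst[i]       = l;
;       tl           = t;
;   }
;   *left = l; *left_top = tl;
;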
cglobal add_hfyu_median_prediction_mmxext, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
%if %2
    mova m1, [srcq+wq]
%else
    movu m1, [srcq+wq]
%endif
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
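;
; Scalar sketch (illustrative): a running byte-wise prefix sum seeded with
; 'left'; the final accumulator is returned. The SIMD versions below compute
; the same prefix sum with a few pshufb/paddb doubling steps per block.
;
;   int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                    int w, int left)
;   {
;       uint8_t l = left;
;       for (int i = 0; i < w; i++)
;           dst[i] = l = l + src[i];
;       return l;
;   }
;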
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz .src_unaligned
    test dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
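;
; Scalar sketch (illustrative; len is assumed to be a multiple of the 4 floats
; consumed per iteration):
;
;   float scalarproduct_float_ref(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       for (int i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }
;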
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%if ARCH_X86_64 == 0
    movss r0m, xmm0
    fld dword r0m
%endif
    RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width

%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
    mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif

; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%if ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%if ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:

; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%if ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r8
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg

; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%if ARCH_X86_64
%define vall al
%define valh ah
%define valw ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall bl
%define valh bh
%define valw bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx 0 ; mmx register idx
%assign %%sxidx 0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx 0 ; mmx register idx
%assign %%dxidx 0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
%if ARCH_X86_64
    test r3 , r3 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES top, %%n ; write bytes
    add r0 , r2 ; dst += linesize
%if ARCH_X86_64
    dec r3d
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
    READ_NUM_BYTES body, %%n ; read bytes
    WRITE_NUM_BYTES body, %%n ; write bytes
    add r0 , r2 ; dst += linesize
    add r1 , r2 ; src += linesize
    dec r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)

; copy bottom pixels
    test r5 , r5 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n ; goto end
    sub r1 , r2 ; src -= linesize
    READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n ; write bytes
    add r0 , r2 ; dst += linesize
    dec r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
%macro READ_V_PIXEL 2
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%if cpuflag(mmxext)
    pshufw mm0, mm0, 0
%else ; mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%endif ; sse
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off] , valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+r1] ; read pixels
    WRITE_V_PIXEL %%n, r0 ; write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
    sub r3, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
    dec r8
%else ; ARCH_X86_32
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
    push r8 ; save old value of block_h
    test r3, r3
%define cnt_reg r8
    jz .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif

.do_body_copy:
    V_COPY_ROW body, r4

%if ARCH_X86_64
    pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%if ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%if ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
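; Scalar sketch (illustrative; av_clip is the usual clamp to [min, max]):
;
;   for (i = 0; i < len; i++)
;       dst[i] = av_clip(src[i], min, max);
;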
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4
    SPLATD m5
.loop:
%assign %%i 1
%rep %2
    mova m0, [srcq+mmsize*0*%%i]
    mova m1, [srcq+mmsize*1*%%i]
    mova m2, [srcq+mmsize*2*%%i]
    mova m3, [srcq+mmsize*3*%%i]
%if %3
    mova m7, [srcq+mmsize*4*%%i]
    mova m8, [srcq+mmsize*5*%%i]
    mova m9, [srcq+mmsize*6*%%i]
    mova m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD m0, m4, m5, m6
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7, m4, m5, m6
    CLIPD m8, m4, m5, m6
    CLIPD m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*0*%%i], m0
    mova [dstq+mmsize*1*%%i], m1
    mova [dstq+mmsize*2*%%i], m2
    mova [dstq+mmsize*3*%%i], m3
%if %3
    mova [dstq+mmsize*4*%%i], m7
    mova [dstq+mmsize*5*%%i], m8
    mova [dstq+mmsize*6*%%i], m9
    mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
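; Scalar sketch (illustrative): src1 is read back to front.
;
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[len - 1 - i];
;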
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps m0, m0, m0, q0123
    vmovaps xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps m1, m1, m1, q0123
%else
    mova m0, [src1q]
    mova m1, [src1q + mmsize]
    shufps m0, m0, q0123
    shufps m1, m1, q0123
%endif
    mulps m0, m0, [src0q + lenq + mmsize]
    mulps m1, m1, [src0q + lenq]
    mova [dstq + lenq + mmsize], m0
    mova [dstq + lenq], m1
    add src1q, 2*mmsize
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
INIT_YMM avx
VECTOR_FMUL_REVERSE

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
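; Scalar sketch (illustrative):
;
;   for (i = 0; i < len; i++)
;       dst[i] = src0[i] * src1[i] + src2[i];
;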
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova m0, [src0q + lenq]
    mova m1, [src0q + lenq + mmsize]
    mulps m0, m0, [src1q + lenq]
    mulps m1, m1, [src1q + lenq + mmsize]
    addps m0, m0, [src2q + lenq]
    addps m1, m1, [src2q + lenq + mmsize]
    mova [dstq + lenq], m0
    mova [dstq + lenq + mmsize], m1
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
INIT_YMM avx
VECTOR_FMUL_ADD

;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
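; Scalar sketch (illustrative): sums and differences, interleaved.
;
;   for (i = 0; i < len; i++) {
;       float f1 = src0[i], f2 = src1[i];
;       dst[2 * i    ] = f1 + f2;
;       dst[2 * i + 1] = f1 - f2;
;   }
;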
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%if ARCH_X86_64
    movsxd lenq, lend
%endif
    test lenq, lenq
    jz .end
    shl lenq, 2
    lea src0q, [src0q + lenq]
    lea src1q, [src1q + lenq]
    lea dstq, [ dstq + 2*lenq]
    neg lenq
.loop:
    mova m0, [src0q + lenq]
    mova m1, [src1q + lenq]
    subps m2, m0, m1
    addps m0, m0, m1
    unpcklps m1, m0, m2
    unpckhps m0, m0, m2
%if cpuflag(avx)
    vextractf128 [dstq + 2*lenq ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova [dstq + 2*lenq ], m1
    mova [dstq + 2*lenq + mmsize], m0
%endif
    add lenq, mmsize
    jl .loop
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE

INIT_XMM sse2
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova m2, m0
    mova m3, m1
    psllw m0, 8
    psllw m1, 8
    psrlw m2, 8
    psrlw m3, 8
    por m2, m0
    por m3, m1
    mova [r0 + 0], m2
    mova [r0 + 16], m3
    add r1, 32
    add r0, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left
    mov%1 m0, [r1]
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova m2, m0
    psllw m0, 8
    psrlw m2, 8
    por m2, m0
    mova [r0], m2
    add r1, 16
    add r0, 16
%endmacro

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
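;
; Scalar sketch (illustrative; av_bswap32 reverses the byte order of a
; 32-bit word):
;
;   for (i = 0; i < w; i++)
;       dst[i] = av_bswap32(src[i]);
;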
cglobal bswap32_buf, 3,4,5
    mov r3, r1
    and r3, 15
    jz .start_align
    BSWAP_LOOPS_SSE2 u
    jmp .left
.start_align:
    BSWAP_LOOPS_SSE2 a
.left:
    and r2, 3
    jz .end
.loop2:
    mov r3d, [r1]
    bswap r3d
    mov [r0], r3d
    add r1, 4
    add r0, 4
    dec r2
    jnz .loop2
.end:
    RET

; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
    pshufb m0, m2
    pshufb m1, m2
    mova [r0 + 0], m0
    mova [r0 + 16], m1
    add r0, 32
    add r1, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left2
    mov%1 m0, [r1]
    pshufb m0, m2
    mova [r0], m0
    add r1, 16
    add r0, 16
%endmacro

INIT_XMM ssse3
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
    mov r3, r1
    mova m2, [pb_bswap32]
    and r3, 15
    jz .start_align
    BSWAP_LOOPS_SSSE3 u
    jmp .left2
.start_align:
    BSWAP_LOOPS_SSSE3 a
.left2:
    mov r3, r2
    and r2, 2
    jz .left1
    movq m0, [r1]
    pshufb m0, m2
    movq [r0], m0
    add r1, 8
    add r0, 8
.left1:
    and r3, 1
    jz .end
    mov r2d, [r1]
    bswap r2d
    mov [r0], r2d
.end:
    RET