;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times  8 db -1
pb_7:                times  8 db  7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384:            times 4 dd 16384
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT
%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET
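
; For reference, a hedged C sketch of what scalarproduct_int16 above computes
; (the _ref name is illustrative only; order is the element count and is
; assumed to be a multiple of the unrolled vector width):
;
;   static int scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2, int order)
;   {
;       int res = 0;                 /* accumulated in a 32-bit register */
;       while (order--)
;           res += *v1++ * *v2++;
;       return res;
;   }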
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw    m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw     m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
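
; A hedged C sketch of scalarproduct_and_madd_int16 (illustrative _ref name;
; the in-place v1 update wraps at 16 bits, matching pmullw/paddw above):
;
;   static int scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                               const int16_t *v3, int order, int mul)
;   {
;       int res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;              /* dot product part      */
;           *v1++ += mul * *v3++;              /* in-place multiply-add */
;       }
;       return res;
;   }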
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw    m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb  %1, %2
%elif cpuflag(sse2)
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd  %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw  %1, %1, 0x1B
%endif
%endmacro

%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
    mova    %3, %1
    pmulhw  %1, %2
    pmullw  %3, %2
    psrlw   %3, 15
    psllw   %1, 1
    por     %1, %3
%endif
%endmacro
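
; Rough scalar equivalents of the two MUL16FIXED variants above (a sketch only;
; it ignores the saturation that pmulhrsw applies when the product overflows):
;
;   static int16_t mul16_round(int16_t a, int16_t b) /* ssse3: pmulhrsw        */
;   {
;       return (int16_t)((a * b + (1 << 14)) >> 15);
;   }
;
;   static int16_t mul16_trunc(int16_t a, int16_t b) /* mmxext: pmulhw+pmullw  */
;   {
;       return (int16_t)((a * b) >> 15);
;   }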
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea     offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova    m5, [pb_revwords]
    ALIGN   16
%elif %1
    mova    m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    pmulhrsw      m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw      m0, [ inputq+offsetq ]
    mova          [outputq+offset2q], m1
    mova          [outputq+offsetq ], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova          m3, [windowq+offset2q]
    mova          m4, [ inputq+offset2q]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova          [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova          m4, [ inputq+offsetq]
    pxor          m0, m0
    punpcklwd     m0, m3
    punpcklwd     m1, m4
    pmaddwd       m0, m1
    paddd         m0, m5
    psrad         m0, 15
    pxor          m2, m2
    punpckhwd     m2, m3
    punpckhwd     m1, m4
    pmaddwd       m2, m1
    paddd         m2, m5
    psrad         m2, 15
    packssdw      m0, m2
    mova          [outputq+offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova          m0, [windowq+offset2q]
    mova          m1, [ inputq+offset2q]
    mova          m2, [ inputq+offsetq ]
    MUL16FIXED    m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED    m2, m0, m3
    mova          [outputq+offset2q], m1
    mova          [outputq+offsetq ], m2
%endif
    add     offsetd, mmsize
    sub     offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1
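
; A hedged C sketch of the windowing done above, mirroring the scalar reference
; (the asm variants receive a caller-prepared offset rather than len, and the
; mmxext/sse2 apply_window_int16 versions truncate instead of rounding):
;
;   static void apply_window_int16_ref(int16_t *output, const int16_t *input,
;                                      const int16_t *window, unsigned int len)
;   {
;       unsigned int i, len2 = len >> 1;
;       for (i = 0; i < len2; i++) {
;           int w = window[i];   /* first window half, reused (reversed) for the second */
;           output[i]           = (input[i]           * w + (1 << 14)) >> 15;
;           output[len - i - 1] = (input[len - i - 1] * w + (1 << 14)) >> 15;
;       }
;   }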
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
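
; A hedged scalar sketch of the per-byte behavior above (mid_pred_u8 is an
; illustrative helper, not an FFmpeg API; arithmetic wraps modulo 256 just as
; the packed byte instructions do):
;
;   static int mid_pred_u8(int a, int b, int c)
;   {
;       if (a > b) { int t = a; a = b; b = t; }
;       if (b > c)   b = c;
;       return a > b ? a : b;                  /* median of the three */
;   }
;
;   static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;   {
;       int i, l = *left, lt = *left_top;
;       for (i = 0; i < w; i++) {
;           l      = (mid_pred_u8(l, top[i], (l + top[i] - lt) & 0xFF) + diff[i]) & 0xFF;
;           lt     = top[i];
;           dst[i] = l;
;       }
;       *left     = l;
;       *left_top = lt;
;   }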
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0
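
; A hedged C sketch of the left prediction above (a running byte prefix sum
; seeded with left; the _ref name is illustrative only):
;
;   static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                           int w, int left)
;   {
;       int i, acc = left;
;       for (i = 0; i < w; i++) {
;           acc    = (acc + src[i]) & 0xFF;
;           dst[i] = acc;
;       }
;       return acc;                            /* last value, returned in eax */
;   }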
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    neg     offsetq
    shl     offsetq, 2
    sub     v1q, offsetq
    sub     v2q, offsetq
    xorps   xmm0, xmm0
.loop:
    movaps  xmm1, [v1q+offsetq]
    mulps   xmm1, [v2q+offsetq]
    addps   xmm0, xmm1
    add     offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps   xmm0, xmm1
    movss   xmm1, xmm0
    shufps  xmm0, xmm0, 1
    addss   xmm0, xmm1
%if ARCH_X86_64 == 0
    movss   r0m,  xmm0
    fld     dword r0m
%endif
    RET
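
; A hedged C sketch of the dot product above; the SIMD code keeps four partial
; sums and adds them at the end, so float rounding may differ slightly from
; this strictly sequential version:
;
;   static float scalarproduct_float_ref(const float *v1, const float *v2, int len)
;   {
;       float p = 0.0f;
;       int   i;
;       for (i = 0; i < len; i++)
;           p += v1[i] * v2[i];
;       return p;
;   }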
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for the left/right extend. See below for the loop
; function implementations: the fast ones are fixed-width, the slow ones are variable-width.
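;
; Loosely, the job of all the loops below is nearest-pixel clamping: any output
; pixel that falls outside the valid [start_x,end_x) x [start_y,end_y) region
; gets the closest valid pixel. A very rough sketch of that idea (not the real
; calling convention; the caller has already done the coordinate clipping, and
; src_pixel() is purely hypothetical):
;
;   for (y = 0; y < block_h; y++)
;       for (x = 0; x < block_w; x++) {
;           int sx = av_clip(x, start_x, end_x - 1);
;           int sy = av_clip(y, start_y, end_y - 1);
;           buf[y * linesize + x] = src_pixel(sy, sx);
;       }
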
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
    mov     r8, r5          ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov     r4, r4m         ; end_y
    mov     r5, r5m         ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov     w_reg, r7m
    sub     w_reg, r6m      ; w = end_x - start_x
    sub     r5, r4
%if ARCH_X86_64
    sub     r4, r3
%else
    sub     r4, dword r3m
%endif
    cmp     w_reg, 22
    jg .slow_v_extend_loop
%if ARCH_X86_32
    mov     r2, r2m         ; linesize
%endif
    sal     w_reg, 7        ; w * 128
%ifdef PIC
    lea     rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add     w_reg, rax
%else
    lea     w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call    w_reg           ; fast top extend, body copy and bottom extend

.v_extend_end:
    ; horizontal extend (left/right)
    mov     w_reg, r6m      ; start_x
    sub     r0, w_reg
%if ARCH_X86_64
    mov     r3, r0          ; backup of buf+block_h*linesize
    mov     r5, r8
%else
    mov     r0m, r0         ; backup of buf+block_h*linesize
    mov     r5, r5m
%endif
    test    w_reg, w_reg
    jz .right_extend
    cmp     w_reg, 22
    jg .slow_left_extend_loop
    mov     r1, w_reg
    dec     w_reg
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    sar     w_reg, 1
    sal     w_reg, 6
    ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea     rax, [.emuedge_extend_left_2]
    add     w_reg, rax
%else
    lea     w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call    w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
    mov     r0, r0m
    mov     r5, r5m
%endif
    mov     w_reg, r7m      ; end_x
    mov     r1, r8m         ; block_w
    mov     r4, r1
    sub     r1, w_reg
    jz .h_extend_end        ; if (end_x == block_w) goto h_extend_end
    cmp     r1, 22
    jg .slow_right_extend_loop
    dec     r1
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    sar     r1, 1
    sal     r1, 6
%ifdef PIC
    lea     rax, [.emuedge_extend_right_2]
    add     r1, rax
%else
    lea     r1, [.emuedge_extend_right_2+r1]
%endif
    call    r1
.h_extend_end:
    RET

%if ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif

%endmacro
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is done in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%if ARCH_X86_64
    test    r3 , r3                             ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop      ;   goto body
%else ; ARCH_X86_32
    cmp     dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES  top, %%n                    ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:           ; do {
    WRITE_NUM_BYTES top, %%n                    ;   write bytes
    add     r0 , r2                             ;   dst += linesize
%if ARCH_X86_64
    dec     r3d
%else ; ARCH_X86_32
    dec     dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop    ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:            ; do {
    READ_NUM_BYTES  body, %%n                   ;   read bytes
    WRITE_NUM_BYTES body, %%n                   ;   write bytes
    add     r0 , r2                             ;   dst += linesize
    add     r1 , r2                             ;   src += linesize
    dec     r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop     ; } while (--end_y)

    ; copy bottom pixels
    test    r5 , r5                             ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n            ;   goto end
    sub     r1 , r2                             ; src -= linesize
    READ_NUM_BYTES  bottom, %%n                 ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES bottom, %%n                 ;   write bytes
    add     r0 , r2                             ;   dst += linesize
    dec     r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

%macro READ_V_PIXEL 2
    mov         vall, %2
    mov         valh, vall
%if %1 >= 8
    movd        mm0, vald
%if cpuflag(mmxext)
    pshufw      mm0, mm0, 0
%else ; mmx
    punpcklwd   mm0, mm0
    punpckldq   mm0, mm0
%endif ; mmxext/mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov  [%2+%%dst_off]  , valw
    mov  [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov  [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:           ; do {
    sub     r0, r2                      ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+r1]          ;   read pixels
    WRITE_V_PIXEL %%n, r0               ;   write pixels
    dec     r5
    jnz .emuedge_extend_left_ %+ %%n    ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:          ; do {
%if ARCH_X86_64
    sub     r3, r2                      ;   dst -= linesize
    READ_V_PIXEL  %%n, [r3+w_reg-1]     ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
    dec     r8
%else ; ARCH_X86_32
    sub     r0, r2                      ;   dst -= linesize
    READ_V_PIXEL  %%n, [r0+w_reg-1]     ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
    dec     r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. xmm registers could also be used on x86-64,
; but that path has not been optimized as much yet (i.e. FIXME)
%macro V_COPY_NPX 4-5
%if %0 == 4
    test    w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3      %2, [r1+cnt_reg]
    %3      [r0+cnt_reg], %2
    add     cnt_reg, %4
%if %0 == 5
    sub     w_reg, %4
    test    w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub     r1, linesize
%endif
.%1_copy_loop:
    xor     cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1, mm0,  movq,    8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax,  mov,     8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0,  movq,    8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov,     4
    V_COPY_NPX %1, valw, mov,     2
    V_COPY_NPX %1, vall, mov,     1
    mov     w_reg, cnt_reg
%ifidn %1, body
    add     r1, linesize
%endif
    add     r0, linesize
    dec     %2
    jnz .%1_copy_loop
%endmacro
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
    push    r8              ; save old value of block_h
    test    r3, r3
%define cnt_reg r8
    jz .do_body_copy        ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp     dword r3m, 0
%define cnt_reg r2
    je .do_body_copy        ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif

.do_body_copy:
    V_COPY_ROW body, r4

%if ARCH_X86_64
    pop     r8              ; restore old value of block_h
%define cnt_reg r3
%endif
    test    r5, r5
%if ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
    mov     r2, r2m
%endif
    jmp .v_extend_end
%endmacro
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    mov     r4, 8
    sub     r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq    [r0+r4-8], mm0
    add     r4, 8
    cmp     r4, w_reg
    jle .left_extend_8px_loop
    sub     r4, 8
    cmp     r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov     [r0+r4], valw
    add     r4, 2
    cmp     r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec     r5
    jnz .slow_left_extend_loop
%if ARCH_X86_32
    mov     r2, r2m
%endif
    jmp .right_extend
%endmacro
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea     r1, [r4-8]
    sub     buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq    [buf_reg+r1], mm0
    sub     r1, 8
    cmp     r1, w_reg
    jge .right_extend_8px_loop
    add     r1, 8
    cmp     r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub     r1, 2
    mov     [buf_reg+r1], valw
    cmp     r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec     bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
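
; A hedged C sketch of the operation (illustrative _ref name):
;
;   static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
;                                     int32_t min, int32_t max, unsigned int len)
;   {
;       unsigned int i;
;       for (i = 0; i < len; i++) {
;           int32_t v = src[i];
;           dst[i] = v < min ? min : v > max ? max : v;
;       }
;   }
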
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd     m4, minm
    movd     m5, maxm
%endif
    SPLATD   m4
    SPLATD   m5
.loop:
%assign %%i 1
%rep %2
    mova     m0,  [srcq+mmsize*0*%%i]
    mova     m1,  [srcq+mmsize*1*%%i]
    mova     m2,  [srcq+mmsize*2*%%i]
    mova     m3,  [srcq+mmsize*3*%%i]
%if %3
    mova     m7,  [srcq+mmsize*4*%%i]
    mova     m8,  [srcq+mmsize*5*%%i]
    mova     m9,  [srcq+mmsize*6*%%i]
    mova     m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD    m0,  m4, m5, m6
    CLIPD    m1,  m4, m5, m6
    CLIPD    m2,  m4, m5, m6
    CLIPD    m3,  m4, m5, m6
%if %3
    CLIPD    m7,  m4, m5, m6
    CLIPD    m8,  m4, m5, m6
    CLIPD    m9,  m4, m5, m6
    CLIPD    m10, m4, m5, m6
%endif
    mova     [dstq+mmsize*0*%%i], m0
    mova     [dstq+mmsize*1*%%i], m1
    mova     [dstq+mmsize*2*%%i], m2
    mova     [dstq+mmsize*3*%%i], m3
%if %3
    mova     [dstq+mmsize*4*%%i], m7
    mova     [dstq+mmsize*5*%%i], m8
    mova     [dstq+mmsize*6*%%i], m9
    mova     [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add      srcq, mmsize*4*(%2+%3)
    add      dstq, mmsize*4*(%2+%3)
    sub      lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
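
; A hedged C sketch (illustrative _ref name): dst[i] = src0[i] * src1[len-1-i]
;
;   static void vector_fmul_reverse_ref(float *dst, const float *src0,
;                                       const float *src1, int len)
;   {
;       int i;
;       for (i = 0; i < len; i++)
;           dst[i] = src0[i] * src1[len - 1 - i];
;   }
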
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea     lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova        m0, [src1q]
    mova        m1, [src1q + mmsize]
    shufps      m0, m0, q0123
    shufps      m1, m1, q0123
%endif
    mulps       m0, m0, [src0q + lenq + mmsize]
    mulps       m1, m1, [src0q + lenq]
    mova        [dstq + lenq + mmsize], m0
    mova        [dstq + lenq], m1
    add         src1q, 2*mmsize
    sub         lenq,  2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
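
; A hedged C sketch (illustrative _ref name):
;
;   static void vector_fmul_add_ref(float *dst, const float *src0,
;                                   const float *src1, const float *src2, int len)
;   {
;       int i;
;       for (i = 0; i < len; i++)
;           dst[i] = src0[i] * src1[i] + src2[i];
;   }
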
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea     lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src0q + lenq + mmsize]
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1
    sub     lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
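
; A hedged C sketch of the butterfly + interleave (illustrative _ref name):
;
;   static void butterflies_float_interleave_ref(float *dst, const float *src0,
;                                                const float *src1, int len)
;   {
;       int i;
;       for (i = 0; i < len; i++) {
;           float a = src0[i], b = src1[i];
;           dst[2 * i]     = a + b;
;           dst[2 * i + 1] = a - b;
;       }
;   }
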
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%if ARCH_X86_64
    movsxd  lenq, lend
%endif
    test    lenq, lenq
    jz .end
    shl     lenq, 2
    lea     src0q, [src0q + lenq]
    lea     src1q, [src1q + lenq]
    lea     dstq,  [ dstq + 2*lenq]
    neg     lenq
.loop:
    mova    m0, [src0q + lenq]
    mova    m1, [src1q + lenq]
    subps   m2, m0, m1
    addps   m0, m0, m1
    unpcklps m1, m0, m2
    unpckhps m0, m0, m2
%if cpuflag(avx)
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova    [dstq + 2*lenq         ], m1
    mova    [dstq + 2*lenq + mmsize], m0
%endif
    add     lenq, mmsize
    jl .loop
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
%endif
; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov     r3, r2
    sar     r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1   m0, [r1 +  0]
    mov%1   m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb  m0, m2
    pshufb  m1, m2
    mova    [r0 +  0], m0
    mova    [r0 + 16], m1
%else
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova    m2, m0
    mova    m3, m1
    psllw   m0, 8
    psllw   m1, 8
    psrlw   m2, 8
    psrlw   m3, 8
    por     m2, m0
    por     m3, m1
    mova    [r0 +  0], m2
    mova    [r0 + 16], m3
%endif
    add     r0, 32
    add     r1, 32
    dec     r2
    jnz .loop8_%1
.left4_%1:
    mov     r2, r3
    and     r3, 4
    jz .left
    mov%1   m0, [r1]
%if cpuflag(ssse3)
    pshufb  m0, m2
    mova    [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova    m2, m0
    psllw   m0, 8
    psrlw   m2, 8
    por     m2, m0
    mova    [r0], m2
%endif
    add     r1, 16
    add     r0, 16
%endmacro
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
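;
; A hedged C sketch of the operation (illustrative _ref name; av_bswap32() is
; the byte-swap helper from libavutil/bswap.h):
;
;   static void bswap32_buf_ref(uint32_t *dst, const uint32_t *src, int w)
;   {
;       int i;
;       for (i = 0; i < w; i++)
;           dst[i] = av_bswap32(src[i]);
;   }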
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov     r3, r1
    mova    m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov     r3, r1
%endif
    and     r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov     r3, r2
    and     r2, 2
    jz .left1
    movq    m0, [r1]
    pshufb  m0, m2
    movq    [r0], m0
    add     r1, 8
    add     r0, 8
.left1:
    and     r3, 1
    jz .end
    mov     r2d, [r1]
    bswap   r2d
    mov     [r0], r2d
%else
    and     r2, 3
    jz .end
.loop2:
    mov     r3d, [r1]
    bswap   r3d
    mov     [r0], r3d
    add     r1, 4
    add     r0, 4
    dec     r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF