You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1382 lines
35KB

  1. ;******************************************************************************
  2. ;* MMX optimized DSP utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; pshufb control vectors: a -1 (0x80 bit set) control byte writes zero,
; so the "zz" positions in the names denote zeroed output bytes.
pb_f:                times 16 db 15  ; broadcast byte 15 (last byte of an xmm reg)
; NOTE: pb_zzzzzzzz77777777 deliberately overlaps the following pb_7 label —
; its high 8 control bytes are pb_7's data (zero low qword, broadcast byte 7 high).
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7    ; broadcast byte 7 (last byte of an mm reg)
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords:         db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 ; reverse 8 words
pd_16384:            times 4 dd 16384 ; rounding bias: 1 << 14
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 ; per-dword byteswap

SECTION_TEXT
; Emits two dot-product kernels for the current register size (mmsize):
; both walk the vectors back-to-front via a negative index and accumulate
; signed 16x16->32 products with pmaddwd.
; order is assumed to be a multiple of 2*mmsize/2 elements (2*mmsize bytes
; are consumed per iteration).
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
    shl orderq, 1                       ; element count -> byte count
    add v1q, orderq                     ; point both pointers past the end,
    add v2q, orderq                     ; then index with a negative offset
    neg orderq
    pxor m2, m2                         ; m2 = packed dword accumulators
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]          ; pairwise 16-bit mul, add into dwords
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop                            ; until the index reaches 0
; horizontal sum of the dword lanes of m2 -> eax
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
; Returns dot(v1, v2) and simultaneously performs v1 += mul * v3.
; v1 must be mmsize-aligned (mova loads/stores); v2/v3 may be unaligned (movu).
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1                       ; element count -> byte count
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0                   ; broadcast mul to all 8 words
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0                    ; broadcast mul to all 4 words
%endif
    pxor m6, m6                         ; m6 = packed dword accumulators
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4                      ; accumulate v1.v2
    pmaddwd m1, m5
    pmullw m2, m7                       ; v3 * mul (low 16 bits, wrapping)
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4                        ; v1 += v3 * mul
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
; horizontal sum of m6 -> eax
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
; One unaligned-offset variant of the ssse3 scalarproduct_and_madd loop body.
; %1 = misalignment of v2/v3 in bytes (0..14, even). For %1 != 0, v2/v3 have
; been rounded down to alignment by the caller and palignr reconstitutes the
; unaligned data from two aligned loads; m4/m5 carry the previous aligned
; qword-pair across iterations (the loop walks backwards: sub, then jg).
; Register contract (set up by the caller below): m7 = broadcast mul,
; m6 = dword accumulators, t0/t1 = current v1 data.
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1                  ; splice unaligned v2 from aligned pairs
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1                  ; same for v3
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova m8, t0                         ; x86-64 has spare regs: load v1 once
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0                      ; accumulate v1.v2
    pmaddwd m1, t1
    pmullw m2, m7                       ; v3 * mul
    pmullw m3, m7
    paddw m2, t0                        ; v1 += v3 * mul
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end                            ; non-zero variants sit above .end; skip
%endif                                  ; the remaining emitted variants
%endmacro
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
; SSSE3 version: rounds v2/v3 down to 16-byte alignment and dispatches to a
; palignr-based loop specialized for the misalignment.
; NOTE(review): the misalignment is computed from v2 only but applied to both
; v2 and v3 — this assumes ((v2 ^ v3) & 15) == 0 (same misalignment); confirm
; against callers. v1 must be 16-byte aligned. Misalignment must be even
; (int16 data), hence the even-only dispatch below.
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1                       ; element count -> byte count
    movd m7, mulm
    pshuflw m7, m7, 0                   ; broadcast mul to all 8 words
    punpcklqdq m7, m7
    pxor m6, m6                         ; m6 = dword accumulators
    mov r4d, v2d
    and r4d, 15                         ; r4d = misalignment of v2 (and v3)
    and v2q, ~15                        ; round both down to alignment
    and v3q, ~15
    mova m4, [v2q + orderq]             ; prime the cross-iteration registers
    mova m5, [v3q + orderq]             ; used by the palignr variants
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    ; fall through to .loop14 (the first emitted variant)
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
; horizontal sum of m6 -> eax
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------

; Reverse the order of the 16-bit words in %1.
; 0x1B = shuffle order 3,2,1,0 (reverses 4 words); the SSE2 version reverses
; each half with pshuflw/pshufhw, then swaps the halves (0x4E = 1,0,3,2).
%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw   %1, %1, 0x1B
%endmacro
%macro REVERSE_WORDS_SSE2 1-2
    pshuflw  %1, %1, 0x1B
    pshufhw  %1, %1, 0x1B
    pshufd   %1, %1, 0x4E
%endmacro
%macro REVERSE_WORDS_SSSE3 2
    pshufb   %1, %2                     ; %2 = pb_revwords control vector
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova     %3, %1
    pmulhw   %1, %2                     ; high 16 bits of the product
    pmullw   %3, %2                     ; low 16 bits of the product
    psrlw    %3, 15                     ; bit 15 of the low half = bit 0 of result
    psllw    %1, 1
    por      %1, %3
%endmacro

; dst = ((dst * src) + (1<<14)) >> 15  (rounded; single instruction on SSSE3)
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw %1, %2
%endmacro
; Apply a symmetric 16-bit fixed-point window. Two cursors run toward the
; ends of the buffer: offset2 walks down from len-mmsize over the first half,
; offset walks up from len over the second half, with the window words
; reversed for the second half (window stores one half of a symmetric
; window — presumably; TODO confirm against the C caller).
; All pointers must be mmsize-aligned (mova accesses).
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea offset2q, [offsetq-mmsize]      ; offset2 = len - mmsize
%if %2
    mova m5, [pd_16384]                 ; rounding bias for the bit-exact path
%elifidn %1, ssse3
    mova m5, [pb_revwords]              ; pshufb control for REVERSE_WORDS_SSSE3
    ALIGN 16
%endif
.loop:
%if %2
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
; The punpcklwd/punpckhwd trick leaves garbage in m1's even word lanes, but
; m0/m2 have zeros in the matching lanes, so pmaddwd multiplies the garbage
; by zero.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3                    ; same computation for the mirrored half
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%elif %3
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize                 ; second-half cursor moves up
    sub offset2d, mmsize                ; first-half cursor moves down
    jae .loop                           ; until offset2 underflows past 0
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED   MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext,     0, 0
APPLY_WINDOW_INT16 mmxext_ba,  1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2,       0, 0
APPLY_WINDOW_INT16 sse2_ba,    1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1     ; still uses SSE2 REVERSE_WORDS (faster on Atom)
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3,      0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
; HuffYUV median prediction: dst[i] = median(l, t, l+t-tl) + diff[i], where
; l is the previous output pixel, t the pixel above, tl the pixel above-left.
; Processes 8 pixels per outer iteration (w is consumed in steps of 8); the
; inner %rep serializes the 8 pixels because each depends on the previous
; output. Updates *left and *left_top for the next call.
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]                    ; t[0..7]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8                        ; t shifted left one byte
    movq mm1, mm0
    por mm4, mm2                        ; tl[0..7] = {*left_top, t[0..6]}
    movd mm3, [leftq]                   ; l seed = *left
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq                              ; index from -w up to 0
    jmp .skip                           ; first iteration's t/tl already set up
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1                        ; tl = {last t of prev group, t[0..6]}
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
; process the 8 bytes serially: mm0=t-tl, mm1=t, mm2=diff, mm3=running l;
; each result byte is shifted into mm7 from the top
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56                       ; result byte 0 -> top of mm7
%else
    movq mm6, mm3
    psrlq mm7, 8                        ; make room, then merge result byte i
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8                        ; advance to the next input byte
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]            ; persist l / tl state for next call
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
; Core loop of add_hfyu_left_prediction: computes a per-vector byte prefix
; sum (dst[i] = src[i] + dst[i-1]) in log2(mmsize) shift+add/pshufb steps,
; then adds the running carry-in held in m0.
; Register contract (set by the callers below): m0 = carry (previous "left"
; in the last byte), m3/m4 = pshufb spread masks, m5 = broadcast-last-byte
; mask, m6 (xmm only) = pb_zzzzzzzz77777777.
; Returns the final left value in eax. %1 = is_aligned (dst).
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add srcq, wq
    add dstq, wq
    neg wq                              ; index from -w up to 0
%%.loop:
    mova m1, [srcq+wq]
    mova m2, m1
    psllw m1, 8                         ; step 1: add neighbor 1 byte away
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3                       ; step 2: add partial sums 2 bytes away
    paddb m1, m2
    pshufb m0, m5                       ; broadcast previous carry byte to all lanes
    mova m2, m1
    pshufb m1, m4                       ; step 3: add partial sums 4 bytes away
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6                       ; step 4 (xmm only): 8 bytes away
    paddb m1, m2
%endif
    paddb m0, m1                        ; add carry-in; m0 now holds the outputs
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0                  ; unaligned store as two halves
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
; extract the last output byte (the new "left") into eax;
; w may overshoot 0, so index back by the overshoot
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
; ssse3 version works on mm registers (mmsize == 8); the sse4 version uses
; xmm but falls back to the ssse3 body when src is not 16-byte aligned.
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:                          ; entry point for the sse4 fallback
    mova m5, [pb_7]                     ; broadcast-last-byte mask (mm)
    mova m4, [pb_zzzz3333zzzzbbbb]      ; spread masks for the prefix sum
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56                        ; carry-in byte -> top byte of mm0
    ADD_HFYU_LEFT_LOOP 1                ; ends in RET inside the macro

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f]                     ; broadcast-last-byte mask (xmm)
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15                       ; carry-in byte -> top byte of xmm0
    test srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue ; unaligned src: use mm version
    test dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1                ; aligned dst (RETs inside the macro)
.unaligned:
    ADD_HFYU_LEFT_LOOP 0                ; unaligned dst
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
; Dot product of two float vectors. Both must be 16-byte aligned (movaps)
; and len a multiple of 4. Result in xmm0 (x86-64) or on the x87 stack
; (x86-32, via the store/fld below).
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq                         ; negative element index ...
    shl offsetq, 2                      ; ... scaled to bytes
    sub v1q, offsetq                    ; point past the end of each vector
    sub v2q, offsetq
    xorps xmm0, xmm0                    ; accumulator
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
; horizontal sum of the 4 float lanes
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%if ARCH_X86_64 == 0
    movss r0m, xmm0                     ; x86-32 ABI returns float in st(0)
    fld dword r0m
%endif
    RET
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
    mov r8, r5          ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov r4, r4m         ; end_y
    mov r5, r5m         ; block_h
%endif

; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m      ; w = end_x - start_x
    sub r5, r4          ; block_h -= end_y (bottom rows to extend)
%if ARCH_X86_64
    sub r4, r3          ; end_y -= start_y (body rows to copy)
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop              ; wide rows: variable-width slow path
%if ARCH_X86_32
    mov r2, r2m         ; linesize
%endif
    sal w_reg, 7        ; w * 128
; each fast fixed-width variant emitted by VERTICAL_EXTEND is padded to
; 128 bytes (ALIGN 128) and numbered from 1, so its address is
; base + (w-1)*128, expressed below as (base - 128) + w*128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg          ; fast top extend, body copy and bottom extend
.v_extend_end:

; horizontal extend (left/right)
    mov w_reg, r6m      ; start_x
    sub r0, w_reg       ; buf -= start_x
%if ARCH_X86_64
    mov r3, r0          ; backup of buf+block_h*linesize
    mov r5, r8
%else
    mov r0m, r0         ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend    ; no left extend needed
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar w_reg, 1        ; LEFT_EXTEND variants exist for even widths only,
    sal w_reg, 6        ; 64 bytes apart (ALIGN 64): index = (w-1)/2*64
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg

; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m      ; end_x
    mov r1, r8m         ; block_w
    mov r4, r1
    sub r1, w_reg       ; r1 = right-extend width
    jz .h_extend_end    ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar r1, 1           ; same even-width, 64-byte-strided dispatch as left
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

; register aliases for the byte/word/dword views of the "val" scratch
; register used by READ/WRITE_NUM_BYTES and READ/WRITE_V_PIXEL
%if ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%if cpuflag(sse)                        ; 16-byte chunks via xmm regs
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%src_off) == 8                 ; exactly 8 left: use rax, not mm0
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8                   ; remaining 8-byte chunks via mm regs
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%src_off) == 4                 ; exactly 4 left: use the gp val reg
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3               ; 3 left: a word (in a reg chosen per
%ifidn %1, top                          ; loop body, see header comment) + a byte
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
; Mirror of READ_NUM_BYTES: writes the registers filled by a matching
; READ_NUM_BYTES %1, %2 out to [r0]. %1 (top/body/bottom) must match the
; read so the same word register (valw2/3/4) is used for the 3-byte tail.
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%if cpuflag(sse)                        ; 16-byte chunks from xmm regs
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8                   ; remaining 8-byte chunks from mm regs
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
  684. ; vertical top/bottom extend and body copy fast loops
  685. ; these are function pointers to set-width line copy functions, i.e.
  686. ; they read a fixed number of pixels into set registers, and write
  687. ; those out into the destination buffer
  688. ; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
  689. ; r6(eax/64)/r3(ebx/32)=val_reg
  690. %macro VERTICAL_EXTEND 0
  691. %assign %%n 1
  692. %rep 22
  693. ALIGN 128
  694. .emuedge_v_extend_ %+ %%n:
  695. ; extend pixels above body
  696. %if ARCH_X86_64
  697. test r3 , r3 ; if (!start_y)
  698. jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
  699. %else ; ARCH_X86_32
  700. cmp dword r3m, 0
  701. je .emuedge_copy_body_ %+ %%n %+ _loop
  702. %endif ; ARCH_X86_64/32
  703. READ_NUM_BYTES top, %%n ; read bytes
  704. .emuedge_extend_top_ %+ %%n %+ _loop: ; do {
  705. WRITE_NUM_BYTES top, %%n ; write bytes
  706. add r0 , r2 ; dst += linesize
  707. %if ARCH_X86_64
  708. dec r3d
  709. %else ; ARCH_X86_32
  710. dec dword r3m
  711. %endif ; ARCH_X86_64/32
  712. jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
  713. ; copy body pixels
  714. .emuedge_copy_body_ %+ %%n %+ _loop: ; do {
  715. READ_NUM_BYTES body, %%n ; read bytes
  716. WRITE_NUM_BYTES body, %%n ; write bytes
  717. add r0 , r2 ; dst += linesize
  718. add r1 , r2 ; src += linesize
  719. dec r4d
  720. jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
  721. ; copy bottom pixels
  722. test r5 , r5 ; if (!block_h)
  723. jz .emuedge_v_extend_end_ %+ %%n ; goto end
  724. sub r1 , r2 ; src -= linesize
  725. READ_NUM_BYTES bottom, %%n ; read bytes
  726. .emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
  727. WRITE_NUM_BYTES bottom, %%n ; write bytes
  728. add r0 , r2 ; dst += linesize
  729. dec r5d
  730. jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
  731. .emuedge_v_extend_end_ %+ %%n:
  732. %if ARCH_X86_64
  733. ret
  734. %else ; ARCH_X86_32
  735. rep ret
  736. %endif ; ARCH_X86_64/32
  737. %assign %%n %%n+1
  738. %endrep
  739. %endmacro VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8

; Read one edge pixel (%2) and duplicate it: valw = pixel*0x0101, and, for
; %1 >= 8, mm0 = pixel broadcast to all 8 bytes.
%macro READ_V_PIXEL 2
    mov vall, %2
    mov valh, vall                      ; valw = pixel repeated twice
%if %1 >= 8
    movd mm0, vald
%if cpuflag(mmx2)
    pshufw mm0, mm0, 0                  ; broadcast the low word
%else ; mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%endif ; mmx2/mmx
%endif ; %1 >= 8
%endmacro

; Write %1 copies of the pixel prepared by READ_V_PIXEL to [%2]:
; 8-byte chunks from mm0, then a 4-byte and/or 2-byte tail from mm0/valw.
%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off]  , valw          ; no mm0 prepared below 8 pixels:
    mov [%2+%%dst_off+2], valw          ; write the word pair twice
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
; Emits fixed-width left-extend variants for even widths 2..22, each padded
; to 64 bytes (ALIGN 64) so EMU_EDGE_FUNC can index them as base+(w-1)/2*64.
; Each variant walks the rows bottom-up, replicating the leftmost body pixel.
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:           ; do {
    sub r0, r2                          ;   dst -= linesize
    READ_V_PIXEL %%n, [r0+r1]           ;   read pixels
    WRITE_V_PIXEL %%n, r0               ;   write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n    ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
; Right-hand counterpart of LEFT_EXTEND: same even widths 2..22, 64-byte
; stride, but replicates the rightmost body pixel ([buf+end_x-1]) out to
; block_w. Register roles differ between 64- and 32-bit builds (see above).
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:          ; do {
%if ARCH_X86_64
    sub r3, r2                          ;   dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1]      ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n        ;   write pixels
    dec r8
%else ; ARCH_X86_32
    sub r0, r2                          ;   dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1]      ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n        ;   write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n   ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)

; Copy %4 bytes from [r1+cnt] to [r0+cnt] using register %2 and move insn %3.
; 4-arg form: a single conditional copy (if w & %4).
; 5-arg form: a loop that repeats while (w & %5) != 0, i.e. while at least
; %4 bytes remain (%5 is the mask ~(%4-1) as a 32-bit constant).
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro
; Copy %2 rows of w_reg bytes from r1 to r0 (both advance by linesize per
; row for "body"; only r0 advances for top/bottom extend, and "bottom"
; first steps r1 back one row to re-read the last body row).
; Chunks each row: 16 bytes (sse) or 8 bytes, then 4/2/1-byte tails.
; Note: `linesize` is %defined inside this macro per cpu/arch; the
; `sub r1, linesize` at the top uses the definition left over from the
; previous expansion — the definitions are identical per build, so this
; is consistent in practice.
%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax , mov,     8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0 , movq,    8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg                  ; restore w (consumed by the 16/8 loop)
%ifidn %1, body
    add r1, linesize                    ; src advances only for the body copy
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro
; Variable-width vertical extend + body copy, used when w > 22:
; top extend (repeat first src row start_y times), body copy (end_y rows),
; bottom extend (repeat last body row block_h times). Built on V_COPY_ROW.
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
    push r8                             ; save old value of block_h
    test r3, r3
%define cnt_reg r8
    jz .do_body_copy                    ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy                    ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif

.do_body_copy:
    V_COPY_ROW body, r4

%if ARCH_X86_64
    pop r8                              ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%if ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m                         ; cnt_reg aliased r2 on x86-32; reload linesize
%endif
    jmp .v_extend_end
%endmacro
; Variable-width left extend, used when start_x > 22: per row, broadcast
; the leftmost body pixel (READ_V_PIXEL 8) and store it in 8-byte chunks,
; finishing the tail with 2-byte stores.
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]          ; mm0/valw = leftmost body pixel
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end           ; no sub-8-byte tail remaining
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%if ARCH_X86_32
    mov r2, r2m                         ; reload linesize for .right_extend
%endif
    jmp .right_extend
%endmacro
; Variable-width right extend, used when block_w - end_x > 22: per row,
; broadcast the rightmost body pixel and fill from block_w down to end_x
; in 8-byte chunks, then a 2-byte tail loop. Mirrors SLOW_LEFT_EXTEND.
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]                      ; start at block_w-8, fill downwards
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]   ; mm0/valw = rightmost body pixel
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end           ; landed exactly on end_x: no tail
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro
; Instantiate the complete emu_edge_core function for one instruction set:
; the dispatcher (EMU_EDGE_FUNC) followed by all fast fixed-width variants
; and the slow variable-width fallbacks it calls/jumps into. The pieces
; must be emitted together since they share local labels.
%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%if ARCH_X86_32
emu_edge mmx                            ; 64-bit always has SSE; mmx build is 32-bit only
%endif
  988. ;-----------------------------------------------------------------------------
  989. ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
  990. ; int32_t max, unsigned int len)
  991. ;-----------------------------------------------------------------------------
  992. ; %1 = number of xmm registers used
  993. ; %2 = number of inline load/process/store loops per asm loop
  994. ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
  995. ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
  996. ; %5 = suffix
  997. %macro VECTOR_CLIP_INT32 4-5
  998. cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
  999. %if %4
  1000. cvtsi2ss m4, minm
  1001. cvtsi2ss m5, maxm
  1002. %else
  1003. movd m4, minm
  1004. movd m5, maxm
  1005. %endif
  1006. SPLATD m4
  1007. SPLATD m5
  1008. .loop:
  1009. %assign %%i 1
  1010. %rep %2
  1011. mova m0, [srcq+mmsize*0*%%i]
  1012. mova m1, [srcq+mmsize*1*%%i]
  1013. mova m2, [srcq+mmsize*2*%%i]
  1014. mova m3, [srcq+mmsize*3*%%i]
  1015. %if %3
  1016. mova m7, [srcq+mmsize*4*%%i]
  1017. mova m8, [srcq+mmsize*5*%%i]
  1018. mova m9, [srcq+mmsize*6*%%i]
  1019. mova m10, [srcq+mmsize*7*%%i]
  1020. %endif
  1021. CLIPD m0, m4, m5, m6
  1022. CLIPD m1, m4, m5, m6
  1023. CLIPD m2, m4, m5, m6
  1024. CLIPD m3, m4, m5, m6
  1025. %if %3
  1026. CLIPD m7, m4, m5, m6
  1027. CLIPD m8, m4, m5, m6
  1028. CLIPD m9, m4, m5, m6
  1029. CLIPD m10, m4, m5, m6
  1030. %endif
  1031. mova [dstq+mmsize*0*%%i], m0
  1032. mova [dstq+mmsize*1*%%i], m1
  1033. mova [dstq+mmsize*2*%%i], m2
  1034. mova [dstq+mmsize*3*%%i], m3
  1035. %if %3
  1036. mova [dstq+mmsize*4*%%i], m7
  1037. mova [dstq+mmsize*5*%%i], m8
  1038. mova [dstq+mmsize*6*%%i], m9
  1039. mova [dstq+mmsize*7*%%i], m10
  1040. %endif
  1041. %assign %%i %%i+1
  1042. %endrep
  1043. add srcq, mmsize*4*(%2+%3)
  1044. add dstq, mmsize*4*(%2+%3)
  1045. sub lend, mmsize*(%2+%3)
  1046. jg .loop
  1047. REP_RET
  1048. %endmacro
; Instantiations: select the SPLATD/CLIPD variants for each instruction set.
INIT_MMX mmx
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
%define SPLATD SPLATD_SSE2
; _int variant: keeps the integer (MMX-style) clip sequence, only the
; loads/stores are SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1       ; float-based clip (bounds via cvtsi2ss)
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8                          ; x86-64: registers m7-m10 are available,
VECTOR_CLIP_INT32 11, 1, 1, 0      ; so use the 8-vectors-per-loop variant
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
  1065. ;-----------------------------------------------------------------------------
  1066. ; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
  1067. ; int len)
  1068. ;-----------------------------------------------------------------------------
  1069. %macro VECTOR_FMUL_REVERSE 0
  1070. cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
  1071. lea lenq, [lend*4 - 2*mmsize]
  1072. ALIGN 16
  1073. .loop
  1074. %if cpuflag(avx)
  1075. vmovaps xmm0, [src1q + 16]
  1076. vinsertf128 m0, m0, [src1q], 1
  1077. vshufps m0, m0, m0, q0123
  1078. vmovaps xmm1, [src1q + mmsize + 16]
  1079. vinsertf128 m1, m1, [src1q + mmsize], 1
  1080. vshufps m1, m1, m1, q0123
  1081. %else
  1082. mova m0, [src1q]
  1083. mova m1, [src1q + mmsize]
  1084. shufps m0, m0, q0123
  1085. shufps m1, m1, q0123
  1086. %endif
  1087. mulps m0, m0, [src0q + lenq + mmsize]
  1088. mulps m1, m1, [src0q + lenq]
  1089. mova [dstq + lenq + mmsize], m0
  1090. mova [dstq + lenq], m1
  1091. add src1q, 2*mmsize
  1092. sub lenq, 2*mmsize
  1093. jge .loop
  1094. %if mmsize == 32
  1095. vzeroupper
  1096. RET
  1097. %else
  1098. REP_RET
  1099. %endif
  1100. %endmacro
; Instantiate the SSE version always, the AVX version when supported.
INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif
  1107. ;-----------------------------------------------------------------------------
; void vector_fmul_add(float *dst, const float *src0, const float *src1,
;                      const float *src2, int len)
  1110. ;-----------------------------------------------------------------------------
  1111. %macro VECTOR_FMUL_ADD 0
  1112. cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
  1113. lea lenq, [lend*4 - 2*mmsize]
  1114. ALIGN 16
  1115. .loop
  1116. mova m0, [src0q + lenq]
  1117. mova m1, [src0q + lenq + mmsize]
  1118. mulps m0, m0, [src1q + lenq]
  1119. mulps m1, m1, [src1q + lenq + mmsize]
  1120. addps m0, m0, [src2q + lenq]
  1121. addps m1, m1, [src2q + lenq + mmsize]
  1122. mova [dstq + lenq], m0
  1123. mova [dstq + lenq + mmsize], m1
  1124. sub lenq, 2*mmsize
  1125. jge .loop
  1126. %if mmsize == 32
  1127. vzeroupper
  1128. RET
  1129. %else
  1130. REP_RET
  1131. %endif
  1132. %endmacro
; Instantiate the SSE version always, the AVX version when supported.
INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
  1139. ;-----------------------------------------------------------------------------
  1140. ; void ff_butterflies_float_interleave(float *dst, const float *src0,
  1141. ; const float *src1, int len);
  1142. ;-----------------------------------------------------------------------------
; For each i: dst[2i] = src0[i] + src1[i], dst[2i+1] = src0[i] - src1[i].
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend         ; sign-extend the 32-bit len argument
%endif
    test      lenq, lenq
    jz .end                      ; nothing to do for len == 0
    shl       lenq, 2            ; len in bytes
    lea     src0q, [src0q + lenq]
    lea     src1q, [src1q + lenq]
    lea      dstq, [ dstq + 2*lenq]  ; dst receives twice as many floats
    neg      lenq                ; loop counter runs from -len up to 0
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1       ; m2 = src0 - src1
    addps       m0, m0, m1       ; m0 = src0 + src1
    unpcklps    m1, m0, m2       ; interleave sums with differences (low)
    unpckhps    m0, m0, m2       ; interleave sums with differences (high)
%if cpuflag(avx)
    ; unpck* shuffles within 128-bit lanes only, so store the four xmm
    ; halves explicitly in the order that undoes the lane split
    vextractf128 [dstq + 2*lenq     ], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova        [dstq + 2*lenq         ], m1
    mova        [dstq + 2*lenq + mmsize], m0
%endif
    add         lenq, mmsize
    jl .loop
%if mmsize == 32
    vzeroupper                   ; ymm registers were used; clear upper halves
    RET
%endif
.end:
    REP_RET
%endmacro
; Instantiate the SSE version always, the AVX version when supported.
INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
%if HAVE_AVX
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
%endif
INIT_XMM sse2
; %1 = aligned/unaligned ('a'/'u') load suffix.
; Byteswaps 8 dwords per iteration, then up to 4 more; on entry r1 = src,
; r0 = dst, r2 = dword count.  Falls through / jumps to the '.left' label,
; which must be provided by the instantiating function (scalar tail).
%macro BSWAP_LOOPS_SSE2 1
    mov      r3, r2
    sar      r2, 3             ; r2 = number of 8-dword iterations
    jz .left4_%1
.loop8_%1:
    mov%1    m0, [r1 + 0]
    mov%1    m1, [r1 + 16]
    ; pre-SSSE3 there is no pshufb: first swap the 16-bit halves of each
    ; dword (pshuflw/pshufhw), then swap the bytes within each word via
    ; shift + or
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mova     [r0 + 0], m2
    mova     [r0 + 16], m3
    add      r1, 32
    add      r0, 32
    dec      r2
    jnz .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4             ; one more 4-dword vector left?
    jz .left                   ; label defined by the instantiating function
    mov%1    m0, [r1]
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mova     [r0], m2
    add      r1, 16
    add      r0, 16
%endmacro
  1228. ; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
  1229. cglobal bswap32_buf, 3,4,5
  1230. mov r3, r1
  1231. and r3, 15
  1232. jz .start_align
  1233. BSWAP_LOOPS_SSE2 u
  1234. jmp .left
  1235. .start_align:
  1236. BSWAP_LOOPS_SSE2 a
  1237. .left:
  1238. and r2, 3
  1239. jz .end
  1240. .loop2:
  1241. mov r3d, [r1]
  1242. bswap r3d
  1243. mov [r0], r3d
  1244. add r1, 4
  1245. add r0, 4
  1246. dec r2
  1247. jnz .loop2
  1248. .end
  1249. RET
; %1 = aligned/unaligned ('a'/'u') load suffix.
; SSSE3 variant: byteswap via a single pshufb per vector.  On entry r1 = src,
; r0 = dst, r2 = dword count, and m2 must already hold the bswap32 shuffle
; mask (loaded by the instantiating function).  Jumps to the caller-provided
; '.left2' label for the <4-dword tail.
%macro BSWAP_LOOPS_SSSE3 1
    mov      r3, r2
    sar      r2, 3             ; r2 = number of 8-dword iterations
    jz .left4_%1
.loop8_%1:
    mov%1    m0, [r1 + 0]
    mov%1    m1, [r1 + 16]
    pshufb   m0, m2            ; per-byte shuffle does the whole swap
    pshufb   m1, m2
    mova     [r0 + 0], m0
    mova     [r0 + 16], m1
    add      r0, 32
    add      r1, 32
    dec      r2
    jnz .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4             ; one more 4-dword vector left?
    jz .left2                  ; label defined by the instantiating function
    mov%1    m0, [r1]
    pshufb   m0, m2
    mova     [r0], m0
    add      r1, 16
    add      r0, 16
%endmacro
INIT_XMM ssse3
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
; Byteswap w dwords from src to dst (SSSE3 pshufb version).
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]  ; shuffle mask used by BSWAP_LOOPS_SSSE3
    and      r3, 15            ; is src 16-byte aligned?
    jz .start_align
    BSWAP_LOOPS_SSSE3 u        ; unaligned loads
    jmp .left2
.start_align:
    BSWAP_LOOPS_SSSE3 a        ; aligned loads
.left2:
    mov      r3, r2
    and      r2, 2             ; 2 or 3 dwords left?
    jz .left1
    movq     m0, [r1]          ; swap 2 dwords at once; pshufb shuffles the
    pshufb   m0, m2            ; full register but only the low 8 bytes are
    movq     [r0], m0          ; stored back
    add      r1, 8
    add      r0, 8
.left1:
    and      r3, 1             ; final odd dword?
    jz .end
    mov      r2d, [r1]
    bswap    r2d               ; scalar byteswap
    mov      [r0], r2d
.end:
    RET