;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
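; Reference (C-level behaviour, for illustration): a plain dot product with
; 32-bit accumulation, assuming order is a multiple of the unroll width the
; loop below requires:
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum;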
cglobal scalarproduct_int16_%1, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
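; Reference (C-level behaviour, for illustration): returns the dot product of
; the old v1 with v2 while updating v1 in place, roughly:
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++) {
;         sum  += v1[i] * v2[i];
;         v1[i] = v1[i] + mul * v3[i];   // 16-bit wraparound arithmetic
;     }
;     return sum;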
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
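; Reference (C-level behaviour, for illustration): a symmetric window applied
; from both ends, with rounding in the bit-exact variants, roughly:
;     for (i = 0; i < len / 2; i++) {
;         output[i]           = (input[i]           * window[i] + (1 << 14)) >> 15;
;         output[len - 1 - i] = (input[len - 1 - i] * window[i] + (1 << 14)) >> 15;
;     }
; The non-bit-exact mmxext/sse2 variants below drop the (1 << 14) rounding term.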
%macro REVERSE_WORDS_MMXEXT 1-2
    pshufw %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
    pshufb %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
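; In scalar terms (illustration only), for 16-bit a and b:
;     pmulhw -> (a * b) >> 16
;     pmullw -> (a * b) & 0xFFFF
; so (a * b) >> 15 == (((a * b) >> 16) << 1) | (((a * b) & 0xFFFF) >> 15),
; which is what the shift/or sequence below reconstructs. The SSSE3 variant
; further down uses pmulhrsw, which does the rounded ((a*b) + (1<<14)) >> 15
; in a single instruction.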
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
    mova %3, %1
    pmulhw %1, %2
    pmullw %3, %2
    psrlw %3, 15
    psllw %1, 1
    por %1, %3
%endmacro

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
    pmulhrsw %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
    lea offset2q, [offsetq-mmsize]
%if %2
    mova m5, [pd_16384]
%elifidn %1, ssse3
    mova m5, [pb_revwords]
ALIGN 16
%endif
.loop:
%if %2
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%elif %3
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize
    sub offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1

; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
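; Reference (C-level behaviour, for illustration), matching FFmpeg's generic
; median predictor:
;     l = *left; tl = *left_top;
;     for (i = 0; i < w; i++) {
;         pred = mid_pred(l, top[i], (l + top[i] - tl) & 0xFF);
;         l    = dst[i] = (pred + diff[i]) & 0xFF;
;         tl   = top[i];
;     }
;     *left = l; *left_top = tl;
; The loop below computes the same median with pmaxub/pminub, 8 pixels at a time.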
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
%if %2
    mova m1, [srcq+wq]
%else
    movu m1, [srcq+wq]
%endif
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
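; Reference (C-level behaviour, for illustration): a running byte-wise prefix
; sum seeded with 'left', whose final value is returned:
;     for (i = 0; i < w; i++)
;         left = dst[i] = (src[i] + left) & 0xFF;
;     return left;
; ADD_HFYU_LEFT_LOOP above computes this prefix sum per vector with a few
; shifted adds (psllw/pshufb + paddb) instead of serially.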
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz .src_unaligned
    test dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
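; Reference (C-level behaviour, for illustration):
;     float sum = 0.0f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;
; The SIMD loop keeps four partial sums and folds them at the end, so rounding
; can differ slightly from the serial C order.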
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%if ARCH_X86_64 == 0
    movss r0m, xmm0
    fld dword r0m
%endif
    RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;   if (w > 22) {
;     jump to the slow loop functions
;   } else {
;     jump to the fast loop functions
;   }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
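;
; Conceptually (illustration only), the core replicates edge pixels:
;     rows [0, start_y)      are copies of the first available source row (top extend),
;     rows [start_y, end_y)  are copied straight from the source (body),
;     rows [end_y, block_h)  are copies of the last body row (bottom extend),
; and within each row, bytes [0, start_x) replicate the pixel at start_x while
; bytes [end_x, block_w) replicate the pixel at end_x - 1.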
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
    mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif

; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%if ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%if ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:

; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%if ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r8
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg

; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%if ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
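; Worked example (illustration only): READ_NUM_BYTES body, 22 on x86-64 expands
; to one 16-byte movups into xmm0, one 4-byte movd into mm0 and one 2-byte mov
; into valw (16 + 4 + 2 = 22 bytes); WRITE_NUM_BYTES emits the mirror-image
; stores.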
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64
%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64
%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
%if ARCH_X86_64
    test r3, r3 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES top, %%n ; write bytes
    add r0, r2 ; dst += linesize
%if ARCH_X86_64
    dec r3d
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
    READ_NUM_BYTES body, %%n ; read bytes
    WRITE_NUM_BYTES body, %%n ; write bytes
    add r0, r2 ; dst += linesize
    add r1, r2 ; src += linesize
    dec r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
; copy bottom pixels
    test r5, r5 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n ; goto end
    sub r1, r2 ; src -= linesize
    READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n ; write bytes
    add r0, r2 ; dst += linesize
    dec r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
%macro READ_V_PIXEL 2
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%if cpuflag(mmx2)
    pshufw mm0, mm0, 0
%else ; mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%endif ; sse
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off], valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+r1] ; read pixels
    WRITE_V_PIXEL %%n, r0 ; write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
    sub r3, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
    dec r8
%else ; ARCH_X86_32
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
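; Illustration only: per row, V_COPY_ROW behaves like a descending-chunk memcpy
; of w bytes: 16-byte blocks while they fit (sse), then at most one 8-, 4-, 2-
; and 1-byte copy to cover the remainder, always using the widest register
; available at each step.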
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax, mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
    push r8 ; save old value of block_h
    test r3, r3
%define cnt_reg r8
    jz .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif
.do_body_copy:
    V_COPY_ROW body, r4
%if ARCH_X86_64
    pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%if ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%if ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
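; Reference (C-level behaviour, for illustration):
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip(src[i], min, max);
; i.e. each 32-bit sample is clamped to [min, max]; the variants below differ
; only in how many vectors they process per loop iteration.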
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4
    SPLATD m5
.loop:
%assign %%i 1
%rep %2
    mova m0, [srcq+mmsize*0*%%i]
    mova m1, [srcq+mmsize*1*%%i]
    mova m2, [srcq+mmsize*2*%%i]
    mova m3, [srcq+mmsize*3*%%i]
%if %3
    mova m7, [srcq+mmsize*4*%%i]
    mova m8, [srcq+mmsize*5*%%i]
    mova m9, [srcq+mmsize*6*%%i]
    mova m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD m0, m4, m5, m6
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7, m4, m5, m6
    CLIPD m8, m4, m5, m6
    CLIPD m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*0*%%i], m0
    mova [dstq+mmsize*1*%%i], m1
    mova [dstq+mmsize*2*%%i], m2
    mova [dstq+mmsize*3*%%i], m3
%if %3
    mova [dstq+mmsize*4*%%i], m7
    mova [dstq+mmsize*5*%%i], m8
    mova [dstq+mmsize*6*%%i], m9
    mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
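; Reference (C-level behaviour, for illustration):
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];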
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps m0, m0, m0, q0123
    vmovaps xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps m1, m1, m1, q0123
%else
    mova m0, [src1q]
    mova m1, [src1q + mmsize]
    shufps m0, m0, q0123
    shufps m1, m1, q0123
%endif
    mulps m0, m0, [src0q + lenq + mmsize]
    mulps m1, m1, [src0q + lenq]
    mova [dstq + lenq + mmsize], m0
    mova [dstq + lenq], m1
    add src1q, 2*mmsize
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
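; Reference (C-level behaviour, for illustration):
;     for (i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];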
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,2, dst, src0, src1, src2, len
    lea lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova m0, [src0q + lenq]
    mova m1, [src0q + lenq + mmsize]
    mulps m0, m0, [src1q + lenq]
    mulps m1, m1, [src1q + lenq + mmsize]
    addps m0, m0, [src2q + lenq]
    addps m1, m1, [src2q + lenq + mmsize]
    mova [dstq + lenq], m0
    mova [dstq + lenq + mmsize], m1
    sub lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX
INIT_YMM avx
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
;                                      const float *src1, int len);
;-----------------------------------------------------------------------------
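; Reference (C-level behaviour, for illustration):
;     for (i = 0; i < len; i++) {
;         float a = src0[i], b = src1[i];
;         dst[2*i    ] = a + b;
;         dst[2*i + 1] = a - b;
;     }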
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%if ARCH_X86_64
    movsxd lenq, lend
%endif
    test lenq, lenq
    jz .end
    shl lenq, 2
    lea src0q, [src0q + lenq]
    lea src1q, [src1q + lenq]
    lea dstq, [dstq + 2*lenq]
    neg lenq
.loop:
    mova m0, [src0q + lenq]
    mova m1, [src1q + lenq]
    subps m2, m0, m1
    addps m0, m0, m1
    unpcklps m1, m0, m2
    unpckhps m0, m0, m2
%if cpuflag(avx)
    vextractf128 [dstq + 2*lenq], m1, 0
    vextractf128 [dstq + 2*lenq + 16], m0, 0
    vextractf128 [dstq + 2*lenq + 32], m1, 1
    vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
    mova [dstq + 2*lenq], m1
    mova [dstq + 2*lenq + mmsize], m0
%endif
    add lenq, mmsize
    jl .loop
.end:
    REP_RET
%endmacro

INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
%if HAVE_AVX
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
%endif

INIT_XMM sse2
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova m2, m0
    mova m3, m1
    psllw m0, 8
    psllw m1, 8
    psrlw m2, 8
    psrlw m3, 8
    por m2, m0
    por m3, m1
    mova [r0 + 0], m2
    mova [r0 + 16], m3
    add r1, 32
    add r0, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left
    mov%1 m0, [r1]
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova m2, m0
    psllw m0, 8
    psrlw m2, 8
    por m2, m0
    mova [r0], m2
    add r1, 16
    add r0, 16
%endmacro

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
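; Reference (C-level behaviour, for illustration): byte-swap each 32-bit word,
;     for (i = 0; i < w; i++)
;         dst[i] = av_bswap32(src[i]);
; The SSE2 version below builds the swap from pshuflw/pshufhw plus byte shifts;
; the SSSE3 version further down uses a single pshufb with the pb_bswap32 mask.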
cglobal bswap32_buf, 3,4,5
    mov r3, r1
    and r3, 15
    jz .start_align
    BSWAP_LOOPS_SSE2 u
    jmp .left
.start_align:
    BSWAP_LOOPS_SSE2 a
.left:
    and r2, 3
    jz .end
.loop2:
    mov r3d, [r1]
    bswap r3d
    mov [r0], r3d
    add r1, 4
    add r0, 4
    dec r2
    jnz .loop2
.end:
    RET

; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
    pshufb m0, m2
    pshufb m1, m2
    mova [r0 + 0], m0
    mova [r0 + 16], m1
    add r0, 32
    add r1, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left2
    mov%1 m0, [r1]
    pshufb m0, m2
    mova [r0], m0
    add r1, 16
    add r0, 16
%endmacro

INIT_XMM ssse3
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
    mov r3, r1
    mova m2, [pb_bswap32]
    and r3, 15
    jz .start_align
    BSWAP_LOOPS_SSSE3 u
    jmp .left2
.start_align:
    BSWAP_LOOPS_SSSE3 a
.left2:
    mov r3, r2
    and r2, 2
    jz .left1
    movq m0, [r1]
    pshufb m0, m2
    movq [r0], m0
    add r1, 8
    add r0, 8
.left1:
    and r3, 1
    jz .end
    mov r2d, [r1]
    bswap r2d
    mov [r0], r2d
.end:
    RET