;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "x86inc.asm"
%include "x86util.asm"
SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
SECTION_TEXT
%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
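; Roughly equivalent C, as a hedged reference sketch (the SIMD code below
; applies the shift to packed partial sums rather than to each product, so
; intermediate rounding can differ slightly from a strictly scalar version):
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum >> shift;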
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
shl orderq, 1
add v1q, orderq
add v2q, orderq
neg orderq
movd m3, shiftm
pxor m2, m2
.loop:
movu m0, [v1q + orderq]
movu m1, [v1q + orderq + mmsize]
pmaddwd m0, [v2q + orderq]
pmaddwd m1, [v2q + orderq + mmsize]
paddd m2, m0
paddd m2, m1
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m2
paddd m2, m0
psrad m2, m3
pshuflw m0, m2, 0x4e
%else
psrad m2, m3
pshufw m0, m2, 0x4e
%endif
paddd m2, m0
movd eax, m2
RET
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
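; Roughly equivalent C, as a hedged reference sketch (v1 is updated with
; 16-bit wraparound arithmetic, matching the pmullw/paddw pair below):
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++) {
;         sum  += v1[i] * v2[i];
;         v1[i] = (int16_t)(v1[i] + mul * v3[i]);
;     }
;     return sum;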
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
%if mmsize == 16
pshuflw m7, m7, 0
punpcklqdq m7, m7
%else
pshufw m7, m7, 0
%endif
pxor m6, m6
add v1q, orderq
add v2q, orderq
add v3q, orderq
neg orderq
.loop:
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
pmaddwd m0, m4
pmaddwd m1, m5
pmullw m2, m7
pmullw m3, m7
paddd m6, m0
paddd m6, m1
paddw m2, m4
paddw m3, m5
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
%else
pshufw m0, m6, 0x4e
%endif
paddd m6, m0
movd eax, m6
RET
%endmacro
INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7
paddw m2, t0
paddw m3, t1
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
jmp .end
%endif
%endmacro
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
pshuflw m7, m7, 0
punpcklqdq m7, m7
pxor m6, m6
mov r4d, v2d
and r4d, 15
and v2q, ~15
and v3q, ~15
mova m4, [v2q + orderq]
mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
cmp r4d, 0
je .loop0
cmp r4d, 2
je .loop2
cmp r4d, 4
je .loop4
cmp r4d, 6
je .loop6
cmp r4d, 8
je .loop8
cmp r4d, 10
je .loop10
cmp r4d, 12
je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
paddd m6, m0
movd eax, m6
RET
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
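; Per-sample operation, as a hedged C sketch (the bit-exact paths and the
; ssse3 path include the (1 << 14) rounding term; the plain mmxext/sse2
; paths omit it, as noted in the comments inside the macro):
;     output[i] = (int16_t)((input[i] * window[i] + (1 << 14)) >> 15);
; The second half of the buffer is processed with the window reversed, as
; the loop comments below describe.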
%macro REVERSE_WORDS_MMXEXT 1-2
pshufw %1, %1, 0x1B
%endmacro
%macro REVERSE_WORDS_SSE2 1-2
pshuflw %1, %1, 0x1B
pshufhw %1, %1, 0x1B
pshufd %1, %1, 0x4E
%endmacro
%macro REVERSE_WORDS_SSSE3 2
pshufb %1, %2
%endmacro
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
psrlw %3, 15
psllw %1, 1
por %1, %3
%endmacro
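; The above relies on this identity (hedged sketch, with p = dst * src taken
; as the full 32-bit product):
;     (p >> 15) & 0xffff == (((p >> 16) << 1) | ((p & 0xffff) >> 15)) & 0xffff
; pmulhw supplies p >> 16 and pmullw supplies p & 0xffff, so the shift/or
; pair reassembles the >> 15 result without a 32-bit intermediate.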
; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
pmulhrsw %1, %2
%endmacro
%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
lea offset2q, [offsetq-mmsize]
%if %2
mova m5, [pd_16384]
%elifidn %1, ssse3
mova m5, [pb_revwords]
ALIGN 16
%endif
.loop:
%if %2
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offset2q], m0
REVERSE_WORDS m3
mova m4, [ inputq+offsetq]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
%elif %3
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m1, m0
REVERSE_WORDS m0, m5
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
MUL16FIXED m1, m0, m3
REVERSE_WORDS m0
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%endif
add offsetd, mmsize
sub offset2d, mmsize
jae .loop
REP_RET
%endmacro
INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1
; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
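; Roughly equivalent C, as a hedged reference sketch (median3() is a
; hypothetical helper returning the median of its three arguments;
; arithmetic is per-byte with wraparound, as in the paddb/psubb below):
;     l = *left; tl = *left_top;
;     for (i = 0; i < w; i++) {
;         l      = (uint8_t)(median3(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i]);
;         tl     = top[i];
;         dst[i] = l;
;     }
;     *left = dst[w - 1]; *left_top = top[w - 1];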
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
psllq mm2, 8
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
psubb mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
movq mm4, [topq+wq]
movq mm0, mm4
psllq mm4, 8
por mm4, mm1
movq mm1, mm0 ; t
psubb mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
%rep 8
movq mm4, mm0
paddb mm4, mm3 ; t-tl+l
movq mm5, mm3
pmaxub mm3, mm1
pminub mm5, mm1
pminub mm3, mm4
pmaxub mm3, mm5 ; median
paddb mm3, mm2 ; +residual
%if i==0
movq mm7, mm3
psllq mm7, 56
%else
movq mm6, mm3
psrlq mm7, 8
psllq mm6, 56
por mm7, mm6
%endif
%if i<7
psrlq mm0, 8
psrlq mm1, 8
psrlq mm2, 8
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
add srcq, wq
add dstq, wq
neg wq
%%.loop:
mova m1, [srcq+wq]
mova m2, m1
psllw m1, 8
paddb m1, m2
mova m2, m1
pshufb m1, m3
paddb m1, m2
pshufb m0, m5
mova m2, m1
pshufb m1, m4
paddb m1, m2
%if mmsize == 16
mova m2, m1
pshufb m1, m6
paddb m1, m2
%endif
paddb m0, m1
%if %1
mova [dstq+wq], m0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
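; Roughly equivalent C, as a hedged reference sketch (a running byte-wise
; prefix sum; the final accumulator byte is the return value):
;     uint8_t acc = left;
;     for (i = 0; i < w; i++) {
;         acc    = (uint8_t)(acc + src[i]);
;         dst[i] = acc;
;     }
;     return acc;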
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_HFYU_LEFT_LOOP 1
INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
mova m5, [pb_f]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
jnz add_hfyu_left_prediction_ssse3.skip_prologue
test dstq, 15
jnz .unaligned
ADD_HFYU_LEFT_LOOP 1
.unaligned:
ADD_HFYU_LEFT_LOOP 0
; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
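; Roughly equivalent C, as a hedged reference sketch (the SIMD version keeps
; four partial accumulators, so floating-point rounding may differ slightly):
;     float sum = 0.0f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;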
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
neg offsetq
shl offsetq, 2
sub v1q, offsetq
sub v2q, offsetq
xorps xmm0, xmm0
.loop:
movaps xmm1, [v1q+offsetq]
mulps xmm1, [v2q+offsetq]
addps xmm0, xmm1
add offsetq, 16
js .loop
movhlps xmm1, xmm0
addps xmm0, xmm1
movss xmm1, xmm0
shufps xmm0, xmm0, 1
addss xmm0, xmm1
%if ARCH_X86_64 == 0
movd r0m, xmm0
fld dword r0m
%endif
RET
; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
; x86_reg start_y, x86_reg end_y, x86_reg block_h,
; x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
; if (w > 22) {
; jump to the slow loop functions
; } else {
; jump to the fast loop functions
; }
; }
;
; ... and then the same for the left/right extend. See below for the loop
; function implementations. The fast ones are fixed-width, the slow ones are variable-width.
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core, 6, 7, 1
mov r11, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
mov r4, r4m ; end_y
mov r5, r5m ; block_h
%endif
; start with vertical extend (top/bottom) and body pixel copy
mov w_reg, r7m
sub w_reg, r6m ; w = end_x - start_x
sub r5, r4
%if ARCH_X86_64
sub r4, r3
%else
sub r4, dword r3m
%endif
cmp w_reg, 22
jg .slow_v_extend_loop
%if ARCH_X86_32
mov r2, r2m ; linesize
%endif
sal w_reg, 7 ; w * 128
%ifdef PIC
lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
add w_reg, rax
%else
lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:
; horizontal extend (left/right)
mov w_reg, r6m ; start_x
sub r0, w_reg
%if ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
mov r5, r11
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m
%endif
test w_reg, w_reg
jz .right_extend
cmp w_reg, 22
jg .slow_left_extend_loop
mov r1, w_reg
dec w_reg
; FIXME we could special-case size == 1 here if that makes any speed difference, test me
sar w_reg, 1
sal w_reg, 6
; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
add w_reg, rax
%else
lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
mov r0, r0m
mov r5, r5m
%endif
mov w_reg, r7m ; end_x
mov r1, r8m ; block_w
mov r4, r1
sub r1, w_reg
jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
cmp r1, 22
jg .slow_right_extend_loop
dec r1
; FIXME we could special-case size == 1 here if that makes any speed difference, test me
sar r1, 1
sal r1, 6
%ifdef PIC
lea rax, [.emuedge_extend_right_2]
add r1, rax
%else
lea r1, [.emuedge_extend_right_2+r1]
%endif
call r1
.h_extend_end:
RET
%if ARCH_X86_64
%define vall al
%define valh ah
%define valw ax
%define valw2 r10w
%define valw3 r3w
%if WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall bl
%define valh bh
%define valw bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 15 == 8) fills the last 8 bytes into rax
; - else if (%2 & 8) fills 8 bytes into mm0
; - if (%2 & 7 == 4) fills the last 4 bytes into rax
; - else if (%2 & 4) fills 4 bytes into mm0-1
; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
; (note that we're using r3 for body/bottom because it's a shorter
; opcode, and then the loop fits in 128 bytes)
; - else fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
; - if (%2 & 7 == 4) fills 4 bytes into ebx
; - else if (%2 & 4) fills 4 bytes into mm0-7
; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
; - else fills remaining bytes into ebx
; data is written out in the same way
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx 0 ; mmx register idx
%assign %%sxidx 0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%src_off) == 8
mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64
%rep (%2-%%src_off)/8
movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%src_off) == 4
mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
mov valw2, [r1+%%src_off]
%elifidn %1, body
mov valw3, [r1+%%src_off]
%elifidn %1, bottom
mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx 0 ; mmx register idx
%assign %%dxidx 0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%endrep ; %2/16
%endif
%if ARCH_X86_64
%if (%2-%%dst_off) == 8
mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64
%rep (%2-%%dst_off)/8
movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
mov [r0+%%dst_off], valw2
%elifidn %1, body
mov [r0+%%dst_off], valw3
%elifidn %1, bottom
mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
%if ARCH_X86_64
test r3 , r3 ; if (!start_y)
jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
cmp dword r3m, 0
je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES top, %%n ; write bytes
add r0 , r2 ; dst += linesize
%if ARCH_X86_64
dec r3d
%else ; ARCH_X86_32
dec dword r3m
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
READ_NUM_BYTES body, %%n ; read bytes
WRITE_NUM_BYTES body, %%n ; write bytes
add r0 , r2 ; dst += linesize
add r1 , r2 ; src += linesize
dec r4d
jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
; copy bottom pixels
test r5 , r5 ; if (!block_h)
jz .emuedge_v_extend_end_ %+ %%n ; goto end
sub r1 , r2 ; src -= linesize
READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES bottom, %%n ; write bytes
add r0 , r2 ; dst += linesize
dec r5d
jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just separated into left/right because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
%macro READ_V_PIXEL 2
mov vall, %2
mov valh, vall
%if %1 >= 8
movd mm0, vald
%if cpuflag(mmx2)
pshufw mm0, mm0, 0
%else ; mmx
punpcklwd mm0, mm0
punpckldq mm0, mm0
%endif ; mmx2/mmx
%endif ; %1 >= 8
%endmacro
%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
movd [%2+%%dst_off], mm0
%else ; %1 < 8
mov [%2+%%dst_off] , valw
mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro
; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+r1] ; read pixels
WRITE_V_PIXEL %%n, r0 ; write pixels
dec r5
jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND
; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
dec r11
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
dec r5
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND
; below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large numbers of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. Using xmm registers could be considered on
; x86-64 as well, but that has not been optimized as much (i.e. FIXME)
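; Per-line behaviour, as a hedged C sketch (copy8/copy4/copy2/copy1 are
; hypothetical helpers standing in for the movq/movups/mov transfers issued
; by V_COPY_NPX below; the sse path additionally uses 16-byte chunks first):
;     x = 0;
;     while (x + 8 <= w) { copy8(dst + x, src + x); x += 8; }
;     if (w & 4) { copy4(dst + x, src + x); x += 4; }
;     if (w & 2) { copy2(dst + x, src + x); x += 2; }
;     if (w & 1) { copy1(dst + x, src + x); x += 1; }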
%macro V_COPY_NPX 4-5
%if %0 == 4
test w_reg, %4
jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
%3 %2, [r1+cnt_reg]
%3 [r0+cnt_reg], %2
add cnt_reg, %4
%if %0 == 5
sub w_reg, %4
test w_reg, %5
jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro
%macro V_COPY_ROW 2
%ifidn %1, bottom
sub r1, linesize
%endif
.%1_copy_loop:
xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
V_COPY_NPX %1, rax , mov, 8
%else ; ARCH_X86_32
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
V_COPY_NPX %1, vald, mov, 4
V_COPY_NPX %1, valw, mov, 2
V_COPY_NPX %1, vall, mov, 1
mov w_reg, cnt_reg
%ifidn %1, body
add r1, linesize
%endif
add r0, linesize
dec %2
jnz .%1_copy_loop
%endmacro
%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
push r11 ; save old value of block_h
test r3, r3
%define cnt_reg r11
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3
%else
cmp dword r3m, 0
%define cnt_reg r2
je .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, dword r3m
%endif
.do_body_copy:
V_COPY_ROW body, r4
%if ARCH_X86_64
pop r11 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5
%if ARCH_X86_64
jz .v_extend_end
%else
jz .skip_bottom_extend
%endif
V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
mov r2, r2m
%endif
jmp .v_extend_end
%endmacro
%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
movq [r0+r4-8], mm0
add r4, 8
cmp r4, w_reg
jle .left_extend_8px_loop
sub r4, 8
cmp r4, w_reg
jge .left_extend_loop_end
.left_extend_2px_loop:
mov [r0+r4], valw
add r4, 2
cmp r4, w_reg
jl .left_extend_2px_loop
.left_extend_loop_end:
dec r5
jnz .slow_left_extend_loop
%if ARCH_X86_32
mov r2, r2m
%endif
jmp .right_extend
%endmacro
%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
lea r1, [r4-8]
sub buf_reg, linesize
READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
movq [buf_reg+r1], mm0
sub r1, 8
cmp r1, w_reg
jge .right_extend_8px_loop
add r1, 8
cmp r1, w_reg
je .right_extend_loop_end
.right_extend_2px_loop:
sub r1, 2
mov [buf_reg+r1], valw
cmp r1, w_reg
jg .right_extend_2px_loop
.right_extend_loop_end:
dec bh_reg
jnz .slow_right_extend_loop
jmp .h_extend_end
%endmacro
%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro
emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
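; Roughly equivalent C, as a hedged reference sketch (each value is clamped
; to the inclusive [min, max] range):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];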
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
cvtsi2ss m4, minm
cvtsi2ss m5, maxm
%else
movd m4, minm
movd m5, maxm
%endif
SPLATD m4
SPLATD m5
.loop:
%assign %%i 1
%rep %2
mova m0, [srcq+mmsize*0*%%i]
mova m1, [srcq+mmsize*1*%%i]
mova m2, [srcq+mmsize*2*%%i]
mova m3, [srcq+mmsize*3*%%i]
%if %3
mova m7, [srcq+mmsize*4*%%i]
mova m8, [srcq+mmsize*5*%%i]
mova m9, [srcq+mmsize*6*%%i]
mova m10, [srcq+mmsize*7*%%i]
%endif
CLIPD m0, m4, m5, m6
CLIPD m1, m4, m5, m6
CLIPD m2, m4, m5, m6
CLIPD m3, m4, m5, m6
%if %3
CLIPD m7, m4, m5, m6
CLIPD m8, m4, m5, m6
CLIPD m9, m4, m5, m6
CLIPD m10, m4, m5, m6
%endif
mova [dstq+mmsize*0*%%i], m0
mova [dstq+mmsize*1*%%i], m1
mova [dstq+mmsize*2*%%i], m2
mova [dstq+mmsize*3*%%i], m3
%if %3
mova [dstq+mmsize*4*%%i], m7
mova [dstq+mmsize*5*%%i], m8
mova [dstq+mmsize*6*%%i], m9
mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
add srcq, mmsize*4*(%2+%3)
add dstq, mmsize*4*(%2+%3)
sub lend, mmsize*(%2+%3)
jg .loop
REP_RET
%endmacro
INIT_MMX mmx
%define SPLATD SPLATD_MMX
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
%define SPLATD SPLATD_SSE2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
;-----------------------------------------------------------------------------
; void ff_butterflies_float_interleave(float *dst, const float *src0,
; const float *src1, int len);
;-----------------------------------------------------------------------------
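; Roughly equivalent C, as a hedged reference sketch (sum and difference of
; each pair are written to the destination interleaved):
;     for (i = 0; i < len; i++) {
;         float a = src0[i], b = src1[i];
;         dst[2 * i    ] = a + b;
;         dst[2 * i + 1] = a - b;
;     }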
%macro BUTTERFLIES_FLOAT_INTERLEAVE 0
cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
%if ARCH_X86_64
movsxd lenq, lend
%endif
test lenq, lenq
jz .end
shl lenq, 2
lea src0q, [src0q + lenq]
lea src1q, [src1q + lenq]
lea dstq, [ dstq + 2*lenq]
neg lenq
.loop:
mova m0, [src0q + lenq]
mova m1, [src1q + lenq]
subps m2, m0, m1
addps m0, m0, m1
unpcklps m1, m0, m2
unpckhps m0, m0, m2
%if cpuflag(avx)
vextractf128 [dstq + 2*lenq ], m1, 0
vextractf128 [dstq + 2*lenq + 16], m0, 0
vextractf128 [dstq + 2*lenq + 32], m1, 1
vextractf128 [dstq + 2*lenq + 48], m0, 1
%else
mova [dstq + 2*lenq ], m1
mova [dstq + 2*lenq + mmsize], m0
%endif
add lenq, mmsize
jl .loop
%if mmsize == 32
vzeroupper
RET
%endif
.end:
REP_RET
%endmacro
INIT_XMM sse
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_YMM avx
BUTTERFLIES_FLOAT_INTERLEAVE
INIT_XMM sse2
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSE2 1
mov r3, r2
sar r2, 3
jz .left4_%1
.loop8_%1:
mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + 16]
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
pshufhw m0, m0, 10110001b
pshufhw m1, m1, 10110001b
mova m2, m0
mova m3, m1
psllw m0, 8
psllw m1, 8
psrlw m2, 8
psrlw m3, 8
por m2, m0
por m3, m1
mova [r0 + 0], m2
mova [r0 + 16], m3
add r1, 32
add r0, 32
dec r2
jnz .loop8_%1
.left4_%1:
mov r2, r3
and r3, 4
jz .left
mov%1 m0, [r1]
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
mova m2, m0
psllw m0, 8
psrlw m2, 8
por m2, m0
mova [r0], m2
add r1, 16
add r0, 16
%endmacro
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
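; Roughly equivalent C, as a hedged reference sketch (each 32-bit word has
; its byte order reversed):
;     for (i = 0; i < w; i++)
;         dst[i] = (src[i] >> 24) | ((src[i] >> 8) & 0xff00) |
;                  ((src[i] << 8) & 0xff0000) | (src[i] << 24);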
cglobal bswap32_buf, 3,4,5
mov r3, r1
and r3, 15
jz .start_align
BSWAP_LOOPS_SSE2 u
jmp .left
.start_align:
BSWAP_LOOPS_SSE2 a
.left:
and r2, 3
jz .end
.loop2:
mov r3d, [r1]
bswap r3d
mov [r0], r3d
add r1, 4
add r0, 4
dec r2
jnz .loop2
.end:
RET
; %1 = aligned/unaligned
%macro BSWAP_LOOPS_SSSE3 1
mov r3, r2
sar r2, 3
jz .left4_%1
.loop8_%1:
mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + 16]
pshufb m0, m2
pshufb m1, m2
mova [r0 + 0], m0
mova [r0 + 16], m1
add r0, 32
add r1, 32
dec r2
jnz .loop8_%1
.left4_%1:
mov r2, r3
and r3, 4
jz .left2
mov%1 m0, [r1]
pshufb m0, m2
mova [r0], m0
add r1, 16
add r0, 16
%endmacro
INIT_XMM ssse3
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
cglobal bswap32_buf, 3,4,3
mov r3, r1
mova m2, [pb_bswap32]
and r3, 15
jz .start_align
BSWAP_LOOPS_SSSE3 u
jmp .left2
.start_align:
BSWAP_LOOPS_SSSE3 a
.left2:
mov r3, r2
and r2, 2
jz .left1
movq m0, [r1]
pshufb m0, m2
movq [r0], m0
add r1, 8
add r0, 8
.left1:
and r3, 1
jz .end
mov r2d, [r1]
bswap r2d
mov [r0], r2d
.end:
RET