  1. ;******************************************************************************
  2. ;* MMX optimized DSP utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86inc.asm"
  22. %include "x86util.asm"
  23. SECTION_RODATA
  24. pb_f: times 16 db 15
  25. pb_zzzzzzzz77777777: times 8 db -1
  26. pb_7: times 8 db 7
  27. pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
  28. pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
  29. pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
  30. pd_16384: times 4 dd 16384
  31. SECTION_TEXT
  32. %macro SCALARPRODUCT 1
  33. ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
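; Rough reference semantics (hedged sketch, not taken from the C code): a dot
; product of two int16 vectors; note the SIMD code applies the shift to packed
; partial sums rather than to each individual product:
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum >> shift;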
  34. cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
  35. shl orderq, 1
  36. add v1q, orderq
  37. add v2q, orderq
  38. neg orderq
  39. movd m3, shiftm
  40. pxor m2, m2
  41. .loop:
  42. movu m0, [v1q + orderq]
  43. movu m1, [v1q + orderq + mmsize]
  44. pmaddwd m0, [v2q + orderq]
  45. pmaddwd m1, [v2q + orderq + mmsize]
  46. paddd m2, m0
  47. paddd m2, m1
  48. add orderq, mmsize*2
  49. jl .loop
  50. %if mmsize == 16
  51. movhlps m0, m2
  52. paddd m2, m0
  53. psrad m2, m3
  54. pshuflw m0, m2, 0x4e
  55. %else
  56. psrad m2, m3
  57. pshufw m0, m2, 0x4e
  58. %endif
  59. paddd m2, m0
  60. movd eax, m2
  61. RET
  62. ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
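; Rough reference semantics (hedged sketch): accumulate the v1.v2 dot product
; while updating v1 in place with a multiply-accumulate of v3:
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];   (int16 wraparound, matching the paddw below)
;     }
;     return sum;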
  63. cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
  64. shl orderq, 1
  65. movd m7, mulm
  66. %if mmsize == 16
  67. pshuflw m7, m7, 0
  68. punpcklqdq m7, m7
  69. %else
  70. pshufw m7, m7, 0
  71. %endif
  72. pxor m6, m6
  73. add v1q, orderq
  74. add v2q, orderq
  75. add v3q, orderq
  76. neg orderq
  77. .loop:
  78. movu m0, [v2q + orderq]
  79. movu m1, [v2q + orderq + mmsize]
  80. mova m4, [v1q + orderq]
  81. mova m5, [v1q + orderq + mmsize]
  82. movu m2, [v3q + orderq]
  83. movu m3, [v3q + orderq + mmsize]
  84. pmaddwd m0, m4
  85. pmaddwd m1, m5
  86. pmullw m2, m7
  87. pmullw m3, m7
  88. paddd m6, m0
  89. paddd m6, m1
  90. paddw m2, m4
  91. paddw m3, m5
  92. mova [v1q + orderq], m2
  93. mova [v1q + orderq + mmsize], m3
  94. add orderq, mmsize*2
  95. jl .loop
  96. %if mmsize == 16
  97. movhlps m0, m6
  98. paddd m6, m0
  99. pshuflw m0, m6, 0x4e
  100. %else
  101. pshufw m0, m6, 0x4e
  102. %endif
  103. paddd m6, m0
  104. movd eax, m6
  105. RET
  106. %endmacro
  107. INIT_MMX
  108. SCALARPRODUCT mmx2
  109. INIT_XMM
  110. SCALARPRODUCT sse2
  111. %macro SCALARPRODUCT_LOOP 1
  112. align 16
  113. .loop%1:
  114. sub orderq, mmsize*2
  115. %if %1
  116. mova m1, m4
  117. mova m4, [v2q + orderq]
  118. mova m0, [v2q + orderq + mmsize]
  119. palignr m1, m0, %1
  120. palignr m0, m4, %1
  121. mova m3, m5
  122. mova m5, [v3q + orderq]
  123. mova m2, [v3q + orderq + mmsize]
  124. palignr m3, m2, %1
  125. palignr m2, m5, %1
  126. %else
  127. mova m0, [v2q + orderq]
  128. mova m1, [v2q + orderq + mmsize]
  129. mova m2, [v3q + orderq]
  130. mova m3, [v3q + orderq + mmsize]
  131. %endif
  132. %define t0 [v1q + orderq]
  133. %define t1 [v1q + orderq + mmsize]
  134. %ifdef ARCH_X86_64
  135. mova m8, t0
  136. mova m9, t1
  137. %define t0 m8
  138. %define t1 m9
  139. %endif
  140. pmaddwd m0, t0
  141. pmaddwd m1, t1
  142. pmullw m2, m7
  143. pmullw m3, m7
  144. paddw m2, t0
  145. paddw m3, t1
  146. paddd m6, m0
  147. paddd m6, m1
  148. mova [v1q + orderq], m2
  149. mova [v1q + orderq + mmsize], m3
  150. jg .loop%1
  151. %if %1
  152. jmp .end
  153. %endif
  154. %endmacro
  155. ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
  156. cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
  157. shl orderq, 1
  158. movd m7, mulm
  159. pshuflw m7, m7, 0
  160. punpcklqdq m7, m7
  161. pxor m6, m6
  162. mov r4d, v2d
  163. and r4d, 15
  164. and v2q, ~15
  165. and v3q, ~15
  166. mova m4, [v2q + orderq]
  167. mova m5, [v3q + orderq]
  168. ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
  169. cmp r4d, 0
  170. je .loop0
  171. cmp r4d, 2
  172. je .loop2
  173. cmp r4d, 4
  174. je .loop4
  175. cmp r4d, 6
  176. je .loop6
  177. cmp r4d, 8
  178. je .loop8
  179. cmp r4d, 10
  180. je .loop10
  181. cmp r4d, 12
  182. je .loop12
  183. SCALARPRODUCT_LOOP 14
  184. SCALARPRODUCT_LOOP 12
  185. SCALARPRODUCT_LOOP 10
  186. SCALARPRODUCT_LOOP 8
  187. SCALARPRODUCT_LOOP 6
  188. SCALARPRODUCT_LOOP 4
  189. SCALARPRODUCT_LOOP 2
  190. SCALARPRODUCT_LOOP 0
  191. .end:
  192. movhlps m0, m6
  193. paddd m6, m0
  194. pshuflw m0, m6, 0x4e
  195. paddd m6, m0
  196. movd eax, m6
  197. RET
  198. ;-----------------------------------------------------------------------------
  199. ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
  200. ; const int16_t *window, unsigned int len)
  201. ;-----------------------------------------------------------------------------
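; Rough per-sample behaviour (hedged sketch, inferred from the loop comments
; below; the exact indexing of the symmetric window is not restated here):
;     output[i] = (input[i] * window[i] + (1 << 14)) >> 15
; with the window coefficients applied in reverse order for the second half of
; the buffer (see REVERSE_WORDS), and without the (1 << 14) rounding term in
; the non-bit-exact mmxext/sse2 variants.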
  202. %macro REVERSE_WORDS_MMXEXT 1-2
  203. pshufw %1, %1, 0x1B
  204. %endmacro
  205. %macro REVERSE_WORDS_SSE2 1-2
  206. pshuflw %1, %1, 0x1B
  207. pshufhw %1, %1, 0x1B
  208. pshufd %1, %1, 0x4E
  209. %endmacro
  210. %macro REVERSE_WORDS_SSSE3 2
  211. pshufb %1, %2
  212. %endmacro
  213. ; dst = (dst * src) >> 15
  214. ; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
  215. ; in from the pmullw result.
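; As a hedged arithmetic note: with h = (a*b) >> 16 (pmulhw) and
; l = (a*b) & 0xffff (pmullw), the desired truncated result is
;     (a*b) >> 15  ==  (uint16_t)((h << 1) | (l >> 15))
; which is what the psllw/psrlw/por sequence below reconstructs.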
  216. %macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
  217. mova %3, %1
  218. pmulhw %1, %2
  219. pmullw %3, %2
  220. psrlw %3, 15
  221. psllw %1, 1
  222. por %1, %3
  223. %endmacro
  224. ; dst = ((dst * src) + (1<<14)) >> 15
  225. %macro MUL16FIXED_SSSE3 3 ; dst, src, unused
  226. pmulhrsw %1, %2
  227. %endmacro
  228. %macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
  229. cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
  230. lea offset2q, [offsetq-mmsize]
  231. %if %2
  232. mova m5, [pd_16384]
  233. %elifidn %1, ssse3
  234. mova m5, [pb_revwords]
  235. ALIGN 16
  236. %endif
  237. .loop:
  238. %if %2
  239. ; This version expands 16-bit to 32-bit, multiplies by the window,
  240. ; adds 16384 for rounding, right shifts 15, then repacks back to words to
  241. ; save to the output. The window is reversed for the second half.
  242. mova m3, [windowq+offset2q]
  243. mova m4, [ inputq+offset2q]
  244. pxor m0, m0
  245. punpcklwd m0, m3
  246. punpcklwd m1, m4
  247. pmaddwd m0, m1
  248. paddd m0, m5
  249. psrad m0, 15
  250. pxor m2, m2
  251. punpckhwd m2, m3
  252. punpckhwd m1, m4
  253. pmaddwd m2, m1
  254. paddd m2, m5
  255. psrad m2, 15
  256. packssdw m0, m2
  257. mova [outputq+offset2q], m0
  258. REVERSE_WORDS m3
  259. mova m4, [ inputq+offsetq]
  260. pxor m0, m0
  261. punpcklwd m0, m3
  262. punpcklwd m1, m4
  263. pmaddwd m0, m1
  264. paddd m0, m5
  265. psrad m0, 15
  266. pxor m2, m2
  267. punpckhwd m2, m3
  268. punpckhwd m1, m4
  269. pmaddwd m2, m1
  270. paddd m2, m5
  271. psrad m2, 15
  272. packssdw m0, m2
  273. mova [outputq+offsetq], m0
  274. %elif %3
  275. ; This version does the 16x16->16 multiplication in-place without expanding
  276. ; to 32-bit. The ssse3 version is bit-identical.
  277. mova m0, [windowq+offset2q]
  278. mova m1, [ inputq+offset2q]
  279. pmulhrsw m1, m0
  280. REVERSE_WORDS m0, m5
  281. pmulhrsw m0, [ inputq+offsetq ]
  282. mova [outputq+offset2q], m1
  283. mova [outputq+offsetq ], m0
  284. %else
  285. ; This version does the 16x16->16 multiplication in-place without expanding
  286. ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
  287. ; therefore are not bit-identical to the C version.
  288. mova m0, [windowq+offset2q]
  289. mova m1, [ inputq+offset2q]
  290. mova m2, [ inputq+offsetq ]
  291. MUL16FIXED m1, m0, m3
  292. REVERSE_WORDS m0
  293. MUL16FIXED m2, m0, m3
  294. mova [outputq+offset2q], m1
  295. mova [outputq+offsetq ], m2
  296. %endif
  297. add offsetd, mmsize
  298. sub offset2d, mmsize
  299. jae .loop
  300. REP_RET
  301. %endmacro
  302. INIT_MMX
  303. %define REVERSE_WORDS REVERSE_WORDS_MMXEXT
  304. %define MUL16FIXED MUL16FIXED_MMXEXT
  305. APPLY_WINDOW_INT16 mmxext, 0, 0
  306. APPLY_WINDOW_INT16 mmxext_ba, 1, 0
  307. INIT_XMM
  308. %define REVERSE_WORDS REVERSE_WORDS_SSE2
  309. APPLY_WINDOW_INT16 sse2, 0, 0
  310. APPLY_WINDOW_INT16 sse2_ba, 1, 0
  311. APPLY_WINDOW_INT16 ssse3_atom, 0, 1
  312. %define REVERSE_WORDS REVERSE_WORDS_SSSE3
  313. APPLY_WINDOW_INT16 ssse3, 0, 1
  314. ; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
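; Rough scalar equivalent (hedged sketch of the HuffYUV median predictor; all
; arithmetic is per byte, modulo 256):
;     l = *left; tl = *left_top;
;     for (int i = 0; i < w; i++) {
;         pred   = median(l, top[i], l + top[i] - tl);
;         dst[i] = pred + diff[i];
;         l      = dst[i];
;         tl     = top[i];
;     }
;     *left = l; *left_top = tl;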
  315. cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
  316. movq mm0, [topq]
  317. movq mm2, mm0
  318. movd mm4, [left_topq]
  319. psllq mm2, 8
  320. movq mm1, mm0
  321. por mm4, mm2
  322. movd mm3, [leftq]
  323. psubb mm0, mm4 ; t-tl
  324. add dstq, wq
  325. add topq, wq
  326. add diffq, wq
  327. neg wq
  328. jmp .skip
  329. .loop:
  330. movq mm4, [topq+wq]
  331. movq mm0, mm4
  332. psllq mm4, 8
  333. por mm4, mm1
  334. movq mm1, mm0 ; t
  335. psubb mm0, mm4 ; t-tl
  336. .skip:
  337. movq mm2, [diffq+wq]
  338. %assign i 0
  339. %rep 8
  340. movq mm4, mm0
  341. paddb mm4, mm3 ; t-tl+l
  342. movq mm5, mm3
  343. pmaxub mm3, mm1
  344. pminub mm5, mm1
  345. pminub mm3, mm4
  346. pmaxub mm3, mm5 ; median
  347. paddb mm3, mm2 ; +residual
  348. %if i==0
  349. movq mm7, mm3
  350. psllq mm7, 56
  351. %else
  352. movq mm6, mm3
  353. psrlq mm7, 8
  354. psllq mm6, 56
  355. por mm7, mm6
  356. %endif
  357. %if i<7
  358. psrlq mm0, 8
  359. psrlq mm1, 8
  360. psrlq mm2, 8
  361. %endif
  362. %assign i i+1
  363. %endrep
  364. movq [dstq+wq], mm7
  365. add wq, 8
  366. jl .loop
  367. movzx r2d, byte [dstq-1]
  368. mov [leftq], r2d
  369. movzx r2d, byte [topq-1]
  370. mov [left_topq], r2d
  371. RET
  372. %macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
  373. add srcq, wq
  374. add dstq, wq
  375. neg wq
  376. %%.loop:
  377. mova m1, [srcq+wq]
  378. mova m2, m1
  379. psllw m1, 8
  380. paddb m1, m2
  381. mova m2, m1
  382. pshufb m1, m3
  383. paddb m1, m2
  384. pshufb m0, m5
  385. mova m2, m1
  386. pshufb m1, m4
  387. paddb m1, m2
  388. %if mmsize == 16
  389. mova m2, m1
  390. pshufb m1, m6
  391. paddb m1, m2
  392. %endif
  393. paddb m0, m1
  394. %if %1
  395. mova [dstq+wq], m0
  396. %else
  397. movq [dstq+wq], m0
  398. movhps [dstq+wq+8], m0
  399. %endif
  400. add wq, mmsize
  401. jl %%.loop
  402. mov eax, mmsize-1
  403. sub eax, wd
  404. movd m1, eax
  405. pshufb m0, m1
  406. movd eax, m0
  407. RET
  408. %endmacro
  409. ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
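; Rough scalar equivalent (hedged sketch): a running byte-wise prefix sum that
; returns the final accumulator so the caller can carry it to the next row:
;     int acc = left;
;     for (int i = 0; i < w; i++) {
;         acc    = (acc + src[i]) & 0xff;
;         dst[i] = acc;
;     }
;     return acc;
; The SIMD code below computes the same prefix sum with a logarithmic number
; of pshufb/paddb steps per block.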
  410. INIT_MMX
  411. cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
  412. .skip_prologue:
  413. mova m5, [pb_7]
  414. mova m4, [pb_zzzz3333zzzzbbbb]
  415. mova m3, [pb_zz11zz55zz99zzdd]
  416. movd m0, leftm
  417. psllq m0, 56
  418. ADD_HFYU_LEFT_LOOP 1
  419. INIT_XMM
  420. cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
  421. mova m5, [pb_f]
  422. mova m6, [pb_zzzzzzzz77777777]
  423. mova m4, [pb_zzzz3333zzzzbbbb]
  424. mova m3, [pb_zz11zz55zz99zzdd]
  425. movd m0, leftm
  426. pslldq m0, 15
  427. test srcq, 15
  428. jnz add_hfyu_left_prediction_ssse3.skip_prologue
  429. test dstq, 15
  430. jnz .unaligned
  431. ADD_HFYU_LEFT_LOOP 1
  432. .unaligned:
  433. ADD_HFYU_LEFT_LOOP 0
  434. ; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
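; Rough reference semantics (hedged sketch; the 4-wide SIMD accumulation order
; differs from a scalar loop, so results need not be bit-exact):
;     float sum = 0.0f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;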
  435. cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
  436. neg offsetq
  437. shl offsetq, 2
  438. sub v1q, offsetq
  439. sub v2q, offsetq
  440. xorps xmm0, xmm0
  441. .loop:
  442. movaps xmm1, [v1q+offsetq]
  443. mulps xmm1, [v2q+offsetq]
  444. addps xmm0, xmm1
  445. add offsetq, 16
  446. js .loop
  447. movhlps xmm1, xmm0
  448. addps xmm0, xmm1
  449. movss xmm1, xmm0
  450. shufps xmm0, xmm0, 1
  451. addss xmm0, xmm1
  452. %ifndef ARCH_X86_64
  453. movd r0m, xmm0
  454. fld dword r0m
  455. %endif
  456. RET
  457. ; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
  458. ; x86_reg start_y, x86_reg end_y, x86_reg block_h,
  459. ; x86_reg start_x, x86_reg end_x, x86_reg block_w);
  460. ;
  461. ; The actual function itself is below. It basically wraps a very simple
  462. ; w = end_x - start_x
  463. ; if (w) {
  464. ; if (w > 22) {
  465. ; jump to the slow loop functions
  466. ; } else {
  467. ; jump to the fast loop functions
  468. ; }
  469. ; }
  470. ;
  471. ; ... and then the same for left/right extend also. See below for loop
  472. ; function implementations. Fast are fixed-width, slow is variable-width
  473. %macro EMU_EDGE_FUNC 1
  474. %ifdef ARCH_X86_64
  475. %define w_reg r10
  476. cglobal emu_edge_core_%1, 6, 7, 1
  477. mov r11, r5 ; save block_h
  478. %else
  479. %define w_reg r6
  480. cglobal emu_edge_core_%1, 2, 7, 0
  481. mov r4, r4m ; end_y
  482. mov r5, r5m ; block_h
  483. %endif
  484. ; start with vertical extend (top/bottom) and body pixel copy
  485. mov w_reg, r7m
  486. sub w_reg, r6m ; w = end_x - start_x
  487. sub r5, r4
  488. %ifdef ARCH_X86_64
  489. sub r4, r3
  490. %else
  491. sub r4, dword r3m
  492. %endif
  493. cmp w_reg, 22
  494. jg .slow_v_extend_loop
  495. %ifdef ARCH_X86_32
  496. mov r2, r2m ; linesize
  497. %endif
  498. sal w_reg, 7 ; w * 128
  499. %ifdef PIC
  500. lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
  501. add w_reg, rax
  502. %else
  503. lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
  504. %endif
  505. call w_reg ; fast top extend, body copy and bottom extend
  506. .v_extend_end:
  507. ; horizontal extend (left/right)
  508. mov w_reg, r6m ; start_x
  509. sub r0, w_reg
  510. %ifdef ARCH_X86_64
  511. mov r3, r0 ; backup of buf+block_h*linesize
  512. mov r5, r11
  513. %else
  514. mov r0m, r0 ; backup of buf+block_h*linesize
  515. mov r5, r5m
  516. %endif
  517. test w_reg, w_reg
  518. jz .right_extend
  519. cmp w_reg, 22
  520. jg .slow_left_extend_loop
  521. mov r1, w_reg
  522. dec w_reg
  523. ; FIXME we can do an if size == 1 here if that makes any speed difference, test me
  524. sar w_reg, 1
  525. sal w_reg, 6
  526. ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
  527. ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
  528. %ifdef PIC
  529. lea rax, [.emuedge_extend_left_2]
  530. add w_reg, rax
  531. %else
  532. lea w_reg, [.emuedge_extend_left_2+w_reg]
  533. %endif
  534. call w_reg
  535. ; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
  536. .right_extend:
  537. %ifdef ARCH_X86_32
  538. mov r0, r0m
  539. mov r5, r5m
  540. %endif
  541. mov w_reg, r7m ; end_x
  542. mov r1, r8m ; block_w
  543. mov r4, r1
  544. sub r1, w_reg
  545. jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
  546. cmp r1, 22
  547. jg .slow_right_extend_loop
  548. dec r1
  549. ; FIXME we can do an if size == 1 here if that makes any speed difference, test me
  550. sar r1, 1
  551. sal r1, 6
  552. %ifdef PIC
  553. lea rax, [.emuedge_extend_right_2]
  554. add r1, rax
  555. %else
  556. lea r1, [.emuedge_extend_right_2+r1]
  557. %endif
  558. call r1
  559. .h_extend_end:
  560. RET
  561. %ifdef ARCH_X86_64
  562. %define vall al
  563. %define valh ah
  564. %define valw ax
  565. %define valw2 r10w
  566. %define valw3 r3w
  567. %ifdef WIN64
  568. %define valw4 r4w
  569. %else ; unix64
  570. %define valw4 r3w
  571. %endif
  572. %define vald eax
  573. %else
  574. %define vall bl
  575. %define valh bh
  576. %define valw bx
  577. %define valw2 r6w
  578. %define valw3 valw2
  579. %define valw4 valw3
  580. %define vald ebx
  581. %define stack_offset 0x14
  582. %endif
  583. %endmacro
  584. ; macro to read/write a horizontal number of pixels (%2) to/from registers
  585. ; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
  586. ; - if (%2 & 15 == 8) fills the last 8 bytes into rax
  587. ; - else if (%2 & 8) fills 8 bytes into mm0
  588. ; - if (%2 & 7 == 4) fills the last 4 bytes into rax
  589. ; - else if (%2 & 4) fills 4 bytes into mm0-1
  590. ; - if (%2 & 3 == 3) fills 2 bytes into r10/r3, and 1 into eax
  591. ; (note that we're using r3 for body/bottom because it's a shorter
  592. ; opcode, and then the loop fits in 128 bytes)
  593. ; - else fills remaining bytes into rax
  594. ; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
  595. ; - if (%2 & 7 == 4) fills 4 bytes into ebx
  596. ; - else if (%2 & 4) fills 4 bytes into mm0-7
  597. ; - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
  598. ; - else fills remaining bytes into ebx
  599. ; writing data out works the same way
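; As a concrete (hedged) example of the expansion: READ_NUM_BYTES body, 22, sse
; reads one 16-byte block, then a 4-byte block and a 2-byte tail, roughly:
;     movdqu xmm0, [r1]
;     movd   mm0,  [r1+16]
;     mov    valw, [r1+20]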
  600. %macro READ_NUM_BYTES 3
  601. %assign %%src_off 0 ; offset in source buffer
  602. %assign %%smidx 0 ; mmx register idx
  603. %assign %%sxidx 0 ; xmm register idx
  604. %ifnidn %3, mmx
  605. %rep %2/16
  606. movdqu xmm %+ %%sxidx, [r1+%%src_off]
  607. %assign %%src_off %%src_off+16
  608. %assign %%sxidx %%sxidx+1
  609. %endrep ; %2/16
  610. %endif ; !mmx
  611. %ifdef ARCH_X86_64
  612. %if (%2-%%src_off) == 8
  613. mov rax, [r1+%%src_off]
  614. %assign %%src_off %%src_off+8
  615. %endif ; (%2-%%src_off) == 8
  616. %endif ; x86-64
  617. %rep (%2-%%src_off)/8
  618. movq mm %+ %%smidx, [r1+%%src_off]
  619. %assign %%src_off %%src_off+8
  620. %assign %%smidx %%smidx+1
  621. %endrep ; (%2-%%src_off)/8
  622. %if (%2-%%src_off) == 4
  623. mov vald, [r1+%%src_off]
  624. %elif (%2-%%src_off) & 4
  625. movd mm %+ %%smidx, [r1+%%src_off]
  626. %assign %%src_off %%src_off+4
  627. %endif ; (%2-%%src_off) ==/& 4
  628. %if (%2-%%src_off) == 1
  629. mov vall, [r1+%%src_off]
  630. %elif (%2-%%src_off) == 2
  631. mov valw, [r1+%%src_off]
  632. %elif (%2-%%src_off) == 3
  633. %ifidn %1, top
  634. mov valw2, [r1+%%src_off]
  635. %elifidn %1, body
  636. mov valw3, [r1+%%src_off]
  637. %elifidn %1, bottom
  638. mov valw4, [r1+%%src_off]
  639. %endif ; %1 ==/!= top
  640. mov vall, [r1+%%src_off+2]
  641. %endif ; (%2-%%src_off) == 1/2/3
  642. %endmacro ; READ_NUM_BYTES
  643. %macro WRITE_NUM_BYTES 3
  644. %assign %%dst_off 0 ; offset in destination buffer
  645. %assign %%dmidx 0 ; mmx register idx
  646. %assign %%dxidx 0 ; xmm register idx
  647. %ifnidn %3, mmx
  648. %rep %2/16
  649. movdqu [r0+%%dst_off], xmm %+ %%dxidx
  650. %assign %%dst_off %%dst_off+16
  651. %assign %%dxidx %%dxidx+1
  652. %endrep ; %2/16
  653. %endif
  654. %ifdef ARCH_X86_64
  655. %if (%2-%%dst_off) == 8
  656. mov [r0+%%dst_off], rax
  657. %assign %%dst_off %%dst_off+8
  658. %endif ; (%2-%%dst_off) == 8
  659. %endif ; x86-64
  660. %rep (%2-%%dst_off)/8
  661. movq [r0+%%dst_off], mm %+ %%dmidx
  662. %assign %%dst_off %%dst_off+8
  663. %assign %%dmidx %%dmidx+1
  664. %endrep ; (%2-%%dst_off)/8
  665. %if (%2-%%dst_off) == 4
  666. mov [r0+%%dst_off], vald
  667. %elif (%2-%%dst_off) & 4
  668. movd [r0+%%dst_off], mm %+ %%dmidx
  669. %assign %%dst_off %%dst_off+4
  670. %endif ; (%2-%%dst_off) ==/& 4
  671. %if (%2-%%dst_off) == 1
  672. mov [r0+%%dst_off], vall
  673. %elif (%2-%%dst_off) == 2
  674. mov [r0+%%dst_off], valw
  675. %elif (%2-%%dst_off) == 3
  676. %ifidn %1, top
  677. mov [r0+%%dst_off], valw2
  678. %elifidn %1, body
  679. mov [r0+%%dst_off], valw3
  680. %elifidn %1, bottom
  681. mov [r0+%%dst_off], valw4
  682. %endif ; %1 ==/!= top
  683. mov [r0+%%dst_off+2], vall
  684. %endif ; (%2-%%dst_off) == 1/2/3
  685. %endmacro ; WRITE_NUM_BYTES
  686. ; vertical top/bottom extend and body copy fast loops
  687. ; these are function pointers to set-width line copy functions, i.e.
  688. ; they read a fixed number of pixels into set registers, and write
  689. ; those out into the destination buffer
  690. ; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
  691. ; r6(eax/64)/r3(ebx/32)=val_reg
  692. %macro VERTICAL_EXTEND 1
  693. %assign %%n 1
  694. %rep 22
  695. ALIGN 128
  696. .emuedge_v_extend_ %+ %%n:
  697. ; extend pixels above body
  698. %ifdef ARCH_X86_64
  699. test r3 , r3 ; if (!start_y)
  700. jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
  701. %else ; ARCH_X86_32
  702. cmp dword r3m, 0
  703. je .emuedge_copy_body_ %+ %%n %+ _loop
  704. %endif ; ARCH_X86_64/32
  705. READ_NUM_BYTES top, %%n, %1 ; read bytes
  706. .emuedge_extend_top_ %+ %%n %+ _loop: ; do {
  707. WRITE_NUM_BYTES top, %%n, %1 ; write bytes
  708. add r0 , r2 ; dst += linesize
  709. %ifdef ARCH_X86_64
  710. dec r3d
  711. %else ; ARCH_X86_32
  712. dec dword r3m
  713. %endif ; ARCH_X86_64/32
  714. jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
  715. ; copy body pixels
  716. .emuedge_copy_body_ %+ %%n %+ _loop: ; do {
  717. READ_NUM_BYTES body, %%n, %1 ; read bytes
  718. WRITE_NUM_BYTES body, %%n, %1 ; write bytes
  719. add r0 , r2 ; dst += linesize
  720. add r1 , r2 ; src += linesize
  721. dec r4d
  722. jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
  723. ; copy bottom pixels
  724. test r5 , r5 ; if (!block_h)
  725. jz .emuedge_v_extend_end_ %+ %%n ; goto end
  726. sub r1 , r2 ; src -= linesize
  727. READ_NUM_BYTES bottom, %%n, %1 ; read bytes
  728. .emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
  729. WRITE_NUM_BYTES bottom, %%n, %1 ; write bytes
  730. add r0 , r2 ; dst += linesize
  731. dec r5d
  732. jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
  733. .emuedge_v_extend_end_ %+ %%n:
  734. %ifdef ARCH_X86_64
  735. ret
  736. %else ; ARCH_X86_32
  737. rep ret
  738. %endif ; ARCH_X86_64/32
  739. %assign %%n %%n+1
  740. %endrep
  741. %endmacro ; VERTICAL_EXTEND
  742. ; left/right (horizontal) fast extend functions
  743. ; these are essentially identical to the vertical extend ones above,
  744. ; just left/right separated because number of pixels to extend is
  745. ; obviously not the same on both sides.
  746. ; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
  747. ; lowest two bytes of the register (so val*0x0101), and are splatted
  748. ; into each byte of mm0 as well if n_pixels >= 8
  749. %macro READ_V_PIXEL 3
  750. mov vall, %2
  751. mov valh, vall
  752. %if %1 >= 8
  753. movd mm0, vald
  754. %ifidn %3, mmx
  755. punpcklwd mm0, mm0
  756. punpckldq mm0, mm0
  757. %else ; !mmx
  758. pshufw mm0, mm0, 0
  759. %endif ; mmx
  760. %endif ; %1 >= 8
  761. %endmacro
  762. %macro WRITE_V_PIXEL 2
  763. %assign %%dst_off 0
  764. %rep %1/8
  765. movq [%2+%%dst_off], mm0
  766. %assign %%dst_off %%dst_off+8
  767. %endrep
  768. %if %1 & 4
  769. %if %1 >= 8
  770. movd [%2+%%dst_off], mm0
  771. %else ; %1 < 8
  772. mov [%2+%%dst_off] , valw
  773. mov [%2+%%dst_off+2], valw
  774. %endif ; %1 >=/< 8
  775. %assign %%dst_off %%dst_off+4
  776. %endif ; %1 & 4
  777. %if %1&2
  778. mov [%2+%%dst_off], valw
  779. %endif ; %1 & 2
  780. %endmacro
  781. ; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
  782. %macro LEFT_EXTEND 1
  783. %assign %%n 2
  784. %rep 11
  785. ALIGN 64
  786. .emuedge_extend_left_ %+ %%n: ; do {
  787. sub r0, r2 ; dst -= linesize
  788. READ_V_PIXEL %%n, [r0+r1], %1 ; read pixels
  789. WRITE_V_PIXEL %%n, r0 ; write pixels
  790. dec r5
  791. jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
  792. %ifdef ARCH_X86_64
  793. ret
  794. %else ; ARCH_X86_32
  795. rep ret
  796. %endif ; ARCH_X86_64/32
  797. %assign %%n %%n+2
  798. %endrep
  799. %endmacro ; LEFT_EXTEND
  800. ; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
  801. %macro RIGHT_EXTEND 1
  802. %assign %%n 2
  803. %rep 11
  804. ALIGN 64
  805. .emuedge_extend_right_ %+ %%n: ; do {
  806. %ifdef ARCH_X86_64
  807. sub r3, r2 ; dst -= linesize
  808. READ_V_PIXEL %%n, [r3+w_reg-1], %1 ; read pixels
  809. WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
  810. dec r11
  811. %else ; ARCH_X86_32
  812. sub r0, r2 ; dst -= linesize
  813. READ_V_PIXEL %%n, [r0+w_reg-1], %1 ; read pixels
  814. WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
  815. dec r5
  816. %endif ; ARCH_X86_64/32
  817. jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
  818. %ifdef ARCH_X86_64
  819. ret
  820. %else ; ARCH_X86_32
  821. rep ret
  822. %endif ; ARCH_X86_64/32
  823. %assign %%n %%n+2
  824. %endrep
  825. %ifdef ARCH_X86_32
  826. %define stack_offset 0x10
  827. %endif
  828. %endmacro ; RIGHT_EXTEND
  829. ; below follow the "slow" copy/extend functions, these act on a non-fixed
  830. ; width specified in a register, and run a loop to copy the full amount
  831. ; of bytes. They are optimized for copying of large amounts of pixels per
  832. ; line, so they unconditionally splat data into mm registers to copy 8
  833. ; bytes per loop iteration. xmm registers could be used on x86-64 as
  834. ; well, but I haven't optimized that path as much (i.e. FIXME)
  835. %macro V_COPY_NPX 4-5
  836. %if %0 == 4
  837. test w_reg, %4
  838. jz .%1_skip_%4_px
  839. %else ; %0 == 5
  840. .%1_%4_px_loop:
  841. %endif
  842. %3 %2, [r1+cnt_reg]
  843. %3 [r0+cnt_reg], %2
  844. add cnt_reg, %4
  845. %if %0 == 5
  846. sub w_reg, %4
  847. test w_reg, %5
  848. jnz .%1_%4_px_loop
  849. %endif
  850. .%1_skip_%4_px:
  851. %endmacro
  852. %macro V_COPY_ROW 3
  853. %ifidn %1, bottom
  854. sub r1, linesize
  855. %endif
  856. .%1_copy_loop:
  857. xor cnt_reg, cnt_reg
  858. %ifidn %3, mmx
  859. %define linesize r2m
  860. V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
  861. %else ; !mmx
  862. V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
  863. %ifdef ARCH_X86_64
  864. %define linesize r2
  865. V_COPY_NPX %1, rax , mov, 8
  866. %else ; ARCH_X86_32
  867. %define linesize r2m
  868. V_COPY_NPX %1, mm0, movq, 8
  869. %endif ; ARCH_X86_64/32
  870. %endif ; mmx
  871. V_COPY_NPX %1, vald, mov, 4
  872. V_COPY_NPX %1, valw, mov, 2
  873. V_COPY_NPX %1, vall, mov, 1
  874. mov w_reg, cnt_reg
  875. %ifidn %1, body
  876. add r1, linesize
  877. %endif
  878. add r0, linesize
  879. dec %2
  880. jnz .%1_copy_loop
  881. %endmacro
  882. %macro SLOW_V_EXTEND 1
  883. .slow_v_extend_loop:
  884. ; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
  885. ; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
  886. %ifdef ARCH_X86_64
  887. push r11 ; save old value of block_h
  888. test r3, r3
  889. %define cnt_reg r11
  890. jz .do_body_copy ; if (!start_y) goto do_body_copy
  891. V_COPY_ROW top, r3, %1
  892. %else
  893. cmp dword r3m, 0
  894. %define cnt_reg r2
  895. je .do_body_copy ; if (!start_y) goto do_body_copy
  896. V_COPY_ROW top, dword r3m, %1
  897. %endif
  898. .do_body_copy:
  899. V_COPY_ROW body, r4, %1
  900. %ifdef ARCH_X86_64
  901. pop r11 ; restore old value of block_h
  902. %define cnt_reg r3
  903. %endif
  904. test r5, r5
  905. %ifdef ARCH_X86_64
  906. jz .v_extend_end
  907. %else
  908. jz .skip_bottom_extend
  909. %endif
  910. V_COPY_ROW bottom, r5, %1
  911. %ifdef ARCH_X86_32
  912. .skip_bottom_extend:
  913. mov r2, r2m
  914. %endif
  915. jmp .v_extend_end
  916. %endmacro
  917. %macro SLOW_LEFT_EXTEND 1
  918. .slow_left_extend_loop:
  919. ; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
  920. mov r4, 8
  921. sub r0, linesize
  922. READ_V_PIXEL 8, [r0+w_reg], %1
  923. .left_extend_8px_loop:
  924. movq [r0+r4-8], mm0
  925. add r4, 8
  926. cmp r4, w_reg
  927. jle .left_extend_8px_loop
  928. sub r4, 8
  929. cmp r4, w_reg
  930. jge .left_extend_loop_end
  931. .left_extend_2px_loop:
  932. mov [r0+r4], valw
  933. add r4, 2
  934. cmp r4, w_reg
  935. jl .left_extend_2px_loop
  936. .left_extend_loop_end:
  937. dec r5
  938. jnz .slow_left_extend_loop
  939. %ifdef ARCH_X86_32
  940. mov r2, r2m
  941. %endif
  942. jmp .right_extend
  943. %endmacro
  944. %macro SLOW_RIGHT_EXTEND 1
  945. .slow_right_extend_loop:
  946. ; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
  947. ; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
  948. %ifdef ARCH_X86_64
  949. %define buf_reg r3
  950. %define bh_reg r11
  951. %else
  952. %define buf_reg r0
  953. %define bh_reg r5
  954. %endif
  955. lea r1, [r4-8]
  956. sub buf_reg, linesize
  957. READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
  958. .right_extend_8px_loop:
  959. movq [buf_reg+r1], mm0
  960. sub r1, 8
  961. cmp r1, w_reg
  962. jge .right_extend_8px_loop
  963. add r1, 8
  964. cmp r1, w_reg
  965. je .right_extend_loop_end
  966. .right_extend_2px_loop:
  967. sub r1, 2
  968. mov [buf_reg+r1], valw
  969. cmp r1, w_reg
  970. jg .right_extend_2px_loop
  971. .right_extend_loop_end:
  972. dec bh_reg
  973. jnz .slow_right_extend_loop
  974. jmp .h_extend_end
  975. %endmacro
  976. %macro emu_edge 1
  977. EMU_EDGE_FUNC %1
  978. VERTICAL_EXTEND %1
  979. LEFT_EXTEND %1
  980. RIGHT_EXTEND %1
  981. SLOW_V_EXTEND %1
  982. SLOW_LEFT_EXTEND %1
  983. SLOW_RIGHT_EXTEND %1
  984. %endmacro
  985. emu_edge sse
  986. %ifdef ARCH_X86_32
  987. emu_edge mmx
  988. %endif
  989. ;-----------------------------------------------------------------------------
  990. ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
  991. ; int32_t max, unsigned int len)
  992. ;-----------------------------------------------------------------------------
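; Rough reference semantics (hedged sketch):
;     for (unsigned int i = 0; i < len; i++)
;         dst[i] = src[i] < min ? min : src[i] > max ? max : src[i];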
  993. ; %1 = number of xmm registers used
  994. ; %2 = number of inline load/process/store loops per asm loop
  995. ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
  996. ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
  997. ; %5 = suffix
  998. %macro VECTOR_CLIP_INT32 4-5
  999. cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
  1000. %if %4
  1001. cvtsi2ss m4, minm
  1002. cvtsi2ss m5, maxm
  1003. %else
  1004. movd m4, minm
  1005. movd m5, maxm
  1006. %endif
  1007. SPLATD m4
  1008. SPLATD m5
  1009. .loop:
  1010. %assign %%i 1
  1011. %rep %2
  1012. mova m0, [srcq+mmsize*0*%%i]
  1013. mova m1, [srcq+mmsize*1*%%i]
  1014. mova m2, [srcq+mmsize*2*%%i]
  1015. mova m3, [srcq+mmsize*3*%%i]
  1016. %if %3
  1017. mova m7, [srcq+mmsize*4*%%i]
  1018. mova m8, [srcq+mmsize*5*%%i]
  1019. mova m9, [srcq+mmsize*6*%%i]
  1020. mova m10, [srcq+mmsize*7*%%i]
  1021. %endif
  1022. CLIPD m0, m4, m5, m6
  1023. CLIPD m1, m4, m5, m6
  1024. CLIPD m2, m4, m5, m6
  1025. CLIPD m3, m4, m5, m6
  1026. %if %3
  1027. CLIPD m7, m4, m5, m6
  1028. CLIPD m8, m4, m5, m6
  1029. CLIPD m9, m4, m5, m6
  1030. CLIPD m10, m4, m5, m6
  1031. %endif
  1032. mova [dstq+mmsize*0*%%i], m0
  1033. mova [dstq+mmsize*1*%%i], m1
  1034. mova [dstq+mmsize*2*%%i], m2
  1035. mova [dstq+mmsize*3*%%i], m3
  1036. %if %3
  1037. mova [dstq+mmsize*4*%%i], m7
  1038. mova [dstq+mmsize*5*%%i], m8
  1039. mova [dstq+mmsize*6*%%i], m9
  1040. mova [dstq+mmsize*7*%%i], m10
  1041. %endif
  1042. %assign %%i %%i+1
  1043. %endrep
  1044. add srcq, mmsize*4*(%2+%3)
  1045. add dstq, mmsize*4*(%2+%3)
  1046. sub lend, mmsize*(%2+%3)
  1047. jg .loop
  1048. REP_RET
  1049. %endmacro
  1050. INIT_MMX mmx
  1051. %define SPLATD SPLATD_MMX
  1052. %define CLIPD CLIPD_MMX
  1053. VECTOR_CLIP_INT32 0, 1, 0, 0
  1054. INIT_XMM sse2
  1055. %define SPLATD SPLATD_SSE2
  1056. VECTOR_CLIP_INT32 6, 1, 0, 0, _int
  1057. %define CLIPD CLIPD_SSE2
  1058. VECTOR_CLIP_INT32 6, 2, 0, 1
  1059. INIT_XMM sse4
  1060. %define CLIPD CLIPD_SSE41
  1061. %ifdef m8
  1062. VECTOR_CLIP_INT32 11, 1, 1, 0
  1063. %else
  1064. VECTOR_CLIP_INT32 6, 1, 0, 0
  1065. %endif
  1066. ;-----------------------------------------------------------------------------
  1067. ; void ff_butterflies_float_interleave(float *dst, const float *src0,
  1068. ; const float *src1, int len);
  1069. ;-----------------------------------------------------------------------------
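; Rough reference semantics (hedged sketch): a butterfly of the two inputs with
; the sum/difference pairs interleaved into dst:
;     for (int i = 0; i < len; i++) {
;         float a = src0[i], b = src1[i];
;         dst[2*i    ] = a + b;
;         dst[2*i + 1] = a - b;
;     }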
  1070. %macro BUTTERFLIES_FLOAT_INTERLEAVE 0
  1071. cglobal butterflies_float_interleave, 4,4,3, dst, src0, src1, len
  1072. %ifdef ARCH_X86_64
  1073. movsxd lenq, lend
  1074. %endif
  1075. test lenq, lenq
  1076. jz .end
  1077. shl lenq, 2
  1078. lea src0q, [src0q + lenq]
  1079. lea src1q, [src1q + lenq]
  1080. lea dstq, [ dstq + 2*lenq]
  1081. neg lenq
  1082. .loop:
  1083. mova m0, [src0q + lenq]
  1084. mova m1, [src1q + lenq]
  1085. subps m2, m0, m1
  1086. addps m0, m0, m1
  1087. unpcklps m1, m0, m2
  1088. unpckhps m0, m0, m2
  1089. %if cpuflag(avx)
  1090. vextractf128 [dstq + 2*lenq ], m1, 0
  1091. vextractf128 [dstq + 2*lenq + 16], m0, 0
  1092. vextractf128 [dstq + 2*lenq + 32], m1, 1
  1093. vextractf128 [dstq + 2*lenq + 48], m0, 1
  1094. %else
  1095. mova [dstq + 2*lenq ], m1
  1096. mova [dstq + 2*lenq + mmsize], m0
  1097. %endif
  1098. add lenq, mmsize
  1099. jl .loop
  1100. %if mmsize == 32
  1101. vzeroupper
  1102. RET
  1103. %endif
  1104. .end:
  1105. REP_RET
  1106. %endmacro
  1107. INIT_XMM sse
  1108. BUTTERFLIES_FLOAT_INTERLEAVE
  1109. INIT_YMM avx
  1110. BUTTERFLIES_FLOAT_INTERLEAVE