;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
pd_16384: times 4 dd 16384

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
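; In effect: accumulate v1[i]*v2[i] over 'order' elements, with the 32-bit
; partial sums shifted right by 'shift' during the final horizontal reduction.
; Both pointers are advanced past the end of the arrays and 'order' (converted
; to a byte count) is negated, so the loop needs only one add+jl per iteration.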
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
shl orderq, 1
add v1q, orderq
add v2q, orderq
neg orderq
movd m3, shiftm
pxor m2, m2
.loop:
movu m0, [v1q + orderq]
movu m1, [v1q + orderq + mmsize]
pmaddwd m0, [v2q + orderq]
pmaddwd m1, [v2q + orderq + mmsize]
paddd m2, m0
paddd m2, m1
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m2
paddd m2, m0
psrad m2, m3
pshuflw m0, m2, 0x4e
%else
psrad m2, m3
pshufw m0, m2, 0x4e
%endif
paddd m2, m0
movd eax, m2
RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
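; Returns the dot product of v1 and v2 (computed from the original v1 values)
; and, in the same pass, updates v1[i] += mul * v3[i] with 16-bit wraparound
; arithmetic (pmullw/paddw). v1 is accessed with mova, so for the sse2 version
; it is assumed to be 16-byte aligned.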
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
%if mmsize == 16
pshuflw m7, m7, 0
punpcklqdq m7, m7
%else
pshufw m7, m7, 0
%endif
pxor m6, m6
add v1q, orderq
add v2q, orderq
add v3q, orderq
neg orderq
.loop:
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
pmaddwd m0, m4
pmaddwd m1, m5
pmullw m2, m7
pmullw m3, m7
paddd m6, m0
paddd m6, m1
paddw m2, m4
paddw m3, m5
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
%else
pshufw m0, m6, 0x4e
%endif
paddd m6, m0
movd eax, m6
RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
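; The ssse3 variant below aligns v2 and v3 down to a 16-byte boundary so that
; all loads can use mova; %1 is their shared byte misalignment (assumed equal
; for both pointers, and even since the data is int16). palignr stitches each
; pair of aligned loads back into the intended unaligned vector, and one copy
; of the loop is emitted per possible misalignment (0..14, in steps of 2).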
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7
paddw m2, t0
paddw m3, t1
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
pshuflw m7, m7, 0
punpcklqdq m7, m7
pxor m6, m6
mov r4d, v2d
and r4d, 15
and v2q, ~15
and v3q, ~15
mova m4, [v2q + orderq]
mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
cmp r4d, 0
je .loop0
cmp r4d, 2
je .loop2
cmp r4d, 4
je .loop4
cmp r4d, 6
je .loop6
cmp r4d, 8
je .loop8
cmp r4d, 10
je .loop10
cmp r4d, 12
je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
paddd m6, m0
movd eax, m6
RET

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
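; Roughly: output[i]       = (input[i]       * window[i] + (1<<14)) >> 15
;          output[len-1-i] = (input[len-1-i] * window[i] + (1<<14)) >> 15
; for i = 0..len/2-1, i.e. only the first half of the (symmetric) window is
; read; it is re-applied, reversed, to the second half of the input. The
; rounding term is only present in the bit-exact (_ba) and ssse3 versions.
; Note that 'len' counts int16 elements, so used directly as a byte offset it
; lands on the midpoint of the buffers: 'offset' walks the second half upwards
; while 'offset2' walks the first half downwards.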
%macro REVERSE_WORDS_MMXEXT 1-2
pshufw %1, %1, 0x1B
%endmacro

%macro REVERSE_WORDS_SSE2 1-2
pshuflw %1, %1, 0x1B
pshufhw %1, %1, 0x1B
pshufd %1, %1, 0x4E
%endmacro

%macro REVERSE_WORDS_SSSE3 2
pshufb %1, %2
%endmacro

; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
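; (pmulhw yields bits 31:16 of the product and pmullw bits 15:0; shifting the
;  high half left by 1 and or-ing in bit 15 of the low half reconstructs
;  bits 30:15, i.e. (dst*src)>>15.)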
%macro MUL16FIXED_MMXEXT 3 ; dst, src, temp
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
psrlw %3, 15
psllw %1, 1
por %1, %3
%endmacro

; dst = ((dst * src) + (1<<14)) >> 15
%macro MUL16FIXED_SSSE3 3 ; dst, src, unused
pmulhrsw %1, %2
%endmacro

%macro APPLY_WINDOW_INT16 3 ; %1=instruction set, %2=mmxext/sse2 bit exact version, %3=has_ssse3
cglobal apply_window_int16_%1, 4,5,6, output, input, window, offset, offset2
lea offset2q, [offsetq-mmsize]
%if %2
mova m5, [pd_16384]
%elifidn %1, ssse3
mova m5, [pb_revwords]
ALIGN 16
%endif
.loop:
%if %2
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offset2q], m0
REVERSE_WORDS m3
mova m4, [ inputq+offsetq]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
%elif %3
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m1, m0
REVERSE_WORDS m0, m5
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
MUL16FIXED m1, m0, m3
REVERSE_WORDS m0
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%endif
add offsetd, mmsize
sub offset2d, mmsize
jae .loop
REP_RET
%endmacro

INIT_MMX
%define REVERSE_WORDS REVERSE_WORDS_MMXEXT
%define MUL16FIXED MUL16FIXED_MMXEXT
APPLY_WINDOW_INT16 mmxext, 0, 0
APPLY_WINDOW_INT16 mmxext_ba, 1, 0
INIT_XMM
%define REVERSE_WORDS REVERSE_WORDS_SSE2
APPLY_WINDOW_INT16 sse2, 0, 0
APPLY_WINDOW_INT16 sse2_ba, 1, 0
APPLY_WINDOW_INT16 ssse3_atom, 0, 1
%define REVERSE_WORDS REVERSE_WORDS_SSSE3
APPLY_WINDOW_INT16 ssse3, 0, 1

; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
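; For each pixel: pred = median(left, top, left + top - topleft), then
; dst = pred + diff, and the result becomes 'left' for the next pixel.
; Eight pixels are loaded per iteration, but because each prediction depends
; on the previous output, the median itself is computed one byte at a time
; inside the unrolled %rep 8 block (mm3 carries the running 'left' value).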
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
psllq mm2, 8
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
psubb mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
movq mm4, [topq+wq]
movq mm0, mm4
psllq mm4, 8
por mm4, mm1
movq mm1, mm0 ; t
psubb mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
%rep 8
movq mm4, mm0
paddb mm4, mm3 ; t-tl+l
movq mm5, mm3
pmaxub mm3, mm1
pminub mm5, mm1
pminub mm3, mm4
pmaxub mm3, mm5 ; median
paddb mm3, mm2 ; +residual
%if i==0
movq mm7, mm3
psllq mm7, 56
%else
movq mm6, mm3
psrlq mm7, 8
psllq mm6, 56
por mm7, mm6
%endif
%if i<7
psrlq mm0, 8
psrlq mm1, 8
psrlq mm2, 8
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
add srcq, wq
add dstq, wq
neg wq
%%.loop:
mova m1, [srcq+wq]
mova m2, m1
psllw m1, 8
paddb m1, m2
mova m2, m1
pshufb m1, m3
paddb m1, m2
pshufb m0, m5
mova m2, m1
pshufb m1, m4
paddb m1, m2
%if mmsize == 16
mova m2, m1
pshufb m1, m6
paddb m1, m2
%endif
paddb m0, m1
%if %1
mova [dstq+wq], m0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
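; Computes dst[i] = dst[i-1] + src[i] (a running byte-wise sum, seeded with
; 'left') and returns the last output byte as the new 'left' value. Within
; each mmsize-wide chunk the prefix sum is built in log2(mmsize) shift/pshufb
; + paddb steps; m0 carries the final byte of the previous chunk, splatted
; across the register, into the next iteration.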
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
mova m5, [pb_f]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
jnz add_hfyu_left_prediction_ssse3.skip_prologue
test dstq, 15
jnz .unaligned
ADD_HFYU_LEFT_LOOP 1
.unaligned:
ADD_HFYU_LEFT_LOOP 0

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
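; Sums v1[i]*v2[i] over 'len' floats. movaps is used and the loop consumes 16
; bytes per step, so both pointers are assumed 16-byte aligned and len a
; multiple of 4. The result is returned in xmm0 on x86-64 and pushed onto the
; x87 stack (via the stack slot of the first argument) on x86-32.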
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
neg offsetq
shl offsetq, 2
sub v1q, offsetq
sub v2q, offsetq
xorps xmm0, xmm0
.loop:
movaps xmm1, [v1q+offsetq]
mulps xmm1, [v2q+offsetq]
addps xmm0, xmm1
add offsetq, 16
js .loop
movhlps xmm1, xmm0
addps xmm0, xmm1
movss xmm1, xmm0
shufps xmm0, xmm0, 1
addss xmm0, xmm1
%ifndef ARCH_X86_64
movd r0m, xmm0
fld dword r0m
%endif
RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
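;
; The fast paths are dispatched with a computed call: each fixed-width variant
; below is aligned to 128 bytes (vertical) or 64 bytes (left/right), so the
; width only needs to be scaled into a byte offset (w*128, or ((w-1)>>1)*64
; for the horizontal cases, which come in steps of 2 pixels) and added to a
; base address derived from the first variant's label. Widths of at most 22
; pixels take this path; wider blocks fall through to the variable-width
; "slow" loops.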
%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
mov r11, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
mov r4, r4m ; end_y
mov r5, r5m ; block_h
%endif
; start with vertical extend (top/bottom) and body pixel copy
mov w_reg, r7m
sub w_reg, r6m ; w = end_x - start_x
sub r5, r4
%ifdef ARCH_X86_64
sub r4, r3
%else
sub r4, dword r3m
%endif
cmp w_reg, 22
jg .slow_v_extend_loop
%ifdef ARCH_X86_32
mov r2, r2m ; linesize
%endif
sal w_reg, 7 ; w * 128
%ifdef PIC
lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
add w_reg, rax
%else
lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:
; horizontal extend (left/right)
mov w_reg, r6m ; start_x
sub r0, w_reg
%ifdef ARCH_X86_64
mov r3, r0 ; backup of buf+block_h*linesize
mov r5, r11
%else
mov r0m, r0 ; backup of buf+block_h*linesize
mov r5, r5m
%endif
test w_reg, w_reg
jz .right_extend
cmp w_reg, 22
jg .slow_left_extend_loop
mov r1, w_reg
dec w_reg
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar w_reg, 1
sal w_reg, 6
; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
lea rax, [.emuedge_extend_left_2]
add w_reg, rax
%else
lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
mov r0, r0m
mov r5, r5m
%endif
mov w_reg, r7m ; end_x
mov r1, r8m ; block_w
mov r4, r1
sub r1, w_reg
jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
cmp r1, 22
jg .slow_right_extend_loop
dec r1
; FIXME we can do a if size == 1 here if that makes any speed difference, test me
sar r1, 1
sal r1, 6
%ifdef PIC
lea rax, [.emuedge_extend_right_2]
add r1, rax
%else
lea r1, [.emuedge_extend_right_2+r1]
%endif
call r1
.h_extend_end:
RET

%ifdef ARCH_X86_64
%define vall al
%define valh ah
%define valw ax
%define valw2 r10w
%define valw3 r3w
%ifdef WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall bl
%define valh bh
%define valw bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx 0 ; mmx register idx
%assign %%sxidx 0 ; xmm register idx
%ifnidn %3, mmx
%rep %2/16
movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx %%sxidx+1
%endrep ; %2/16
%endif ; !mmx
%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64
%rep (%2-%%src_off)/8
movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
mov valw2, [r1+%%src_off]
%elifidn %1, body
mov valw3, [r1+%%src_off]
%elifidn %1, bottom
mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx 0 ; mmx register idx
%assign %%dxidx 0 ; xmm register idx
%ifnidn %3, mmx
%rep %2/16
movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx %%dxidx+1
%endrep ; %2/16
%endif
%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64
%rep (%2-%%dst_off)/8
movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
mov [r0+%%dst_off], valw2
%elifidn %1, body
mov [r0+%%dst_off], valw3
%elifidn %1, bottom
mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
; extend pixels above body
%ifdef ARCH_X86_64
test r3 , r3 ; if (!start_y)
jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
cmp dword r3m, 0
je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
READ_NUM_BYTES top, %%n, %1 ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES top, %%n, %1 ; write bytes
add r0 , r2 ; dst += linesize
%ifdef ARCH_X86_64
dec r3d
%else ; ARCH_X86_32
dec dword r3m
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
READ_NUM_BYTES body, %%n, %1 ; read bytes
WRITE_NUM_BYTES body, %%n, %1 ; write bytes
add r0 , r2 ; dst += linesize
add r1 , r2 ; src += linesize
dec r4d
jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
; copy bottom pixels
test r5 , r5 ; if (!block_h)
jz .emuedge_v_extend_end_ %+ %%n ; goto end
sub r1 , r2 ; src -= linesize
READ_NUM_BYTES bottom, %%n, %1 ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
WRITE_NUM_BYTES bottom, %%n, %1 ; write bytes
add r0 , r2 ; dst += linesize
dec r5d
jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
%macro READ_V_PIXEL 3
mov vall, %2
mov valh, vall
%if %1 >= 8
movd mm0, vald
%ifidn %3, mmx
punpcklwd mm0, mm0
punpckldq mm0, mm0
%else ; !mmx
pshufw mm0, mm0, 0
%endif ; mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
movd [%2+%%dst_off], mm0
%else ; %1 < 8
mov [%2+%%dst_off] , valw
mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+r1], %1 ; read pixels
WRITE_V_PIXEL %%n, r0 ; write pixels
dec r5
jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%ifdef ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%ifdef ARCH_X86_64
sub r3, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r3+w_reg-1], %1 ; read pixels
WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
dec r11
%else ; ARCH_X86_32
sub r0, r2 ; dst -= linesize
READ_V_PIXEL %%n, [r0+w_reg-1], %1 ; read pixels
WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
dec r5
%endif ; ARCH_X86_64/32
jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%ifdef ARCH_X86_64
ret
%else ; ARCH_X86_32
rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
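; V_COPY_NPX arguments: %1=label prefix, %2=register, %3=mov instruction,
; %4=chunk size in bytes, optional %5=mask. With 4 arguments it copies a
; single %4-byte chunk if that bit is still set in the remaining width; with
; 5 arguments it loops, copying %4 bytes at a time while the width masked by
; %5 is non-zero.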
%macro V_COPY_NPX 4-5
%if %0 == 4
test w_reg, %4
jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
%3 %2, [r1+cnt_reg]
%3 [r0+cnt_reg], %2
add cnt_reg, %4
%if %0 == 5
sub w_reg, %4
test w_reg, %5
jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 3
%ifidn %1, bottom
sub r1, linesize
%endif
.%1_copy_loop:
xor cnt_reg, cnt_reg
%ifidn %3, mmx
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; !mmx
V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
V_COPY_NPX %1, rax , mov, 8
%else ; ARCH_X86_32
%define linesize r2m
V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; mmx
V_COPY_NPX %1, vald, mov, 4
V_COPY_NPX %1, valw, mov, 2
V_COPY_NPX %1, vall, mov, 1
mov w_reg, cnt_reg
%ifidn %1, body
add r1, linesize
%endif
add r0, linesize
dec %2
jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 1
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
push r11 ; save old value of block_h
test r3, r3
%define cnt_reg r11
jz .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, r3, %1
%else
cmp dword r3m, 0
%define cnt_reg r2
je .do_body_copy ; if (!start_y) goto do_body_copy
V_COPY_ROW top, dword r3m, %1
%endif
.do_body_copy:
V_COPY_ROW body, r4, %1
%ifdef ARCH_X86_64
pop r11 ; restore old value of block_h
%define cnt_reg r3
%endif
test r5, r5
%ifdef ARCH_X86_64
jz .v_extend_end
%else
jz .skip_bottom_extend
%endif
V_COPY_ROW bottom, r5, %1
%ifdef ARCH_X86_32
.skip_bottom_extend:
mov r2, r2m
%endif
jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
mov r4, 8
sub r0, linesize
READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
movq [r0+r4-8], mm0
add r4, 8
cmp r4, w_reg
jle .left_extend_8px_loop
sub r4, 8
cmp r4, w_reg
jge .left_extend_loop_end
.left_extend_2px_loop:
mov [r0+r4], valw
add r4, 2
cmp r4, w_reg
jl .left_extend_2px_loop
.left_extend_loop_end:
dec r5
jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
mov r2, r2m
%endif
jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
lea r1, [r4-8]
sub buf_reg, linesize
READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
movq [buf_reg+r1], mm0
sub r1, 8
cmp r1, w_reg
jge .right_extend_8px_loop
add r1, 8
cmp r1, w_reg
je .right_extend_loop_end
.right_extend_2px_loop:
sub r1, 2
mov [buf_reg+r1], valw
cmp r1, w_reg
jg .right_extend_2px_loop
.right_extend_loop_end:
dec bh_reg
jnz .slow_right_extend_loop
jmp .h_extend_end
%endmacro

%macro emu_edge 1
EMU_EDGE_FUNC %1
VERTICAL_EXTEND %1
LEFT_EXTEND %1
RIGHT_EXTEND %1
SLOW_V_EXTEND %1
SLOW_LEFT_EXTEND %1
SLOW_RIGHT_EXTEND %1
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif