;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
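; Rough C reference (a hedged sketch, not the exact bit-for-bit semantics --
; the SIMD code below applies the shift to partial sums rather than to each
; individual product):
;   int sum = 0;
;   for (i = 0; i < order; i++)
;       sum += v1[i] * v2[i];
;   return sum >> shift;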
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd m3, shiftm
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    psrad m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad m2, m3
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
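; Rough C reference (hedged sketch): returns the dot product of v1 and v2
; while also updating v1 += mul * v3 in place:
;   int sum = 0;
;   for (i = 0; i < order; i++) {
;       sum   += v1[i] * v2[i];
;       v1[i] += mul * v3[i];
;   }
;   return sum;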
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2
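
; The SSSE3 variant below keeps every load aligned: v2/v3 are rounded down to
; a 16-byte boundary and %1 is their byte misalignment (always even, 0..14).
; Each unaligned vector is rebuilt from two aligned loads with palignr,
; roughly (hedged sketch):
;   lo  = aligned load at (p & ~15)
;   hi  = aligned load at (p & ~15) + 16
;   vec = low 16 bytes of ((hi:lo) >> 8*%1)   ; what palignr computes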
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
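; Rough C reference (hedged sketch) of the median-predictor reconstruction:
;   l = *left; tl = *left_top;
;   for (i = 0; i < w; i++) {
;       pred   = median(l, top[i], l + top[i] - tl);
;       l      = pred + diff[i];          // all byte arithmetic, mod 256
;       dst[i] = l;
;       tl     = top[i];
;   }
;   *left = l; *left_top = tl;
; The min/max trick used below: median(a,b,c) = max(min(a,b), min(max(a,b), c)).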
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
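
; ADD_HFYU_LEFT_LOOP computes a running byte-wise prefix sum, dst[i] = left += src[i].
; The SIMD idea (hedged sketch) is a log2-step scan within each vector:
;   x += (x << 8) within each 16-bit lane   ; 2-byte prefix sums
;   x += pshufb(x, m3)                      ; 4-byte prefix sums
;   x += pshufb(x, m4)                      ; 8-byte prefix sums
;   x += pshufb(x, m6)                      ; 16-byte prefix sums (xmm only)
; after which the carried-in "left" byte is broadcast and added on top.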
%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
    mova m1, [srcq+wq]
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
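; Rough C reference (hedged sketch); the return value is the final running
; sum so the caller can carry it into the next call:
;   for (i = 0; i < w; i++)
;       dst[i] = left = (left + src[i]) & 0xff;
;   return left;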
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
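; Rough C reference (hedged sketch; len is assumed to be a multiple of 4 and
; both pointers 16-byte aligned, as required by the movaps loads below):
;   float sum = 0.0f;
;   for (i = 0; i < len; i++)
;       sum += v1[i] * v2[i];
;   return sum;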
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%ifndef ARCH_X86_64
    movd r0m, xmm0
    fld dword r0m
%endif
    RET

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; piece of logic:
;   w = end_x - start_x
;   if (w) {
;     if (w > 22) {
;       jump to the slow loop functions
;     } else {
;       jump to the fast loop functions
;     }
;   }
;
; ... and then the same for left/right extend also. See below for the loop
; function implementations. The fast ones are fixed-width, the slow ones are variable-width.
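;
; Rough pseudo-C of the whole core (hedged sketch; buf and src are assumed to
; already point at column start_x of the first row to process, which is how
; the register adjustments below treat them):
;   w = end_x - start_x;
;   for (y = 0; y < start_y; y++, buf += linesize)       // top: replicate first row
;       memcpy(buf, src, w);
;   for (; y < end_y; y++, buf += linesize, src += linesize)
;       memcpy(buf, src, w);                             // body: plain copy
;   for (; y < block_h; y++, buf += linesize)            // bottom: replicate last row
;       memcpy(buf, src - linesize, w);
;   // and afterwards every written row is extended sideways:
;   //   bytes [0, start_x)      <- the byte at start_x
;   //   bytes [end_x, block_w)  <- the byte at end_x - 1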
%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
    mov r11, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif
; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%ifdef ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:
; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%ifdef ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r11
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
    ; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
    ; FIXME we could special-case size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%ifdef WIN64
%define valw4 r4w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out works the same way
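;
; Worked example (hedged, for illustration): READ_NUM_BYTES body, 22, sse on
; x86-64 expands to roughly
;   movdqu xmm0, [r1]        ; 16 bytes
;   movd   mm0,  [r1+16]     ;  4 bytes
;   mov    valw, [r1+20]     ;  2 bytes (valw == ax here) -> 22 in total
; and WRITE_NUM_BYTES with the same arguments stores those registers back out.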
%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif ; !mmx

%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8

%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx

%ifnidn %3, mmx
%rep %2/16
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test r3, r3 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n, %1 ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES top, %%n, %1 ; write bytes
    add r0, r2 ; dst += linesize
%ifdef ARCH_X86_64
    dec r3d
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)
    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
    READ_NUM_BYTES body, %%n, %1 ; read bytes
    WRITE_NUM_BYTES body, %%n, %1 ; write bytes
    add r0, r2 ; dst += linesize
    add r1, r2 ; src += linesize
    dec r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)
    ; copy bottom pixels
    test r5, r5 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n ; goto end
    sub r1, r2 ; src -= linesize
    READ_NUM_BYTES bottom, %%n, %1 ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n, %1 ; write bytes
    add r0, r2 ; dst += linesize
    dec r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
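;
; Worked example (hedged): with a source byte of 0xAB and n_pixels >= 8,
; READ_V_PIXEL leaves valw = 0xabab and mm0 = 0xabababababababab, so that
; WRITE_V_PIXEL can store the replicated pixel 8, 4 or 2 bytes at a time.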
%macro READ_V_PIXEL 3
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%ifidn %3, mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%else ; !mmx
    pshufw mm0, mm0, 0
%endif ; mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off], valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+r1], %1 ; read pixels
    WRITE_V_PIXEL %%n, r0 ; write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%ifdef ARCH_X86_64
    sub r3, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1], %1 ; read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
    dec r11
%else ; ARCH_X86_32
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1], %1 ; read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions. These act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
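;
; Rough sketch of what V_COPY_ROW does for one row in the sse/x86-64 case
; (hedged; cnt walks the row in decreasing chunk sizes):
;   cnt = 0;
;   do { copy 16 bytes at cnt; cnt += 16; w -= 16; } while (w >= 16);
;   if (w & 8) { copy 8 bytes at cnt; cnt += 8; }
;   if (w & 4) { copy 4 bytes at cnt; cnt += 4; }
;   if (w & 2) { copy 2 bytes at cnt; cnt += 2; }
;   if (w & 1) { copy 1 byte  at cnt;           }
; and this is repeated for every row of the region it is invoked on.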
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 3
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%ifidn %3, mmx
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; !mmx
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax, mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; mmx
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 1
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push r11 ; save old value of block_h
    test r3, r3
%define cnt_reg r11
    jz .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
%endif
.do_body_copy:
    V_COPY_ROW body, r4, %1
%ifdef ARCH_X86_64
    pop r11 ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5, %1
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
    EMU_EDGE_FUNC %1
    VERTICAL_EXTEND %1
    LEFT_EXTEND %1
    RIGHT_EXTEND %1
    SLOW_V_EXTEND %1
    SLOW_LEFT_EXTEND %1
    SLOW_RIGHT_EXTEND %1
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif