;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13

section .text align=16

%macro SCALARPRODUCT 1
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
cglobal scalarproduct_int16_%1, 3,3,4, v1, v2, order, shift
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    movd m3, shiftm
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    psrad m2, m3
    pshuflw m0, m2, 0x4e
%else
    psrad m2, m3
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_%1, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro
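
; For reference, a rough C model of the two kernels generated by the macro
; above (a readability sketch, not FFmpeg's C fallback verbatim: the SIMD code
; applies the >> shift to packed partial sums rather than to the final total,
; and the in-place madd wraps in 16 bits via pmullw/paddw):
;
;   int32_t scalarproduct_int16(int16_t *v1, int16_t *v2, int order, int shift)
;   {
;       int32_t sum = 0;
;       for (int i = 0; i < order; i++)
;           sum += v1[i] * v2[i];
;       return sum >> shift;
;   }
;
;   int32_t scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                        int order, int mul)
;   {
;       int32_t sum = 0;
;       for (int i = 0; i < order; i++) {
;           sum   += v1[i] * v2[i];            // dot product uses the old v1[i]
;           v1[i] += (int16_t)(mul * v3[i]);   // in-place multiply-accumulate
;       }
;       return sum;
;   }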

INIT_MMX
SCALARPRODUCT mmx2
INIT_XMM
SCALARPRODUCT sse2

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%ifdef ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro
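
; Note on the loops above: the SSSE3 caller below aligns v2/v3 down to a
; 16-byte boundary, so for a misalignment of %1 bytes each pair of aligned
; loads is stitched back into the original unaligned data with palignr
; (which shifts the 32-byte concatenation dst:src right by %1 bytes and
; keeps the low 16 bytes).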

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16_ssse3, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
    SCALARPRODUCT_LOOP 14
    SCALARPRODUCT_LOOP 12
    SCALARPRODUCT_LOOP 10
    SCALARPRODUCT_LOOP 8
    SCALARPRODUCT_LOOP 6
    SCALARPRODUCT_LOOP 4
    SCALARPRODUCT_LOOP 2
    SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

; void add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
cglobal add_hfyu_median_prediction_mmx2, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET
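
; A rough C model of the function above (a sketch, not FFmpeg's C version
; verbatim; all byte arithmetic wraps mod 256, matching paddb/psubb, and the
; SIMD loop handles 8 pixels per iteration):
;
;   static int mid_pred(int a, int b, int c)
;   {
;       int mn = a < b ? a : b, mx = a < b ? b : a;
;       return c < mn ? mn : c > mx ? mx : c;     // median of the three
;   }
;
;   void add_hfyu_median_prediction(uint8_t *dst, const uint8_t *top,
;                                   const uint8_t *diff, int w,
;                                   int *left, int *left_top)
;   {
;       uint8_t l = *left, tl = *left_top;
;       for (int i = 0; i < w; i++) {
;           l      = mid_pred(l, top[i], (uint8_t)(l + top[i] - tl)) + diff[i];
;           tl     = top[i];
;           dst[i] = l;
;       }
;       *left     = l;
;       *left_top = tl;
;   }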

%macro ADD_HFYU_LEFT_LOOP 1 ; %1 = is_aligned
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
    mova m1, [srcq+wq]
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX
cglobal add_hfyu_left_prediction_ssse3, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1

INIT_XMM
cglobal add_hfyu_left_prediction_sse4, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz add_hfyu_left_prediction_ssse3.skip_prologue
    test dstq, 15
    jnz .unaligned
    ADD_HFYU_LEFT_LOOP 1
.unaligned:
    ADD_HFYU_LEFT_LOOP 0
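
; A rough C model of add_hfyu_left_prediction (a sketch; byte sums wrap
; mod 256, and the return value is the final running total, i.e. the new
; "left" sample; the SIMD versions compute the prefix sum blockwise and
; carry the last byte across blocks in m0):
;
;   int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
;   {
;       uint8_t acc = left;
;       for (int i = 0; i < w; i++) {
;           acc   += src[i];      // running prefix sum of the residuals
;           dst[i] = acc;
;       }
;       return acc;
;   }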

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
cglobal scalarproduct_float_sse, 3,3,2, v1, v2, offset
    neg offsetq
    shl offsetq, 2
    sub v1q, offsetq
    sub v2q, offsetq
    xorps xmm0, xmm0
.loop:
    movaps xmm1, [v1q+offsetq]
    mulps xmm1, [v2q+offsetq]
    addps xmm0, xmm1
    add offsetq, 16
    js .loop
    movhlps xmm1, xmm0
    addps xmm0, xmm1
    movss xmm1, xmm0
    shufps xmm0, xmm0, 1
    addss xmm0, xmm1
%ifndef ARCH_X86_64
    movd r0m, xmm0
    fld dword r0m
%endif
    RET
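
; A rough C model of the function above (the SSE loop handles 4 floats per
; iteration and keeps 4 partial sums, so rounding can differ slightly from
; this scalar version):
;
;   float scalarproduct_float(const float *v1, const float *v2, int len)
;   {
;       float sum = 0.0f;
;       for (int i = 0; i < len; i++)
;           sum += v1[i] * v2[i];
;       return sum;
;   }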

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width.
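
; Roughly, the whole of emu_edge_core behaves like the following C sketch
; (C99, <string.h> assumed; it also assumes, as the C wrapper around this
; routine does, that buf and src already point at column start_x when the
; vertical part runs):
;
;   void emu_edge_core(uint8_t *buf, const uint8_t *src, ptrdiff_t linesize,
;                      int start_y, int end_y, int block_h,
;                      int start_x, int end_x, int block_w)
;   {
;       int w = end_x - start_x;
;       for (int y = 0; y < start_y; y++, buf += linesize)      // top extend:
;           memcpy(buf, src, w);                                //   repeat first row
;       for (int y = start_y; y < end_y; y++, buf += linesize, src += linesize)
;           memcpy(buf, src, w);                                // body copy
;       for (int y = end_y; y < block_h; y++, buf += linesize)  // bottom extend:
;           memcpy(buf, src - linesize, w);                     //   repeat last row
;       buf -= block_h * linesize + start_x;                    // back to row 0, col 0
;       for (int y = 0; y < block_h; y++, buf += linesize) {
;           memset(buf, buf[start_x], start_x);                      // left extend
;           memset(buf + end_x, buf[end_x - 1], block_w - end_x);    // right extend
;       }
;   }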

%macro EMU_EDGE_FUNC 1
%ifdef ARCH_X86_64
%define w_reg r10
cglobal emu_edge_core_%1, 6, 7, 1
    mov r11, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core_%1, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif
; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%ifdef ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%ifdef ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:
; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%ifdef ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r11
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
    ; FIXME we can do an if size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
; r0=buf+block_h*linesize,r10(64)/r6(32)=start_x offset for funcs
; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg
; now r3(64)/r0(32)=buf,r2=linesize,r11/r5=block_h,r6/r3=val, r10/r6=end_x, r1=block_w
.right_extend:
%ifdef ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
    ; FIXME we can do an if size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%ifdef ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r10w
%define valw3 r3w
%define vald  eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define vald  ebx
%define stack_offset 0x14
%endif
%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r10/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4) fills 4 bytes into ebx
;            - else if (%2 & 4) fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3) fills 2 bytes into r6, and 1 into ebx
;            - else             fills remaining bytes into ebx
; writing data out is in the same way
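
; For example, on x86-64 "READ_NUM_BYTES body, 22, sse" expands to roughly
;   movdqu xmm0, [r1]        ; bytes  0..15
;   movd   mm0,  [r1+16]     ; bytes 16..19
;   mov    ax,   [r1+20]     ; bytes 20..21
; and WRITE_NUM_BYTES with the same arguments emits the matching stores.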

%macro READ_NUM_BYTES 3
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
%ifnidn %3, mmx
%rep %2/16
    movdqu xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif ; !mmx
%ifdef ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64
%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4
%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%else ; %1 != top
    mov valw3, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 3
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
%ifnidn %3, mmx
%rep %2/16
    movdqu [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif
%ifdef ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64
%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8
%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4
%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%else ; %1 != top
    mov [r0+%%dst_off], valw3
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
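
; Each of the 22 fixed-width variants below starts on a 128-byte boundary, so
; the dispatcher in EMU_EDGE_FUNC can reach the variant for width w at
; .emuedge_v_extend_1 + (w - 1) * 128 ("sal w_reg, 7" computes w * 128 and the
; lea subtracts one slot). This only holds as long as every variant, loop
; included, fits within its 128-byte slot, as noted above.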
%macro VERTICAL_EXTEND 1
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%ifdef ARCH_X86_64
    test r3, r3                                 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop      ;   goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n, %1                 ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop:           ; do {
    WRITE_NUM_BYTES top, %%n, %1                ;   write bytes
    add r0, r2                                  ;   dst += linesize
%ifdef ARCH_X86_64
    dec r3
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop    ; } while (--start_y)
    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop:            ; do {
    READ_NUM_BYTES body, %%n, %1                ;   read bytes
    WRITE_NUM_BYTES body, %%n, %1               ;   write bytes
    add r0, r2                                  ;   dst += linesize
    add r1, r2                                  ;   src += linesize
    dec r4
    jnz .emuedge_copy_body_ %+ %%n %+ _loop     ; } while (--end_y)
    ; copy bottom pixels
    test r5, r5                                 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n            ;   goto end
    sub r1, r2                                  ; src -= linesize
    READ_NUM_BYTES bottom, %%n, %1              ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop:        ; do {
    WRITE_NUM_BYTES bottom, %%n, %1             ;   write bytes
    add r0, r2                                  ;   dst += linesize
    dec r5
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)
.emuedge_v_extend_end_ %+ %%n:
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
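
; For example, for an edge pixel p, READ_V_PIXEL first builds p*0x0101 in the
; 16-bit val register (mov vall/valh) and, when n_pixels >= 8, broadcasts that
; word so that mm0 = 0x0101010101010101 * p, i.e. p in all 8 bytes.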

%macro READ_V_PIXEL 3
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%ifidn %3, mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%else ; !mmx
    pshufw mm0, mm0, 0
%endif ; mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off], valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1&2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n:        ; do {
    sub r0, r2                       ;   dst -= linesize
    READ_V_PIXEL %%n, [r0+r1], %1    ;   read pixels
    WRITE_V_PIXEL %%n, r0            ;   write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r11/r5=block_h, r10/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 1
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n:         ; do {
%ifdef ARCH_X86_64
    sub r3, r2                         ;   dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1], %1 ;   read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n       ;   write pixels
    dec r11
%else ; ARCH_X86_32
    sub r0, r2                         ;   dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1], %1 ;   read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n       ;   write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n  ; } while (--block_h)
%ifdef ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep

%ifdef ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions; these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
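
; A rough C model of one V_COPY_ROW body invocation in the SSE build (a
; sketch, <string.h> assumed; the MMX build tops out at 8-byte chunks, and
; the top/bottom cases reuse a single source row instead of advancing src):
;
;   static void v_copy_rows(uint8_t *dst, const uint8_t *src,
;                           ptrdiff_t linesize, int w, int rows)
;   {
;       while (rows--) {
;           int n = 0;
;           while (w - n >= 16) { memcpy(dst + n, src + n, 16); n += 16; }
;           if (w - n >= 8)     { memcpy(dst + n, src + n, 8);  n += 8;  }
;           if (w - n >= 4)     { memcpy(dst + n, src + n, 4);  n += 4;  }
;           if (w - n >= 2)     { memcpy(dst + n, src + n, 2);  n += 2;  }
;           if (w - n)          {        dst[n] = src[n];                }
;           dst += linesize;
;           src += linesize;
;       }
;   }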

%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 3
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%ifidn %3, mmx
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; !mmx
    V_COPY_NPX %1, xmm0, movdqu, 16, 0xFFFFFFF0
%ifdef ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax, mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; mmx
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 1
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r11(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r10(64)/r6(32)=w=end_x-start_x
%ifdef ARCH_X86_64
    push r11           ; save old value of block_h
    test r3, r3
%define cnt_reg r11
    jz .do_body_copy   ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3, %1
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy   ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m, %1
%endif
.do_body_copy:
    V_COPY_ROW body, r4, %1
%ifdef ARCH_X86_64
    pop r11            ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%ifdef ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5, %1
%ifdef ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 1
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r10/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg], %1
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%ifdef ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 1
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r11(64)/r5(32)=block_h,
; r10(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%ifdef ARCH_X86_64
%define buf_reg r3
%define bh_reg r11
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1], %1
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
EMU_EDGE_FUNC %1
VERTICAL_EXTEND %1
LEFT_EXTEND %1
RIGHT_EXTEND %1
SLOW_V_EXTEND %1
SLOW_LEFT_EXTEND %1
SLOW_RIGHT_EXTEND %1
%endmacro

emu_edge sse
%ifdef ARCH_X86_32
emu_edge mmx
%endif