;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; extern void ff_emu_edge_core(uint8_t *buf, const uint8_t *src, x86_reg linesize,
;                              x86_reg start_y, x86_reg end_y, x86_reg block_h,
;                              x86_reg start_x, x86_reg end_x, x86_reg block_w);
;
; The actual function itself is below. It basically wraps a very simple
; w = end_x - start_x
; if (w) {
;     if (w > 22) {
;         jump to the slow loop functions
;     } else {
;         jump to the fast loop functions
;     }
; }
;
; ... and then the same for left/right extend also. See below for loop
; function implementations. Fast are fixed-width, slow is variable-width
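;
; As a rough, illustrative pseudocode sketch, the left/right dispatch below
; mirrors the vertical one (slow_left/fast_left etc. merely stand in for the
; loop labels further down):
;     w = start_x;                          // left extend
;     if (w) { if (w > 22) slow_left(); else fast_left[w](); }
;     w = block_w - end_x;                  // right extend
;     if (w) { if (w > 22) slow_right(); else fast_right[w](); }
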
%macro EMU_EDGE_FUNC 0
%if ARCH_X86_64
%define w_reg r7
cglobal emu_edge_core, 6, 9, 1
    mov r8, r5 ; save block_h
%else
%define w_reg r6
cglobal emu_edge_core, 2, 7, 0
    mov r4, r4m ; end_y
    mov r5, r5m ; block_h
%endif

    ; start with vertical extend (top/bottom) and body pixel copy
    mov w_reg, r7m
    sub w_reg, r6m ; w = end_x - start_x
    sub r5, r4
%if ARCH_X86_64
    sub r4, r3
%else
    sub r4, dword r3m
%endif
    cmp w_reg, 22
    jg .slow_v_extend_loop
%if ARCH_X86_32
    mov r2, r2m ; linesize
%endif
    sal w_reg, 7 ; w * 128
%ifdef PIC
    lea rax, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_v_extend_1 - (.emuedge_v_extend_2 - .emuedge_v_extend_1)+w_reg]
%endif
    call w_reg ; fast top extend, body copy and bottom extend
.v_extend_end:

    ; horizontal extend (left/right)
    mov w_reg, r6m ; start_x
    sub r0, w_reg
%if ARCH_X86_64
    mov r3, r0 ; backup of buf+block_h*linesize
    mov r5, r8
%else
    mov r0m, r0 ; backup of buf+block_h*linesize
    mov r5, r5m
%endif
    test w_reg, w_reg
    jz .right_extend
    cmp w_reg, 22
    jg .slow_left_extend_loop
    mov r1, w_reg
    dec w_reg
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar w_reg, 1
    sal w_reg, 6
    ; r0=buf+block_h*linesize,r7(64)/r6(32)=start_x offset for funcs
    ; r6(rax)/r3(ebx)=val,r2=linesize,r1=start_x,r5=block_h
%ifdef PIC
    lea rax, [.emuedge_extend_left_2]
    add w_reg, rax
%else
    lea w_reg, [.emuedge_extend_left_2+w_reg]
%endif
    call w_reg

    ; now r3(64)/r0(32)=buf,r2=linesize,r8/r5=block_h,r6/r3=val, r7/r6=end_x, r1=block_w
.right_extend:
%if ARCH_X86_32
    mov r0, r0m
    mov r5, r5m
%endif
    mov w_reg, r7m ; end_x
    mov r1, r8m ; block_w
    mov r4, r1
    sub r1, w_reg
    jz .h_extend_end ; if (end_x == block_w) goto h_extend_end
    cmp r1, 22
    jg .slow_right_extend_loop
    dec r1
    ; FIXME we can do a if size == 1 here if that makes any speed difference, test me
    sar r1, 1
    sal r1, 6
%ifdef PIC
    lea rax, [.emuedge_extend_right_2]
    add r1, rax
%else
    lea r1, [.emuedge_extend_right_2+r1]
%endif
    call r1
.h_extend_end:
    RET

%if ARCH_X86_64
%define vall  al
%define valh  ah
%define valw  ax
%define valw2 r7w
%define valw3 r3w
%if WIN64
%define valw4 r7w
%else ; unix64
%define valw4 r3w
%endif
%define vald eax
%else
%define vall  bl
%define valh  bh
%define valw  bx
%define valw2 r6w
%define valw3 valw2
%define valw4 valw3
%define vald ebx
%define stack_offset 0x14
%endif
%endmacro

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on x86-64, - fills xmm0-15 for consecutive sets of 16 pixels
;            - if (%2 & 15 == 8) fills the last 8 bytes into rax
;            - else if (%2 & 8)  fills 8 bytes into mm0
;            - if (%2 & 7 == 4)  fills the last 4 bytes into rax
;            - else if (%2 & 4)  fills 4 bytes into mm0-1
;            - if (%2 & 3 == 3)  fills 2 bytes into r7/r3, and 1 into eax
;              (note that we're using r3 for body/bottom because it's a shorter
;               opcode, and then the loop fits in 128 bytes)
;            - else              fills remaining bytes into rax
; on x86-32, - fills mm0-7 for consecutive sets of 8 pixels
;            - if (%2 & 7 == 4)  fills 4 bytes into ebx
;            - else if (%2 & 4)  fills 4 bytes into mm0-7
;            - if (%2 & 3 == 3)  fills 2 bytes into r6, and 1 into ebx
;            - else              fills remaining bytes into ebx
; writing data out is in the same way
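;
; As a concrete, illustrative example (derived from the rules above), on
; x86-64 with sse enabled, READ_NUM_BYTES body, 22 comes out as roughly:
;     movups xmm0, [r1+ 0]     ; 16 bytes
;     movd   mm0,  [r1+16]     ;  4 bytes
;     mov    valw, [r1+20]     ;  2 bytes
; and WRITE_NUM_BYTES body, 22 emits the matching stores in the same order.
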
%macro READ_NUM_BYTES 2
%assign %%src_off 0 ; offset in source buffer
%assign %%smidx   0 ; mmx register idx
%assign %%sxidx   0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
    movups xmm %+ %%sxidx, [r1+%%src_off]
%assign %%src_off %%src_off+16
%assign %%sxidx   %%sxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%src_off) == 8
    mov rax, [r1+%%src_off]
%assign %%src_off %%src_off+8
%endif ; (%2-%%src_off) == 8
%endif ; x86-64

%rep (%2-%%src_off)/8
    movq mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+8
%assign %%smidx   %%smidx+1
%endrep ; (%2-%%src_off)/8
%if (%2-%%src_off) == 4
    mov vald, [r1+%%src_off]
%elif (%2-%%src_off) & 4
    movd mm %+ %%smidx, [r1+%%src_off]
%assign %%src_off %%src_off+4
%endif ; (%2-%%src_off) ==/& 4

%if (%2-%%src_off) == 1
    mov vall, [r1+%%src_off]
%elif (%2-%%src_off) == 2
    mov valw, [r1+%%src_off]
%elif (%2-%%src_off) == 3
%ifidn %1, top
    mov valw2, [r1+%%src_off]
%elifidn %1, body
    mov valw3, [r1+%%src_off]
%elifidn %1, bottom
    mov valw4, [r1+%%src_off]
%endif ; %1 ==/!= top
    mov vall, [r1+%%src_off+2]
%endif ; (%2-%%src_off) == 1/2/3
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%dst_off 0 ; offset in destination buffer
%assign %%dmidx   0 ; mmx register idx
%assign %%dxidx   0 ; xmm register idx
%if cpuflag(sse)
%rep %2/16
    movups [r0+%%dst_off], xmm %+ %%dxidx
%assign %%dst_off %%dst_off+16
%assign %%dxidx   %%dxidx+1
%endrep ; %2/16
%endif

%if ARCH_X86_64
%if (%2-%%dst_off) == 8
    mov [r0+%%dst_off], rax
%assign %%dst_off %%dst_off+8
%endif ; (%2-%%dst_off) == 8
%endif ; x86-64

%rep (%2-%%dst_off)/8
    movq [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+8
%assign %%dmidx   %%dmidx+1
%endrep ; (%2-%%dst_off)/8

%if (%2-%%dst_off) == 4
    mov [r0+%%dst_off], vald
%elif (%2-%%dst_off) & 4
    movd [r0+%%dst_off], mm %+ %%dmidx
%assign %%dst_off %%dst_off+4
%endif ; (%2-%%dst_off) ==/& 4

%if (%2-%%dst_off) == 1
    mov [r0+%%dst_off], vall
%elif (%2-%%dst_off) == 2
    mov [r0+%%dst_off], valw
%elif (%2-%%dst_off) == 3
%ifidn %1, top
    mov [r0+%%dst_off], valw2
%elifidn %1, body
    mov [r0+%%dst_off], valw3
%elifidn %1, bottom
    mov [r0+%%dst_off], valw4
%endif ; %1 ==/!= top
    mov [r0+%%dst_off+2], vall
%endif ; (%2-%%dst_off) == 1/2/3
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
; r0=buf,r1=src,r2=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r6(eax/64)/r3(ebx/32)=val_reg
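;
; In C terms, each generated .emuedge_v_extend_N entry behaves roughly like
; this illustrative sketch (N is the fixed byte width per line; write_N stands
; in for the READ_NUM_BYTES/WRITE_NUM_BYTES pair, and top_row/bottom_row for
; the first/last source line):
;     for (y = 0;       y < start_y; y++) { write_N(dst, top_row);    dst += linesize; }
;     for (y = start_y; y < end_y;   y++) { write_N(dst, src);        dst += linesize; src += linesize; }
;     for (y = end_y;   y < block_h; y++) { write_N(dst, bottom_row); dst += linesize; }
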
%macro VERTICAL_EXTEND 0
%assign %%n 1
%rep 22
ALIGN 128
.emuedge_v_extend_ %+ %%n:
    ; extend pixels above body
%if ARCH_X86_64
    test r3, r3 ; if (!start_y)
    jz .emuedge_copy_body_ %+ %%n %+ _loop ; goto body
%else ; ARCH_X86_32
    cmp dword r3m, 0
    je .emuedge_copy_body_ %+ %%n %+ _loop
%endif ; ARCH_X86_64/32
    READ_NUM_BYTES top, %%n ; read bytes
.emuedge_extend_top_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES top, %%n ; write bytes
    add r0, r2 ; dst += linesize
%if ARCH_X86_64
    dec r3d
%else ; ARCH_X86_32
    dec dword r3m
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_top_ %+ %%n %+ _loop ; } while (--start_y)

    ; copy body pixels
.emuedge_copy_body_ %+ %%n %+ _loop: ; do {
    READ_NUM_BYTES body, %%n ; read bytes
    WRITE_NUM_BYTES body, %%n ; write bytes
    add r0, r2 ; dst += linesize
    add r1, r2 ; src += linesize
    dec r4d
    jnz .emuedge_copy_body_ %+ %%n %+ _loop ; } while (--end_y)

    ; copy bottom pixels
    test r5, r5 ; if (!block_h)
    jz .emuedge_v_extend_end_ %+ %%n ; goto end
    sub r1, r2 ; src -= linesize
    READ_NUM_BYTES bottom, %%n ; read bytes
.emuedge_extend_bottom_ %+ %%n %+ _loop: ; do {
    WRITE_NUM_BYTES bottom, %%n ; write bytes
    add r0, r2 ; dst += linesize
    dec r5d
    jnz .emuedge_extend_bottom_ %+ %%n %+ _loop ; } while (--block_h)

.emuedge_v_extend_end_ %+ %%n:
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+1
%endrep
%endmacro ; VERTICAL_EXTEND

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
; for reading, pixels are placed in eax (x86-64) or ebx (x86-32) in the
; lowest two bytes of the register (so val*0x0101), and are splatted
; into each byte of mm0 as well if n_pixels >= 8
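;
; As a rough illustration, READ_V_PIXEL 8, src expands to something like this
; on x86-64 with mmxext ("src" here is just whatever memory operand is passed):
;     mov    al, [src]      ; load the edge pixel
;     mov    ah, al         ; duplicate it (val * 0x0101 in ax)
;     movd   mm0, eax
;     pshufw mm0, mm0, 0    ; broadcast the word, filling all 8 bytes of mm0
; so that WRITE_V_PIXEL can then store 8 replicated pixels per movq.
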
%macro READ_V_PIXEL 2
    mov vall, %2
    mov valh, vall
%if %1 >= 8
    movd mm0, vald
%if cpuflag(mmxext)
    pshufw mm0, mm0, 0
%else ; mmx
    punpcklwd mm0, mm0
    punpckldq mm0, mm0
%endif ; mmxext/mmx
%endif ; %1 >= 8
%endmacro

%macro WRITE_V_PIXEL 2
%assign %%dst_off 0
%rep %1/8
    movq [%2+%%dst_off], mm0
%assign %%dst_off %%dst_off+8
%endrep
%if %1 & 4
%if %1 >= 8
    movd [%2+%%dst_off], mm0
%else ; %1 < 8
    mov [%2+%%dst_off], valw
    mov [%2+%%dst_off+2], valw
%endif ; %1 >=/< 8
%assign %%dst_off %%dst_off+4
%endif ; %1 & 4
%if %1 & 2
    mov [%2+%%dst_off], valw
%endif ; %1 & 2
%endmacro

; r0=buf+block_h*linesize, r1=start_x, r2=linesize, r5=block_h, r6/r3=val
%macro LEFT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_left_ %+ %%n: ; do {
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+r1] ; read pixels
    WRITE_V_PIXEL %%n, r0 ; write pixels
    dec r5
    jnz .emuedge_extend_left_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%endmacro ; LEFT_EXTEND

; r3/r0=buf+block_h*linesize, r2=linesize, r8/r5=block_h, r0/r6=end_x, r6/r3=val
%macro RIGHT_EXTEND 0
%assign %%n 2
%rep 11
ALIGN 64
.emuedge_extend_right_ %+ %%n: ; do {
%if ARCH_X86_64
    sub r3, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r3+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r3+r4-%%n ; write pixels
    dec r8
%else ; ARCH_X86_32
    sub r0, r2 ; dst -= linesize
    READ_V_PIXEL %%n, [r0+w_reg-1] ; read pixels
    WRITE_V_PIXEL %%n, r0+r4-%%n ; write pixels
    dec r5
%endif ; ARCH_X86_64/32
    jnz .emuedge_extend_right_ %+ %%n ; } while (--block_h)
%if ARCH_X86_64
    ret
%else ; ARCH_X86_32
    rep ret
%endif ; ARCH_X86_64/32
%assign %%n %%n+2
%endrep
%if ARCH_X86_32
%define stack_offset 0x10
%endif
%endmacro ; RIGHT_EXTEND

; below follow the "slow" copy/extend functions, these act on a non-fixed
; width specified in a register, and run a loop to copy the full amount
; of bytes. They are optimized for copying of large amounts of pixels per
; line, so they unconditionally splat data into mm registers to copy 8
; bytes per loop iteration. It could be considered to use xmm for x86-64
; also, but I haven't optimized this as much (i.e. FIXME)
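;
; Roughly, each row copied by the slow path is equivalent to the following
; illustrative sketch (copyN are hypothetical helpers standing in for the
; movups/movq/mov pairs emitted by V_COPY_NPX; the non-sse path uses an
; 8-byte mmx loop instead of the 16-byte one):
;     x = 0;
;     while (w >= 16) { copy16(dst + x, src + x); x += 16; w -= 16; }
;     if (w & 8) { copy8(dst + x, src + x); x += 8; }
;     if (w & 4) { copy4(dst + x, src + x); x += 4; }
;     if (w & 2) { copy2(dst + x, src + x); x += 2; }
;     if (w & 1) { copy1(dst + x, src + x); }
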
%macro V_COPY_NPX 4-5
%if %0 == 4
    test w_reg, %4
    jz .%1_skip_%4_px
%else ; %0 == 5
.%1_%4_px_loop:
%endif
    %3 %2, [r1+cnt_reg]
    %3 [r0+cnt_reg], %2
    add cnt_reg, %4
%if %0 == 5
    sub w_reg, %4
    test w_reg, %5
    jnz .%1_%4_px_loop
%endif
.%1_skip_%4_px:
%endmacro

%macro V_COPY_ROW 2
%ifidn %1, bottom
    sub r1, linesize
%endif
.%1_copy_loop:
    xor cnt_reg, cnt_reg
%if notcpuflag(sse)
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8, 0xFFFFFFF8
%else ; sse
    V_COPY_NPX %1, xmm0, movups, 16, 0xFFFFFFF0
%if ARCH_X86_64
%define linesize r2
    V_COPY_NPX %1, rax, mov, 8
%else ; ARCH_X86_32
%define linesize r2m
    V_COPY_NPX %1, mm0, movq, 8
%endif ; ARCH_X86_64/32
%endif ; sse
    V_COPY_NPX %1, vald, mov, 4
    V_COPY_NPX %1, valw, mov, 2
    V_COPY_NPX %1, vall, mov, 1
    mov w_reg, cnt_reg
%ifidn %1, body
    add r1, linesize
%endif
    add r0, linesize
    dec %2
    jnz .%1_copy_loop
%endmacro

%macro SLOW_V_EXTEND 0
.slow_v_extend_loop:
; r0=buf,r1=src,r2(64)/r2m(32)=linesize,r3(64)/r3m(32)=start_x,r4=end_y,r5=block_h
; r8(64)/r3(later-64)/r2(32)=cnt_reg,r6(64)/r3(32)=val_reg,r7(64)/r6(32)=w=end_x-start_x
%if ARCH_X86_64
    push r8 ; save old value of block_h
    test r3, r3
%define cnt_reg r8
    jz .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, r3
%else
    cmp dword r3m, 0
%define cnt_reg r2
    je .do_body_copy ; if (!start_y) goto do_body_copy
    V_COPY_ROW top, dword r3m
%endif
.do_body_copy:
    V_COPY_ROW body, r4
%if ARCH_X86_64
    pop r8 ; restore old value of block_h
%define cnt_reg r3
%endif
    test r5, r5
%if ARCH_X86_64
    jz .v_extend_end
%else
    jz .skip_bottom_extend
%endif
    V_COPY_ROW bottom, r5
%if ARCH_X86_32
.skip_bottom_extend:
    mov r2, r2m
%endif
    jmp .v_extend_end
%endmacro

%macro SLOW_LEFT_EXTEND 0
.slow_left_extend_loop:
; r0=buf+block_h*linesize,r2=linesize,r6(64)/r3(32)=val,r5=block_h,r4=cntr,r7/r6=start_x
    mov r4, 8
    sub r0, linesize
    READ_V_PIXEL 8, [r0+w_reg]
.left_extend_8px_loop:
    movq [r0+r4-8], mm0
    add r4, 8
    cmp r4, w_reg
    jle .left_extend_8px_loop
    sub r4, 8
    cmp r4, w_reg
    jge .left_extend_loop_end
.left_extend_2px_loop:
    mov [r0+r4], valw
    add r4, 2
    cmp r4, w_reg
    jl .left_extend_2px_loop
.left_extend_loop_end:
    dec r5
    jnz .slow_left_extend_loop
%if ARCH_X86_32
    mov r2, r2m
%endif
    jmp .right_extend
%endmacro

%macro SLOW_RIGHT_EXTEND 0
.slow_right_extend_loop:
; r3(64)/r0(32)=buf+block_h*linesize,r2=linesize,r4=block_w,r8(64)/r5(32)=block_h,
; r7(64)/r6(32)=end_x,r6/r3=val,r1=cntr
%if ARCH_X86_64
%define buf_reg r3
%define bh_reg r8
%else
%define buf_reg r0
%define bh_reg r5
%endif
    lea r1, [r4-8]
    sub buf_reg, linesize
    READ_V_PIXEL 8, [buf_reg+w_reg-1]
.right_extend_8px_loop:
    movq [buf_reg+r1], mm0
    sub r1, 8
    cmp r1, w_reg
    jge .right_extend_8px_loop
    add r1, 8
    cmp r1, w_reg
    je .right_extend_loop_end
.right_extend_2px_loop:
    sub r1, 2
    mov [buf_reg+r1], valw
    cmp r1, w_reg
    jg .right_extend_2px_loop
.right_extend_loop_end:
    dec bh_reg
    jnz .slow_right_extend_loop
    jmp .h_extend_end
%endmacro

%macro emu_edge 1
INIT_XMM %1
EMU_EDGE_FUNC
VERTICAL_EXTEND
LEFT_EXTEND
RIGHT_EXTEND
SLOW_V_EXTEND
SLOW_LEFT_EXTEND
SLOW_RIGHT_EXTEND
%endmacro

emu_edge sse
%if ARCH_X86_32
emu_edge mmx
%endif

%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    %1 [bufq]
    add bufq, strideq
    dec hd
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif