;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text
; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data

%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                 ; do {
    mov   wq, r7mp                          ;   initialize w (r7mp = wmp)
.%1_x_loop:                                 ;   do {
    movu  m0, [srcq+wq]                     ;     m0 = read($mmsize)
    movu  [dstq+wq], m0                     ;     write(m0, $mmsize)
    add   wq, mmsize                        ;     w -= $mmsize
    cmp   wq, -mmsize                       ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu  m0, [srcq-mmsize]                 ;     m0 = read($mmsize)
    movu  [dstq-mmsize], m0                 ;     write(m0, $mmsize)
%ifidn %1, body                             ;   if ($type == body) {
    add   srcq, src_strideq                 ;     src += src_stride
%endif                                      ;   }
    add   dstq, dst_strideq                 ;   dst += dst_stride
    dec   %2                                ; } while (--$h);
    jnz .%1_y_loop
%endmacro
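
; Roughly, one V_COPY_ROW invocation behaves like the C sketch below (a reading
; aid only, not FFmpeg's reference C code; "chunk" stands for mmsize, i.e. 16
; bytes for SSE or 8 for MMX, and this path is only used for widths of at least
; one chunk):
;
;   for (int y = 0; y < h; y++) {
;       int x;
;       for (x = 0; x < w - chunk; x += chunk)
;           memcpy(dst + x, src + x, chunk);             // whole chunks
;       memcpy(dst + w - chunk, src + w - chunk, chunk); // overlapping tail
;       if (type == BODY)
;           src += src_stride;    // top/bottom keep re-reading the same row
;       dst += dst_stride;
;   }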

%macro vvar_fn 0
; .----.          <- zero
; |    |          <- top is copied from first line in body of source
; |----|          <- start_y
; |    |          <- body is copied verbatim (line-by-line) from source
; |----|          <- end_y
; |    |          <- bottom is copied from last line in body of source
; '----'          <- bh
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
    mov   srcq, r2mp
    mov   start_yq, r4mp
    mov   end_yq, r5mp
    mov   bhq, r6mp
%endif
    sub   bhq, end_yq                       ; bh -= end_y
    sub   end_yq, start_yq                  ; end_y -= start_y
    add   srcq, r7mp                        ; (r7mp = wmp)
    add   dstq, r7mp                        ; (r7mp = wmp)
    neg   r7mp                              ; (r7mp = wmp)
    test  start_yq, start_yq                ; if (start_y) {
    jz .body
    V_COPY_ROW top, start_yq                ;   v_copy_row(top, start_y)
.body:                                      ; }
    V_COPY_ROW body, end_yq                 ; v_copy_row(body, end_y)
    test  bhq, bhq                          ; if (bh) {
    jz .end
    sub   srcq, src_strideq                 ;   src -= src_stride
    V_COPY_ROW bottom, bhq                  ;   v_copy_row(bottom, bh)
.end:                                       ; }
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif

INIT_XMM sse
vvar_fn
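
; Rough C model of emu_edge_vvar, following the diagram above (a sketch for
; readers, not the actual C fallback; dst points at the top of the block and
; src at the first body row):
;
;   void emu_edge_vvar(uint8_t *dst, ptrdiff_t dst_stride,
;                      const uint8_t *src, ptrdiff_t src_stride,
;                      int start_y, int end_y, int bh, int w)
;   {
;       int y;
;       for (y = 0; y < start_y; y++, dst += dst_stride)
;           memcpy(dst, src, w);       // top: replicate first body row
;       for (; y < end_y; y++, dst += dst_stride, src += src_stride)
;           memcpy(dst, src, w);       // body: copy rows verbatim
;       src -= src_stride;             // step back to the last body row
;       for (; y < bh; y++, dst += dst_stride)
;           memcpy(dst, src, w);       // bottom: replicate last body row
;   }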

%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 2, dst, dst_stride, start_x, n_words, h, w
    lea   dstq, [dstq+n_wordsq*2]
    neg   n_wordsq
    lea   start_xq, [start_xq+n_wordsq*2]
.y_loop:                                    ; do {
    mov   wq, n_wordsq                      ;   initialize w
    SPLATB_LOAD m0, dstq+start_xq, m1       ;   read(1); splat
.x_loop:                                    ;   do {
    movu  [dstq+wq*2], m0                   ;     write($reg, $mmsize)
    add   wq, mmsize/2                      ;     w -= $mmsize/2
    cmp   wq, -mmsize/2                     ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                 ;     write($reg, $mmsize)
    add   dstq, dst_strideq                 ;   dst += dst_stride
    dec   hq                                ; } while (h--)
    jnz .y_loop
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
hvar_fn

INIT_MMX mmxext
hvar_fn
%endif

INIT_XMM sse2
hvar_fn
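
; Rough C model of emu_edge_hvar (a sketch only; the C wrapper sets things up
; so that dst points at the first byte to fill and dst[start_x] already holds
; the edge pixel to replicate):
;
;   void emu_edge_hvar(uint8_t *dst, ptrdiff_t dst_stride,
;                      int start_x, int n_words, int h)
;   {
;       for (int y = 0; y < h; y++, dst += dst_stride)
;           memset(dst, dst[start_x], n_words * 2);  // splat one edge byte
;   }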

; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
;         - if (%2 & 8)  fills 8 bytes into xmm$next
;         - if (%2 & 4)  fills 4 bytes into xmm$next
;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
;         - if (%2 & 4)  fills 4 bytes into mm$next
;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; writing data out works the same way
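;
; For example, with SSE (mmsize == 16) the pair READ_NUM_BYTES/WRITE_NUM_BYTES
; for a 22-byte line expands to roughly the following (illustration only):
;
;   movu  xmm0, [srcq]          ; bytes 0..15
;   movq  mm0,  [srcq+14]       ; bytes 14..21 (overlapping 8-byte tail)
;   ...
;   movu  [dstq],    xmm0
;   movq  [dstq+14], mm0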

%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
    movd mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
    mov  valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov  valw, [srcq+%2-2]
%elifidn %1, body
    mov  vald, [srcq+%2-3]
%else
    movd mm %+ %%mmx_idx, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES

%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
    movd [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
    mov  [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov  [dstq+%2-2], valw
%elifidn %1, body
    mov  [dstq+%2-3], valw
    shr  vald, 16
    mov  [dstq+%2-1], valb
%else
    movd vald, mm %+ %%mmx_idx
    mov  [dstq+%2-3], valw
    shr  vald, 16
    mov  [dstq+%2-1], valb
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES

; vertical top/bottom extend and body copy fast loops
; these are the set-width line copy functions referenced via function pointers,
; i.e. they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, val, bh
    mov   bhq, r6mp                         ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov   dstq, r0mp
    mov   srcq, r2mp
    mov   start_yq, r4mp
    mov   end_yq, r5mp
    mov   bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov   srcq, r2mp
    mov   start_yq, r4mp
    mov   end_yq, r5mp
    mov   bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif

    ; FIXME move this to c wrapper?
    sub   bhq, end_yq                       ; bh -= end_y
    sub   end_yq, start_yq                  ; end_y -= start_y

    ; extend pixels above body
    test  start_yq, start_yq                ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES top, %%n                 ;   $variable_regs = read($n)
.top_loop:                                  ;   do {
    WRITE_NUM_BYTES top, %%n                ;     write($variable_regs, $n)
    add   dstq, dst_strideq                 ;     dst += linesize
    dec   start_yq                          ;   } while (--start_y)
    jnz .top_loop                           ; }

    ; copy body pixels
.body_loop:                                 ; do {
    READ_NUM_BYTES body, %%n                ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n               ;   write($variable_regs, $n)
    add   dstq, dst_strideq                 ;   dst += dst_stride
    add   srcq, src_strideq                 ;   src += src_stride
    dec   end_yq                            ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    test  bhq, bhq                          ; if (block_h) {
    jz .end
    sub   srcq, src_strideq                 ;   src -= linesize
    READ_NUM_BYTES bottom, %%n              ;   $variable_regs = read($n)
.bottom_loop:                               ;   do {
    WRITE_NUM_BYTES bottom, %%n             ;     write($variable_regs, $n)
    add   dstq, dst_strideq                 ;     dst += linesize
    dec   bhq                               ;   } while (--bh)
    jnz .bottom_loop                        ; }

.end:
    RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND

INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif

INIT_XMM sse
VERTICAL_EXTEND 16, 22
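
; The fixed-width functions generated above are selected by block width on the
; C side; a hypothetical sketch of such a function-pointer dispatch (names are
; illustrative, not the actual declarations in videodsp_init.c):
;
;   typedef void (*vfix_fn)(uint8_t *dst, ptrdiff_t dst_stride,
;                           const uint8_t *src, ptrdiff_t src_stride,
;                           int start_y, int end_y, int bh);
;   extern vfix_fn vfix_tab[22];   // vfix_tab[w - 1] handles width w
;   ...
;   if (w <= 22)
;       vfix_tab[w - 1](dst, dst_stride, src, src_stride, start_y, end_y, bh);
;   else
;       emu_edge_vvar(dst, dst_stride, src, src_stride, start_y, end_y, bh, w);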

; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because the number of pixels to extend is
; obviously not the same on both sides.

%macro READ_V_PIXEL 2
%if notcpuflag(mmxext) && %1 < 8
    movzx vald, byte [%2]
    imul  vald, 0x01010101
%else
    SPLATB_LOAD m0, %2, m1
%endif ; %1 < 8
%endmacro ; READ_V_PIXEL

%macro WRITE_V_PIXEL 2
%assign %%off 0
%if %1 >= 8

%rep %1/mmsize
    movu [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu [%2+%1-16], m0
%assign %%off %1
%else
    movq [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq [%2+%1-8], m0
%assign %%off %1
%else
    movd [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%if %1-%%off == 2
    movd [%2+%%off-2], m0
%endif ; (%1-%%off)/2

%else ; %1 < 8

%if cpuflag(mmxext)
    movd [%2+%%off], m0
%if %1 == 6
    movd [%2+%%off+2], m0
%endif ; (%1-%%off)/2
%else ; notcpuflag(mmxext)
%rep %1/4
    mov  [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4
%if %1-%%off == 2
    mov  [%2+%%off], valw
%endif ; (%1-%%off)/2
%endif ; cpuflag

%endif ; %1 >=/< 8
%endmacro ; WRITE_V_PIXEL

%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
%if %%n < 8 && notcpuflag(mmxext)
cglobal emu_edge_hfix %+ %%n, 4, 5, 2, dst, dst_stride, start_x, bh, val
%else
cglobal emu_edge_hfix %+ %%n, 4, 4, 2, dst, dst_stride, start_x, bh
%endif
.loop_y:                                    ; do {
    READ_V_PIXEL %%n, dstq+start_xq         ;   $variable_regs = read($n)
    WRITE_V_PIXEL %%n, dstq                 ;   write($variable_regs, $n)
    add   dstq, dst_strideq                 ;   dst += dst_stride
    dec   bhq                               ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND

INIT_MMX mmx
H_EXTEND 2, 2
%if ARCH_X86_32
H_EXTEND 4, 22
%endif

INIT_MMX mmxext
H_EXTEND 4, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif

INIT_XMM sse2
H_EXTEND 16, 22
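
; Rough C model of one generated emu_edge_hfix<N> function (a sketch; N is the
; fixed number of bytes to fill per row, and dst[start_x] holds the edge pixel
; to replicate, as set up by the C wrapper):
;
;   void emu_edge_hfixN(uint8_t *dst, ptrdiff_t dst_stride, int start_x, int bh)
;   {
;       for (int y = 0; y < bh; y++, dst += dst_stride)
;           memset(dst, dst[start_x], N);   // READ_V_PIXEL + WRITE_V_PIXEL
;   }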

%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
    %1 [bufq]
    add   bufq, strideq
    dec   hd
    jg .loop
    REP_RET
%endmacro
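
; Rough C equivalent of the generated prefetch function (a sketch; it issues
; one cache-prefetch hint per row for the first h rows of buf):
;
;   void prefetch(const uint8_t *buf, ptrdiff_t stride, int h)
;   {
;       do {
;           __builtin_prefetch(buf);   // prefetcht0 / 3dnow prefetch
;           buf += stride;
;       } while (--h > 0);
;   }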

INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif