;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text
; slow vertical extension loop function. Works with variable width and
; does per-line reading/writing of source data. Note that w is kept
; negated in its register, so the "w -=" pseudo-code in the comments
; tracks the positive remaining width.
%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop:                                     ; do {
    mov              wq, r7mp                   ;   initialize w (r7mp = wmp)
.%1_x_loop:                                     ;   do {
    movu             m0, [srcq+wq]              ;     m0 = read($mmsize)
    movu      [dstq+wq], m0                     ;     write(m0, $mmsize)
    add              wq, mmsize                 ;     w -= $mmsize
    cmp              wq, -mmsize                ;   } while (w > $mmsize);
    jl .%1_x_loop
    movu             m0, [srcq-mmsize]          ;   m0 = read($mmsize)
    movu  [dstq-mmsize], m0                     ;   write(m0, $mmsize)
%ifidn %1, body                                 ;   if ($type == body) {
    add            srcq, src_strideq            ;     src += src_stride
%endif                                          ;   }
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              %2                         ; } while (--$h);
    jnz .%1_y_loop
%endmacro
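; A minimal C sketch of the row-copy pattern above (illustrative only,
; assuming w >= mmsize): copy full chunks while more than one chunk
; remains, then issue a single unaligned chunk flush against the right
; edge, so the tail is handled by a harmless overlapping copy rather
; than a scalar loop.
;
;   for (x = 0; x + mmsize < w; x += mmsize)
;       memcpy(dst + x, src + x, mmsize);
;   memcpy(dst + w - mmsize, src + w - mmsize, mmsize); // overlapping tail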
%macro vvar_fn 0
; .----. <- zero
; |    | <- top is copied from first line in body of source
; |----| <- start_y
; |    | <- body is copied verbatim (line-by-line) from source
; |----| <- end_y
; |    | <- bottom is copied from last line in body of source
; '----' <- bh
; (see the C sketch after this macro)
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, src, dst_stride, src_stride, \
                                start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r2mp
    mov            srcq, r1mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%endif
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y
    add            srcq, r7mp                   ; (r7mp = wmp)
    add            dstq, r7mp                   ; (r7mp = wmp)
    neg            r7mp                         ; (r7mp = wmp)
    test       start_yq, start_yq               ; if (start_y) {
    jz .body
    V_COPY_ROW      top, start_yq               ;   v_copy_row(top, start_y)
.body:                                          ; }
    V_COPY_ROW     body, end_yq                 ; v_copy_row(body, end_y)
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride
    V_COPY_ROW   bottom, bhq                    ;   v_copy_row(bottom, bh)
.end:                                           ; }
    RET
%endmacro
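; Approximate C model of emu_edge_vvar (a sketch of the diagram above,
; not the exact upstream wrapper; copy_row() stands in for V_COPY_ROW
; and all values are the original, pre-subtraction arguments):
;
;   // src points at the first body line on entry
;   for (y = 0; y < start_y; y++) {             // top: replicate first line
;       copy_row(dst, src, w);  dst += dst_stride;
;   }
;   for (y = start_y; y < end_y; y++) {         // body: verbatim copy
;       copy_row(dst, src, w);  dst += dst_stride;  src += src_stride;
;   }
;   src -= src_stride;                          // back up to last body line
;   for (y = end_y; y < bh; y++) {              // bottom: replicate last line
;       copy_row(dst, src, w);  dst += dst_stride;
;   }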
%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif

INIT_XMM sse
vvar_fn
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
    lea            dstq, [dstq+n_wordsq*2]
    neg        n_wordsq
    lea        start_xq, [start_xq+n_wordsq*2]
.y_loop:                                        ; do {
    ; FIXME also write a ssse3 version using pshufb
    movzx            wd, byte [dstq+start_xq]   ;   w = read(1)
    imul             wd, 0x01010101             ;   w *= 0x01010101
    movd             m0, wd
    mov              wq, n_wordsq               ;   initialize w
%if cpuflag(sse2)
    pshufd           m0, m0, q0000              ;   splat
%else ; mmx
    punpckldq        m0, m0                     ;   splat
%endif ; mmx/sse
.x_loop:                                        ;   do {
    movu    [dstq+wq*2], m0                     ;     write($reg, $mmsize)
    add              wq, mmsize/2               ;     w -= $mmsize/2
    cmp              wq, -mmsize/2              ;   } while (w > $mmsize/2)
    jl .x_loop
    movu  [dstq-mmsize], m0                     ;   write($reg, $mmsize)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec              hq                         ; } while (--h)
    jnz .y_loop
    RET
%endmacro
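; Rough C equivalent of emu_edge_hvar (sketch only): replicate the edge
; pixel across the fill region, one row at a time. The imul by 0x01010101
; splats the byte into a dword, movd + pshufd/punpckldq widens it to the
; full vector register, and the stores reuse V_COPY_ROW's overlapping
; tail trick.
;
;   for (y = 0; y < h; y++) {
;       memset(dst, dst[start_x], n_words * 2); // dst[start_x] = edge pixel
;       dst += dst_stride;
;   }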
%if ARCH_X86_32
INIT_MMX mmx
hvar_fn
%endif

INIT_XMM sse2
hvar_fn
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse,  - fills xmm0-15 for consecutive sets of 16 pixels
;          - if (%2 & 8)  fills 8 bytes into xmm$next
;          - if (%2 & 4)  fills 4 bytes into xmm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; on mmx,  - fills mm0-7 for consecutive sets of 8 pixels
;          - if (%2 & 4)  fills 4 bytes into mm$next
;          - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
; writing data out works the same way
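; For example, READ_NUM_BYTES body, 22 with mmsize == 16 expands to
; (illustrative trace of the macro below):
;   movu xmm0, [srcq]       ; bytes  0..15
;   movq  mm0, [srcq+14]    ; bytes 14..21 (2-byte overlap; harmless)
; and WRITE_NUM_BYTES body, 22 emits the two matching stores.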
%macro READ_NUM_BYTES 2
%assign %%off 0     ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
    movd    mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd    mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
    mov                valb, [srcq+%2-1]
%elif (%2-%%off) == 2
    mov                valw, [srcq+%2-2]
%elifidn %1, body
    mov                vald, [srcq+%2-3]
%else
    movd    mm %+ %%mmx_idx, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
%macro WRITE_NUM_BYTES 2
%assign %%off 0     ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index

%rep %2/mmsize
%if mmsize == 16
    movu   [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
    movu   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize

%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
    movu   [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
    movq   [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif

%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
    movq    [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
    movd   [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4

%if (%2-%%off) >= 1
%if %2 >= 4
    movd    [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
    mov     [dstq+%2-1], valb
%elif (%2-%%off) == 2
    mov     [dstq+%2-2], valw
%elifidn %1, body
    mov     [dstq+%2-3], valw
    shr            vald, 16
    mov     [dstq+%2-1], valb
%else
    movd           vald, mm %+ %%mmx_idx
    mov     [dstq+%2-3], valw
    shr            vald, 16
    mov     [dstq+%2-1], valb
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
; vertical top/bottom extend and body copy fast loops
; these generate fixed-width line copy functions (reached through
; function-pointer tables in the C wrapper), i.e. they read a fixed
; number of pixels into a fixed set of registers, and write those out
; into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, src, dst_stride, src_stride, \
                                       start_y, end_y, val, bh
    mov             bhq, r6mp                   ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
    mov            dstq, r0mp
    mov            srcq, r1mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r2mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, src, dst_stride, src_stride, \
                                       start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
    mov            srcq, r1mp
    mov        start_yq, r4mp
    mov          end_yq, r5mp
    mov             bhq, r6mp
%define dst_strideq r2mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif

    ; FIXME move this to c wrapper?
    sub             bhq, end_yq                 ; bh    -= end_y
    sub          end_yq, start_yq               ; end_y -= start_y

    ; extend pixels above body
    test       start_yq, start_yq               ; if (start_y) {
    jz .body_loop
    READ_NUM_BYTES  top, %%n                    ;   $variable_regs = read($n)
.top_loop:                                      ;   do {
    WRITE_NUM_BYTES top, %%n                    ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += dst_stride
    dec        start_yq                         ;   } while (--start_y)
    jnz .top_loop                               ; }

    ; copy body pixels
.body_loop:                                     ; do {
    READ_NUM_BYTES body, %%n                    ;   $variable_regs = read($n)
    WRITE_NUM_BYTES body, %%n                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    add            srcq, src_strideq            ;   src += src_stride
    dec          end_yq                         ; } while (--end_y)
    jnz .body_loop

    ; copy bottom pixels
    test            bhq, bhq                    ; if (bh) {
    jz .end
    sub            srcq, src_strideq            ;   src -= src_stride
    READ_NUM_BYTES bottom, %%n                  ;   $variable_regs = read($n)
.bottom_loop:                                   ;   do {
    WRITE_NUM_BYTES bottom, %%n                 ;     write($variable_regs, $n)
    add            dstq, dst_strideq            ;     dst += dst_stride
    dec             bhq                         ;   } while (--bh)
    jnz .bottom_loop                            ; }

.end:
    RET

%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND
INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif

INIT_XMM sse
VERTICAL_EXTEND 16, 22
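; These generated functions are typically dispatched from C through a
; table indexed by block width. A sketch only: the prototype follows the
; cglobal signatures above, but the names and types here are assumptions,
; not the exact upstream wrapper API.
;
;   typedef void (*emu_edge_vfix_func)(uint8_t *dst, const uint8_t *src,
;                                      x86_reg dst_stride, x86_reg src_stride,
;                                      x86_reg start_y, x86_reg end_y,
;                                      x86_reg bh);
;   static emu_edge_vfix_func vfix_tab[22] = {
;       ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, /* ... */
;   };
;   vfix_tab[block_w - 1](dst, src, dst_stride, src_stride,
;                         start_y, end_y, block_h);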
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend functions above,
; just split into left/right variants because the number of pixels to
; extend generally differs on the two sides.
%macro READ_V_PIXEL 2
%if %1 == 2
    movzx          valw, byte %2
    imul           valw, 0x0101
%else
    movzx          vald, byte %2
    imul           vald, 0x01010101
%if %1 >= 8
    movd             m0, vald
%if mmsize == 16
    pshufd           m0, m0, q0000
%else
    punpckldq        m0, m0
%endif
%endif ; %1 >= 8
%endif
%endmacro ; READ_V_PIXEL
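; The multiply is a byte splat. Worked example for an edge byte 0xAB:
;   0xAB * 0x0101     = 0xABAB        (16-bit case, %1 == 2)
;   0xAB * 0x01010101 = 0xABABABAB    (32-bit case)
; movd + pshufd/punpckldq then widens 0xABABABAB to the full register.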
%macro WRITE_V_PIXEL 2
%assign %%off 0

%rep %1/mmsize
    movu     [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize

%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
    movu     [%2+%1-16], m0
%assign %%off %1
%else
    movq     [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif

%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
    movq      [%2+%1-8], m0
%assign %%off %1
%elif %1 >= 8 && %1-%%off >= 4
    movd     [%2+%%off], m0
%assign %%off %%off+4
%else
    mov      [%2+%%off], vald
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4

%if %1-%%off >= 2
%if %1 >= 8
    movd      [%2+%1-4], m0
%else
    mov      [%2+%%off], valw
%endif
%endif ; %1-%%off >= 2
%endmacro ; WRITE_V_PIXEL
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
.loop_y:                                        ; do {
    READ_V_PIXEL    %%n, [dstq+start_xq]        ;   $variable_regs = read($n)
    WRITE_V_PIXEL   %%n, dstq                   ;   write($variable_regs, $n)
    add            dstq, dst_strideq            ;   dst += dst_stride
    dec             bhq                         ; } while (--bh)
    jnz .loop_y
    RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND
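; Rough C model of one generated function, e.g. emu_edge_hfix8 (sketch
; only; the width 8 is baked in at assembly time, and dst[start_x] is
; the edge pixel being replicated into the first 8 bytes of each row):
;
;   for (y = 0; y < bh; y++) {
;       memset(dst, dst[start_x], 8);
;       dst += dst_stride;
;   }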
INIT_MMX mmx
H_EXTEND 2, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif

INIT_XMM sse2
H_EXTEND 16, 22
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:                                          ; do {
    %1 [bufq]                                   ;   prefetch this line
    add            bufq, strideq                ;   buf += stride
    dec              hd
    jg .loop                                    ; } while (--h > 0)
    REP_RET
%endmacro

INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif
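; C-level model of the prefetch loop (a sketch; the prototype is an
; assumption derived from the cglobal signature above, and
; __builtin_prefetch stands in for prefetcht0 / 3dnow prefetch):
;
;   void prefetch(uint8_t *buf, ptrdiff_t stride, int h)
;   {
;       do {
;           __builtin_prefetch(buf);
;           buf += stride;
;       } while (--h > 0);
;   }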