;******************************************************************************
;* VC1 motion compensation optimizations
;* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

cextern pw_9
cextern pw_128

SECTION .text

%if HAVE_MMX_INLINE
; XXX some of these macros are not used right now, but they will be in the
;     future when more functions are ported.

%macro OP_PUT 2 ; dst, src
%endmacro

%macro OP_AVG 2 ; dst, src
    pavgb          %1, %2
%endmacro
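
; NORMALIZE_MMX adds the bias/rounder kept in m7 to the filtered values in
; m3/m4 and arithmetic-shifts both down to their final range.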
%macro NORMALIZE_MMX 1 ; shift
    paddw          m3, m7 ; +bias-r
    paddw          m4, m7 ; +bias-r
    psraw          m3, %1
    psraw          m4, %1
%endmacro

%macro TRANSFER_DO_PACK 2 ; op, dst
    packuswb       m3, m4
    %1             m3, [%2]
    mova           [%2], m3
%endmacro

%macro TRANSFER_DONT_PACK 2 ; op, dst
    %1             m3, [%2]
    %1             m4, [%2 + mmsize]
    mova           [%2], m3
    mova           [%2 + mmsize], m4
%endmacro

; see MSPEL_FILTER13_CORE for use as UNPACK macro
%macro DO_UNPACK 1 ; reg
    punpcklbw      %1, m0
%endmacro

%macro DONT_UNPACK 1 ; reg
%endmacro

; Computes the rounder 32-r or 8-r and unpacks it to m7
%macro LOAD_ROUNDER_MMX 1 ; round
    movd           m7, %1
    punpcklwd      m7, m7
    punpckldq      m7, m7
%endmacro
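
; SHIFT2_LINE computes one row of the 4-tap (-1, 9, 9, -1) bicubic filter on
; rows already unpacked to words; in rough scalar terms, with a..d denoting
; four consecutive source rows:
;   dst[off] = (9 * (b + c) - a - d + rounder) >> shift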
%macro SHIFT2_LINE 5 ; off, r0, r1, r2, r3
    paddw          m%3, m%4
    movh           m%2, [srcq + stride_neg2]
    pmullw         m%3, m6
    punpcklbw      m%2, m0
    movh           m%5, [srcq + strideq]
    psubw          m%3, m%2
    punpcklbw      m%5, m0
    paddw          m%3, m7
    psubw          m%3, m%5
    psraw          m%3, shift
    movu           [dstq + %1], m%3
    add            srcq, strideq
%endmacro

INIT_MMX mmx
; void ff_vc1_put_ver_16b_shift2_mmx(int16_t *dst, const uint8_t *src,
;                                    x86_reg stride, int rnd, int64_t shift)
; Sacrificing m6 makes it possible to pipeline loads from src
%if ARCH_X86_32
cglobal vc1_put_ver_16b_shift2, 3,6,0, dst, src, stride
    DECLARE_REG_TMP 3, 4, 5
    %define rnd r3mp
    %define shift qword r4m
%else ; X86_64
cglobal vc1_put_ver_16b_shift2, 4,7,0, dst, src, stride
    DECLARE_REG_TMP 4, 5, 6
    %define rnd r3d
    ; We need shift either in memory or in a mm reg as it's used in psraw
    ; On WIN64, the arg is already on the stack
    ; On UNIX64, m5 doesn't seem to be used
%if WIN64
    %define shift r4mp
%else ; UNIX64
    %define shift m5
    mova           shift, r4q
%endif ; WIN64
%endif ; X86_32
%define stride_neg2 t0q
%define stride_9minus4 t1q
%define i t2q
    mov            stride_neg2, strideq
    neg            stride_neg2
    add            stride_neg2, stride_neg2
    lea            stride_9minus4, [strideq * 9 - 4]
    mov            i, 3
    LOAD_ROUNDER_MMX rnd
    mova           m6, [pw_9]
    pxor           m0, m0
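; Each iteration filters one 4-pixel-wide column strip: 8 rows of 16-bit
; intermediates are stored with a 24-byte row stride, and 3 iterations cover
; the 12-sample width (the horizontal pass consumes 11 of them).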
.loop:
    movh           m2, [srcq]
    add            srcq, strideq
    movh           m3, [srcq]
    punpcklbw      m2, m0
    punpcklbw      m3, m0
    SHIFT2_LINE      0, 1, 2, 3, 4
    SHIFT2_LINE     24, 2, 3, 4, 1
    SHIFT2_LINE     48, 3, 4, 1, 2
    SHIFT2_LINE     72, 4, 1, 2, 3
    SHIFT2_LINE     96, 1, 2, 3, 4
    SHIFT2_LINE    120, 2, 3, 4, 1
    SHIFT2_LINE    144, 3, 4, 1, 2
    SHIFT2_LINE    168, 4, 1, 2, 3
    sub            srcq, stride_9minus4
    add            dstq, 8
    dec            i
    jnz .loop
    REP_RET
%undef rnd
%undef shift
%undef stride_neg2
%undef stride_9minus4
%undef i

; void ff_vc1_*_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,
;                                  const int16_t *src, int rnd);
; Data is already unpacked, so some operations can be performed directly on
; memory operands.
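; The rounder loaded below also folds out a bias carried by the 16-bit
; intermediates (hence the (-1+9+9-1) * 1024 adjustment), which is
; compensated after the shift by adding pw_128 (see "remove bias").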
%macro HOR_16B_SHIFT2 2 ; op, opname
cglobal vc1_%2_hor_16b_shift2, 4, 5, 0, dst, stride, src, rnd, h
    mov            hq, 8
    sub            srcq, 2
    sub            rndd, (-1+9+9-1) * 1024 ; add -1024 bias
    LOAD_ROUNDER_MMX rndd
    mova           m5, [pw_9]
    mova           m6, [pw_128]
    pxor           m0, m0
.loop:
    mova           m1, [srcq + 2 * 0]
    mova           m2, [srcq + 2 * 0 + mmsize]
    mova           m3, [srcq + 2 * 1]
    mova           m4, [srcq + 2 * 1 + mmsize]
    paddw          m3, [srcq + 2 * 2]
    paddw          m4, [srcq + 2 * 2 + mmsize]
    paddw          m1, [srcq + 2 * 3]
    paddw          m2, [srcq + 2 * 3 + mmsize]
    pmullw         m3, m5
    pmullw         m4, m5
    psubw          m3, m1
    psubw          m4, m2
    NORMALIZE_MMX  7
    ; remove bias
    paddw          m3, m6
    paddw          m4, m6
    TRANSFER_DO_PACK %1, dstq
    add            srcq, 24
    add            dstq, strideq
    dec            hq
    jnz .loop
    RET
%endmacro
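
; Instantiate both variants: OP_PUT stores the result as-is, while OP_AVG
; (mmxext, for pavgb) averages it with the existing destination pixels.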
INIT_MMX mmx
HOR_16B_SHIFT2 OP_PUT, put

INIT_MMX mmxext
HOR_16B_SHIFT2 OP_AVG, avg
%endif ; HAVE_MMX_INLINE
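
; The DC-only inverse transforms below splat the scaled DC across a register
; pair: m0 holds the positive part and m1 the negated part, both packed to
; bytes, so the DC can be applied with saturating byte adds and subs.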
%macro INV_TRANS_INIT 0
    movsxdifnidn linesizeq, linesized
    movd           m0, blockd
    SPLATW         m0, m0
    pxor           m1, m1
    psubw          m1, m0
    packuswb       m0, m0
    packuswb       m1, m1
    DEFINE_ARGS dest, linesize, linesize3
    lea    linesize3q, [linesizeq*3]
%endmacro
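
; INV_TRANS_PROCESS applies the splatted DC to four destination rows; the %1
; parameter selects the access width, movh (h) for the 4-wide blocks and
; mova (a) for the 8-wide ones.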
%macro INV_TRANS_PROCESS 1
    mov%1          m2, [destq+linesizeq*0]
    mov%1          m3, [destq+linesizeq*1]
    mov%1          m4, [destq+linesizeq*2]
    mov%1          m5, [destq+linesize3q]
    paddusb        m2, m0
    paddusb        m3, m0
    paddusb        m4, m0
    paddusb        m5, m0
    psubusb        m2, m1
    psubusb        m3, m1
    psubusb        m4, m1
    psubusb        m5, m1
    mov%1          [linesizeq*0+destq], m2
    mov%1          [linesizeq*1+destq], m3
    mov%1          [linesizeq*2+destq], m4
    mov%1          [linesize3q +destq], m5
%endmacro

; ff_vc1_inv_trans_?x?_dc_mmxext(uint8_t *dest, ptrdiff_t linesize, int16_t *block)
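; Each variant applies the row and then the column transform gain to the DC
; coefficient; e.g. for 4x4: dc = (17 * ((17 * dc + 4) >> 3) + 64) >> 7.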
INIT_MMX mmxext
cglobal vc1_inv_trans_4x4_dc, 3,4,0, dest, linesize, block
    movsx          r3d, WORD [blockq]
    mov         blockd, r3d              ; dc
    shl         blockd, 4                ; 16 * dc
    lea         blockd, [blockq+r3+4]    ; 17 * dc + 4
    sar         blockd, 3                ; >> 3
    mov            r3d, blockd           ; dc
    shl         blockd, 4                ; 16 * dc
    lea         blockd, [blockq+r3+64]   ; 17 * dc + 64
    sar         blockd, 7                ; >> 7
    INV_TRANS_INIT
    INV_TRANS_PROCESS h
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_4x8_dc, 3,4,0, dest, linesize, block
    movsx          r3d, WORD [blockq]
    mov         blockd, r3d              ; dc
    shl         blockd, 4                ; 16 * dc
    lea         blockd, [blockq+r3+4]    ; 17 * dc + 4
    sar         blockd, 3                ; >> 3
    shl         blockd, 2                ;  4 * dc
    lea         blockd, [blockq*3+64]    ; 12 * dc + 64
    sar         blockd, 7                ; >> 7
    INV_TRANS_INIT
    INV_TRANS_PROCESS h
    lea          destq, [destq+linesizeq*4]
    INV_TRANS_PROCESS h
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_8x4_dc, 3,4,0, dest, linesize, block
    movsx       blockd, WORD [blockq]    ; dc
    lea         blockd, [blockq*3+1]     ;  3 * dc + 1
    sar         blockd, 1                ; >> 1
    mov            r3d, blockd           ; dc
    shl         blockd, 4                ; 16 * dc
    lea         blockd, [blockq+r3+64]   ; 17 * dc + 64
    sar         blockd, 7                ; >> 7
    INV_TRANS_INIT
    INV_TRANS_PROCESS a
    RET

INIT_MMX mmxext
cglobal vc1_inv_trans_8x8_dc, 3,3,0, dest, linesize, block
    movsx       blockd, WORD [blockq]    ; dc
    lea         blockd, [blockq*3+1]     ;  3 * dc + 1
    sar         blockd, 1                ; >> 1
    lea         blockd, [blockq*3+16]    ;  3 * dc + 16
    sar         blockd, 5                ; >> 5
    INV_TRANS_INIT
    INV_TRANS_PROCESS a
    RET