You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

376 lines
8.3KB

  1. ;*****************************************************************************
  2. ;* SSE2-optimized weighted prediction code
  3. ;*****************************************************************************
  4. ;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "x86inc.asm"
  24. SECTION .text
  25. ;-----------------------------------------------------------------------------
  26. ; biweight pred:
  27. ;
  28. ; void h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
  29. ; int log2_denom, int weightd, int weights,
  30. ; int offset);
  31. ; and
  32. ; void h264_weight_16x16_sse2(uint8_t *dst, int stride,
  33. ; int log2_denom, int weight,
  34. ; int offset);
  35. ;-----------------------------------------------------------------------------
  ;-----------------------------------------------------------------------------
  ; WEIGHT_SETUP
  ; Moves the scalar parameters of the unidirectional weight functions into
  ; vector registers. Register roles follow the h264_weight prototype
  ; (dst, stride, log2_denom, weight, offset -> r0..r4).
  ;
  ; In:    r2 = log2_denom, r3 = weight, r4 = offset
  ; Out:   m3 = weight, broadcast to every 16-bit lane
  ;        m5 = low word of ((2*offset + 1) << log2_denom) >> 1, broadcast
  ;        m6 = log2_denom in the low dword (used as a shift count)
  ;        m7 = 0 (zero source for byte->word unpacking)
  ; Clobb: r4, flags
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_SETUP 0
      movd       m6, r2d                  ; shift count = log2_denom
      movd       m3, r3d                  ; weight
      add        r4, r4
      inc        r4                       ; r4 = 2*offset + 1
      movd       m5, r4d
      pslld      m5, m6
      psrld      m5, 1                    ; offset term plus rounding half
  %if mmsize == 16
      pshuflw    m3, m3, 0                ; low word -> low four words
      punpcklqdq m3, m3                   ; low qword -> both halves
      pshuflw    m5, m5, 0
      punpcklqdq m5, m5
  %else
      pshufw     m3, m3, 0                ; low word -> all four words
      pshufw     m5, m5, 0
  %endif
      pxor       m7, m7
  %endmacro
  ;-----------------------------------------------------------------------------
  ; WEIGHT_OP off_a, off_b
  ; Weights two half-register groups of pixels read from [r0+off_a] and
  ; [r0+off_b] and packs both results into m0.
  ;
  ; In:    r0 = pixel base; m3 = weight, m5 = rounding/offset term,
  ;        m6 = shift count, m7 = 0 (as left by WEIGHT_SETUP)
  ; Out:   m0 = group A in the low half, group B in the high half,
  ;        saturated to unsigned bytes
  ; Clobb: m1
  ; The two groups are kept interleaved instruction by instruction.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_OP 2
      movh       m0, [r0+%1]
      movh       m1, [r0+%2]
      punpcklbw  m0, m7                   ; bytes -> words
      punpcklbw  m1, m7
      pmullw     m0, m3                   ; pixel * weight
      pmullw     m1, m3
      paddsw     m0, m5                   ; + rounding/offset, signed saturation
      paddsw     m1, m5
      psraw      m0, m6                   ; >> log2_denom
      psraw      m1, m6
      packuswb   m0, m1                   ; clamp to 0..255 and merge
  %endmacro
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_DBL_MM height
  ; Emits h264_weight_16x<height>_mmx2. A 16-pixel row is handled as two
  ; 8-byte halves, each produced by one WEIGHT_OP and one store.
  ; Only the height-16 instantiation contains the row loop; any other height
  ; runs its own prologue and setup, loads its row count and jumps into it.
  ;
  ; Regs:  r0 = dst (advanced by r1 per row), r1 = stride,
  ;        r2 = rows remaining (reloaded after WEIGHT_SETUP has consumed
  ;        log2_denom)
  ; NOTE(review): the prototype declares stride as int, yet the full-width r1
  ; is added to the pointer; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_DBL_MM 1
  cglobal h264_weight_16x%1_mmx2, 5, 5, 0
      WEIGHT_SETUP
      mov        r2, %1
  %if %1 == 16
  .nextrow:
      WEIGHT_OP  0,  4
      mova       [r0  ], m0               ; pixels 0..7
      WEIGHT_OP  8, 12
      mova       [r0+8], m0               ; pixels 8..15
      add        r0, r1
      dec        r2
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_weight_16x16_mmx2.nextrow)
  %endif
  %endmacro

  INIT_MMX
  WEIGHT_FUNC_DBL_MM 16
  WEIGHT_FUNC_DBL_MM  8
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_MM width, height, xmm_regs, cpu_suffix
  ; Emits h264_weight_<width>x<height>_<cpu_suffix> for block widths equal to
  ; one full vector register: one WEIGHT_OP and one store per row.
  ; Only the height-16 instantiation contains the row loop; other heights run
  ; their own prologue and setup, load their row count and jump into it.
  ;
  ; The function takes the five arguments of the h264_weight prototype and
  ; touches only r0..r4, so cglobal requests 5 arguments / 5 registers, the
  ; same as the other h264_weight macros in this file.
  ;
  ; Regs:  r0 = dst (advanced by r1 per row), r1 = stride,
  ;        r2 = rows remaining (reloaded after WEIGHT_SETUP)
  ; NOTE(review): the prototype declares stride as int, yet the full-width r1
  ; is added to the pointer; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_MM 4
  cglobal h264_weight_%1x%2_%4, 5, 5, %3
      WEIGHT_SETUP
      mov        r2, %2
  %if %2 == 16
  .nextrow:
      WEIGHT_OP  0, mmsize/2
      mova       [r0], m0
      add        r0, r1
      dec        r2
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_weight_%1x16_%4.nextrow)
  %endif
  %endmacro

  INIT_MMX
  WEIGHT_FUNC_MM  8, 16, 0, mmx2
  WEIGHT_FUNC_MM  8,  8, 0, mmx2
  WEIGHT_FUNC_MM  8,  4, 0, mmx2
  INIT_XMM
  WEIGHT_FUNC_MM 16, 16, 8, sse2
  WEIGHT_FUNC_MM 16,  8, 8, sse2
  ;-----------------------------------------------------------------------------
  ; WEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, cpu_suffix
  ; Emits h264_weight_<width>x<height>_<cpu_suffix> for block widths equal to
  ; half a vector register. Two rows are weighted per iteration: WEIGHT_OP
  ; reads row n at [r0] and row n+1 at [r0+r1] and packs them into the low
  ; and high halves of m0.
  ; The row loop lives in the instantiation whose height equals mmsize
  ; (that height is passed as loop_height); the others jump into it.
  ;
  ; Regs:  r0 = dst (advanced by r3 per iteration), r1 = stride,
  ;        r2 = row pairs remaining, r3 = 2*stride
  ; NOTE(review): the prototype declares stride as int, yet the full-width r1
  ; is used for addressing; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro WEIGHT_FUNC_HALF_MM 5
  cglobal h264_weight_%1x%2_%5, 5, 5, %4
      WEIGHT_SETUP
      mov        r2, %2/2
      lea        r3, [r1*2]
  %if %2 == mmsize
  .nextrow:
      WEIGHT_OP  0, r1
      movh       [r0], m0                 ; first row = low half
  %if mmsize == 16
      movhps     [r0+r1], m0              ; second row = high half
  %else
      psrlq      m0, 32                   ; bring the high half down first
      movh       [r0+r1], m0
  %endif
      add        r0, r3
      dec        r2
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_weight_%1x%3_%5.nextrow)
  %endif
  %endmacro

  INIT_MMX
  WEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
  WEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
  WEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
  INIT_XMM
  WEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
  WEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
  WEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SETUP
  ; Moves the scalar parameters of the bidirectional weight functions into
  ; vector registers. Register roles follow the h264_biweight prototype
  ; (dst, src, stride, log2_denom, weightd, weights, offset -> r0..r6).
  ;
  ; In:    r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
  ; Out:   m3 = weightd, broadcast to every 16-bit lane
  ;        m4 = weights, broadcast to every 16-bit lane
  ;        m5 = low word of (((offset + 1) | 1) << (log2_denom + 1)) >> 1,
  ;             broadcast
  ;        m6 = log2_denom + 1 in the low dword (used as a shift count)
  ;        m7 = 0 (zero source for byte->word unpacking)
  ; Clobb: r3, r6, flags
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SETUP 0
      add        r3, 1
      movd       m6, r3d                  ; shift count = log2_denom + 1
      movd       m3, r4d                  ; weightd
      movd       m4, r5d                  ; weights
      add        r6, 1
      or         r6, 1                    ; r6 = (offset + 1) | 1
      movd       m5, r6d
      pslld      m5, m6
      psrld      m5, 1
  %if mmsize == 16
      pshuflw    m3, m3, 0                ; low word -> low four words
      punpcklqdq m3, m3                   ; low qword -> both halves
      pshuflw    m4, m4, 0
      punpcklqdq m4, m4
      pshuflw    m5, m5, 0
      punpcklqdq m5, m5
  %else
      pshufw     m3, m3, 0                ; low word -> all four words
      pshufw     m4, m4, 0
      pshufw     m5, m5, 0
  %endif
      pxor       m7, m7
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_STEPA acc, tmp, off
  ; First half of the bidirectional weighting for one half-register group of
  ; pixels at byte offset off.
  ;
  ; In:    r0, r1 = bases of the two pixel sources; m3, m4 = their weights;
  ;        m7 = 0
  ; Out:   m<acc> = [r0+off] * m3 + [r1+off] * m4 per 16-bit lane, summed with
  ;        signed saturation
  ; Clobb: m<tmp>
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_STEPA 3
      movh       m%1, [r0+%3]
      movh       m%2, [r1+%3]
      punpcklbw  m%1, m7                  ; bytes -> words
      punpcklbw  m%2, m7
      pmullw     m%1, m3
      pmullw     m%2, m4
      paddsw     m%1, m%2
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_STEPB
  ; Second half of the bidirectional weighting: adds the rounding/offset term,
  ; applies the arithmetic right shift and packs two groups into one register.
  ;
  ; In:    m0, m1 = weighted sums from BIWEIGHT_STEPA; m5 = rounding/offset
  ;        term; m6 = shift count
  ; Out:   m0 = m0 results in the low half, m1 results in the high half,
  ;        saturated to unsigned bytes
  ; Clobb: m1
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_STEPB 0
      paddsw     m0, m5
      paddsw     m1, m5
      psraw      m0, m6
      psraw      m1, m6
      packuswb   m0, m1                   ; clamp to 0..255 and merge
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_DBL_MM height
  ; Emits h264_biweight_16x<height>_mmx2. A 16-pixel row is handled as two
  ; 8-byte halves; each half needs two BIWEIGHT_STEPA groups of four pixels
  ; followed by one BIWEIGHT_STEPB and one store.
  ; Only the height-16 instantiation contains the row loop; any other height
  ; runs its own prologue and setup, loads its row count and jumps into it.
  ;
  ; Regs:  r0 = dst, r1 = src (both advanced by r2 per row), r2 = stride,
  ;        r3 = rows remaining (reloaded after BIWEIGHT_SETUP)
  ; NOTE(review): the prototype declares stride as int, yet the full-width r2
  ; is added to the pointers; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_DBL_MM 1
  cglobal h264_biweight_16x%1_mmx2, 7, 7, 0
      BIWEIGHT_SETUP
      mov        r3, %1
  %if %1 == 16
  .nextrow:
      BIWEIGHT_STEPA 0, 1, 0
      BIWEIGHT_STEPA 1, 2, 4
      BIWEIGHT_STEPB
      mova       [r0], m0                 ; pixels 0..7
      BIWEIGHT_STEPA 0, 1, 8
      BIWEIGHT_STEPA 1, 2, 12
      BIWEIGHT_STEPB
      mova       [r0+8], m0               ; pixels 8..15
      add        r0, r2
      add        r1, r2
      dec        r3
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_biweight_16x16_mmx2.nextrow)
  %endif
  %endmacro

  INIT_MMX
  BIWEIGHT_FUNC_DBL_MM 16
  BIWEIGHT_FUNC_DBL_MM  8
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_MM width, height, xmm_regs, cpu_suffix
  ; Emits h264_biweight_<width>x<height>_<cpu_suffix> for block widths equal
  ; to one full vector register: two BIWEIGHT_STEPA half-register groups, one
  ; BIWEIGHT_STEPB and one store per row.
  ; Only the height-16 instantiation contains the row loop; other heights run
  ; their own prologue and setup, load their row count and jump into it.
  ;
  ; Regs:  r0 = dst, r1 = src (both advanced by r2 per row), r2 = stride,
  ;        r3 = rows remaining (reloaded after BIWEIGHT_SETUP)
  ; NOTE(review): the prototype declares stride as int, yet the full-width r2
  ; is added to the pointers; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_MM 4
  cglobal h264_biweight_%1x%2_%4, 7, 7, %3
      BIWEIGHT_SETUP
      mov        r3, %2
  %if %2 == 16
  .nextrow:
      BIWEIGHT_STEPA 0, 1, 0
      BIWEIGHT_STEPA 1, 2, mmsize/2
      BIWEIGHT_STEPB
      mova       [r0], m0
      add        r0, r2
      add        r1, r2
      dec        r3
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_biweight_%1x16_%4.nextrow)
  %endif
  %endmacro

  INIT_MMX
  BIWEIGHT_FUNC_MM  8, 16, 0, mmx2
  BIWEIGHT_FUNC_MM  8,  8, 0, mmx2
  BIWEIGHT_FUNC_MM  8,  4, 0, mmx2
  INIT_XMM
  BIWEIGHT_FUNC_MM 16, 16, 8, sse2
  BIWEIGHT_FUNC_MM 16,  8, 8, sse2
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_FUNC_HALF_MM width, height, loop_height, xmm_regs, cpu_suffix
  ; Emits h264_biweight_<width>x<height>_<cpu_suffix> for block widths equal
  ; to half a vector register. Two rows are processed per iteration: row n at
  ; offset 0 and row n+1 at offset r2, packed into the low and high halves of
  ; m0 by BIWEIGHT_STEPB.
  ; The row loop lives in the instantiation whose height equals mmsize
  ; (that height is passed as loop_height); the others jump into it.
  ;
  ; Regs:  r0 = dst, r1 = src (both advanced by r4 per iteration),
  ;        r2 = stride, r3 = row pairs remaining, r4 = 2*stride
  ; NOTE(review): the prototype declares stride as int, yet the full-width r2
  ; is used for addressing; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_FUNC_HALF_MM 5
  cglobal h264_biweight_%1x%2_%5, 7, 7, %4
      BIWEIGHT_SETUP
      mov        r3, %2/2
      lea        r4, [r2*2]
  %if %2 == mmsize
  .nextrow:
      BIWEIGHT_STEPA 0, 1, 0
      BIWEIGHT_STEPA 1, 2, r2
      BIWEIGHT_STEPB
      movh       [r0], m0                 ; first row = low half
  %if mmsize == 16
      movhps     [r0+r2], m0              ; second row = high half
  %else
      psrlq      m0, 32                   ; bring the high half down first
      movh       [r0+r2], m0
  %endif
      add        r0, r4
      add        r1, r4
      dec        r3
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_biweight_%1x%3_%5.nextrow)
  %endif
  %endmacro

  INIT_MMX
  BIWEIGHT_FUNC_HALF_MM 4,  8,  8, 0, mmx2
  BIWEIGHT_FUNC_HALF_MM 4,  4,  8, 0, mmx2
  BIWEIGHT_FUNC_HALF_MM 4,  2,  8, 0, mmx2
  INIT_XMM
  BIWEIGHT_FUNC_HALF_MM 8, 16, 16, 8, sse2
  BIWEIGHT_FUNC_HALF_MM 8,  8, 16, 8, sse2
  BIWEIGHT_FUNC_HALF_MM 8,  4, 16, 8, sse2
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SSSE3_SETUP
  ; Parameter setup for the SSSE3 bidirectional weight functions. The two
  ; weights are packed as adjacent bytes so that one pmaddubsw can apply both
  ; to interleaved pixel pairs.
  ;
  ; In:    r3 = log2_denom, r4 = weightd, r5 = weights, r6 = offset
  ; Out:   m4 = byte pair (weightd low, weights high), broadcast to every word
  ;        m5 = low word of (((offset + 1) | 1) << (log2_denom + 1)) >> 1,
  ;             broadcast
  ;        m6 = log2_denom + 1 in the low dword (used as a shift count)
  ; Clobb: m0, r3, r6, flags
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SSSE3_SETUP 0
      ; weights: interleave the low bytes, then replicate the 16-bit pair
      movd       m4, r4d
      movd       m0, r5d
      punpcklbw  m4, m0
      pshuflw    m4, m4, 0
      punpcklqdq m4, m4

      ; shift count
      add        r3, 1
      movd       m6, r3d

      ; rounding/offset term
      add        r6, 1
      or         r6, 1
      movd       m5, r6d
      pslld      m5, m6
      psrld      m5, 1
      pshuflw    m5, m5, 0
      punpcklqdq m5, m5
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SSSE3_OP
  ; Weights two registers of byte-interleaved pixel pairs and packs the
  ; results into m0.
  ;
  ; In:    m0, m2 = interleaved pixel bytes from the two sources;
  ;        m4 = packed weight byte pair, m5 = rounding/offset term,
  ;        m6 = shift count (as left by BIWEIGHT_SSSE3_SETUP)
  ; Out:   m0 = m0 results in the low half, m2 results in the high half,
  ;        saturated to unsigned bytes
  ; Clobb: m2
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SSSE3_OP 0
      pmaddubsw  m0, m4                   ; multiply each byte pair and sum it
      pmaddubsw  m2, m4
      paddsw     m0, m5
      paddsw     m2, m5
      psraw      m0, m6
      psraw      m2, m6
      packuswb   m0, m2                   ; clamp to 0..255 and merge
  %endmacro
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SSSE3_16 height
  ; Emits h264_biweight_16x<height>_ssse3. Each row is split into two groups
  ; of eight pixels; every group is byte-interleaved with the matching pixels
  ; of the second source and handed to BIWEIGHT_SSSE3_OP.
  ; Only the height-16 instantiation contains the row loop; any other height
  ; runs its own prologue and setup, loads its row count and jumps into it.
  ;
  ; Regs:  r0 = dst, r1 = src (both advanced by r2 per row), r2 = stride,
  ;        r3 = rows remaining (reloaded after BIWEIGHT_SSSE3_SETUP)
  ; NOTE(review): punpcklbw takes [r1] as a direct memory operand, which for
  ; legacy SSE encodings requires a 16-byte aligned address; confirm that
  ; callers guarantee this for src.
  ; NOTE(review): the prototype declares stride as int, yet the full-width r2
  ; is added to the pointers; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SSSE3_16 1
  cglobal h264_biweight_16x%1_ssse3, 7, 7, 8
      BIWEIGHT_SSSE3_SETUP
      mov        r3, %1
  %if %1 == 16
  .nextrow:
      movh       m0, [r0]
      movh       m2, [r0+8]
      movh       m3, [r1+8]
      punpcklbw  m0, [r1]                 ; interleave pixels 0..7
      punpcklbw  m2, m3                   ; interleave pixels 8..15
      BIWEIGHT_SSSE3_OP
      mova       [r0], m0
      add        r0, r2
      add        r1, r2
      dec        r3
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_biweight_16x16_ssse3.nextrow)
  %endif
  %endmacro

  INIT_XMM
  BIWEIGHT_SSSE3_16 16
  BIWEIGHT_SSSE3_16  8
  ;-----------------------------------------------------------------------------
  ; BIWEIGHT_SSSE3_8 height
  ; Emits h264_biweight_8x<height>_ssse3. Two 8-pixel rows are processed per
  ; iteration: each row is byte-interleaved with the matching row of the
  ; second source, weighted by BIWEIGHT_SSSE3_OP, and the packed result is
  ; stored as low half (row n) and high half (row n+1).
  ; Only the height-16 instantiation contains the row loop; other heights run
  ; their own prologue and setup, load their row-pair count and jump into it.
  ;
  ; Regs:  r0 = dst, r1 = src (both advanced by r4 per iteration),
  ;        r2 = stride, r3 = row pairs remaining, r4 = 2*stride
  ; NOTE(review): the prototype declares stride as int, yet the full-width r2
  ; is used for addressing; confirm callers sign-extend it on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro BIWEIGHT_SSSE3_8 1
  cglobal h264_biweight_8x%1_ssse3, 7, 7, 8
      BIWEIGHT_SSSE3_SETUP
      mov        r3, %1/2
      lea        r4, [r2*2]
  %if %1 == 16
  .nextrow:
      movh       m0, [r0]
      movh       m1, [r1]
      movh       m2, [r0+r2]
      movh       m3, [r1+r2]
      punpcklbw  m0, m1                   ; interleave row n
      punpcklbw  m2, m3                   ; interleave row n+1
      BIWEIGHT_SSSE3_OP
      movh       [r0], m0
      movhps     [r0+r2], m0
      add        r0, r4
      add        r1, r4
      dec        r3
      jnz        .nextrow
      REP_RET
  %else
      jmp        mangle(ff_h264_biweight_8x16_ssse3.nextrow)
  %endif
  %endmacro

  INIT_XMM
  BIWEIGHT_SSSE3_8 16
  BIWEIGHT_SSSE3_8  8
  BIWEIGHT_SSSE3_8  4
  347. BIWEIGHT_SSSE3_8 4