/*
 * Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_mmx.h"

DECLARE_ALIGNED(8, static const uint64_t, ff_pb_3_1) = 0x0103010301030103ULL;
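/* (ff_pb_3_1 is the alternating {3, 1} byte pattern; when filtering field
 * macroblocks it is loaded in place of ff_pb_3 as the packed mv-difference
 * threshold in h264_loop_filter_strength_mmx2() below.) */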

/***********************************/
/* IDCT */

#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT (uint8_t *dst, int16_t *block, int stride);
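/* For example, IDCT_ADD_FUNC(, 10, sse2) declares
 *     void ff_h264_idct_add_10_sse2(uint8_t *dst, int16_t *block, int stride);
 * and IDCT_ADD_FUNC(8_dc, 10, avx) declares ff_h264_idct8_dc_add_10_avx(). */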

IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 10, mmx2)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
#endif

#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                              (uint8_t *dst, const int *block_offset, \
                               DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
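/* e.g. IDCT_ADD_REP_FUNC(8, 4, 10, sse2) declares
 *     void ff_h264_idct8_add4_10_sse2(uint8_t *dst, const int *block_offset,
 *                                     DCTELEM *block, int stride,
 *                                     const uint8_t nnzc[6*8]); */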

IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
#endif

#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
                              (uint8_t **dst, const int *block_offset, \
                               DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
#if HAVE_AVX
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
#endif

void ff_h264_idct_add_mmx     (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add_mmx    (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_add_sse2   (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct_dc_add_mmx2 (uint8_t *dst, int16_t *block, int stride);
void ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride);

void ff_h264_idct_add16_mmx      (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_mmx      (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16_mmx2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_mmx (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_mmx2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct8_add4_sse2     (uint8_t *dst, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_mmx       (uint8_t **dest, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_mmx2      (uint8_t **dest, const int *block_offset,
                                  DCTELEM *block, int stride, const uint8_t nnzc[6*8]);

void ff_h264_idct_add16_sse2     (uint8_t *dst, const int *block_offset, DCTELEM *block,
                                  int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block,
                                  int stride, const uint8_t nnzc[6*8]);
void ff_h264_idct_add8_sse2      (uint8_t **dest, const int *block_offset, DCTELEM *block,
                                  int stride, const uint8_t nnzc[6*8]);

void ff_h264_luma_dc_dequant_idct_mmx (DCTELEM *output, DCTELEM *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(DCTELEM *output, DCTELEM *input, int qmul);

/***********************************/
/* deblocking */

#define h264_loop_filter_strength_iteration_mmx2(bS, nz, ref, mv, bidir, edges, step, mask_mv, dir, d_idx, mask_dir) \
    do { \
        x86_reg b_idx; \
        mask_mv <<= 3; \
        for( b_idx=0; b_idx<edges; b_idx+=step ) { \
            if (!mask_dir) \
                __asm__ volatile( \
                        "pxor %%mm0, %%mm0 \n\t" \
                        :: \
                ); \
            if(!(mask_mv & b_idx)) { \
                if(bidir) { \
                    __asm__ volatile( \
                        "movd      %a3(%0,%2), %%mm2 \n" \
                        "punpckldq %a4(%0,%2), %%mm2 \n" /* { ref0[bn], ref1[bn] } */ \
                        "pshufw $0x44, 12(%0,%2), %%mm0 \n" /* { ref0[b], ref0[b] } */ \
                        "pshufw $0x44, 52(%0,%2), %%mm1 \n" /* { ref1[b], ref1[b] } */ \
                        "pshufw $0x4E, %%mm2, %%mm3 \n" \
                        "psubb  %%mm2, %%mm0 \n" /* { ref0[b]!=ref0[bn], ref0[b]!=ref1[bn] } */ \
                        "psubb  %%mm3, %%mm1 \n" /* { ref1[b]!=ref1[bn], ref1[b]!=ref0[bn] } */ \
                        \
                        "por       %%mm1, %%mm0 \n" \
                        "movq %a5(%1,%2,4), %%mm1 \n" \
                        "movq %a6(%1,%2,4), %%mm2 \n" \
                        "movq      %%mm1, %%mm3 \n" \
                        "movq      %%mm2, %%mm4 \n" \
                        "psubw  48(%1,%2,4), %%mm1 \n" \
                        "psubw  56(%1,%2,4), %%mm2 \n" \
                        "psubw 208(%1,%2,4), %%mm3 \n" \
                        "psubw 216(%1,%2,4), %%mm4 \n" \
                        "packsswb  %%mm2, %%mm1 \n" \
                        "packsswb  %%mm4, %%mm3 \n" \
                        "paddb     %%mm6, %%mm1 \n" \
                        "paddb     %%mm6, %%mm3 \n" \
                        "psubusb   %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "psubusb   %%mm5, %%mm3 \n" \
                        "packsswb  %%mm3, %%mm1 \n" \
                        \
                        "por       %%mm1, %%mm0 \n" \
                        "movq %a7(%1,%2,4), %%mm1 \n" \
                        "movq %a8(%1,%2,4), %%mm2 \n" \
                        "movq      %%mm1, %%mm3 \n" \
                        "movq      %%mm2, %%mm4 \n" \
                        "psubw  48(%1,%2,4), %%mm1 \n" \
                        "psubw  56(%1,%2,4), %%mm2 \n" \
                        "psubw 208(%1,%2,4), %%mm3 \n" \
                        "psubw 216(%1,%2,4), %%mm4 \n" \
                        "packsswb  %%mm2, %%mm1 \n" \
                        "packsswb  %%mm4, %%mm3 \n" \
                        "paddb     %%mm6, %%mm1 \n" \
                        "paddb     %%mm6, %%mm3 \n" \
                        "psubusb   %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "psubusb   %%mm5, %%mm3 \n" \
                        "packsswb  %%mm3, %%mm1 \n" \
                        \
                        "pshufw $0x4E, %%mm1, %%mm1 \n" \
                        "por       %%mm1, %%mm0 \n" \
                        "pshufw $0x4E, %%mm0, %%mm1 \n" \
                        "pminub    %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx+52), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56), \
                          "i"(d_idx*4+208), \
                          "i"(d_idx*4+216) \
                    ); \
                } else { \
                    __asm__ volatile( \
                        "movd   12(%0,%2), %%mm0 \n" \
                        "psubb %a3(%0,%2), %%mm0 \n" /* ref[b] != ref[bn] */ \
                        "movq   48(%1,%2,4), %%mm1 \n" \
                        "movq   56(%1,%2,4), %%mm2 \n" \
                        "psubw %a4(%1,%2,4), %%mm1 \n" \
                        "psubw %a5(%1,%2,4), %%mm2 \n" \
                        "packsswb  %%mm2, %%mm1 \n" \
                        "paddb     %%mm6, %%mm1 \n" \
                        "psubusb   %%mm5, %%mm1 \n" /* abs(mv[b] - mv[bn]) >= limit */ \
                        "packsswb  %%mm1, %%mm1 \n" \
                        "por       %%mm1, %%mm0 \n" \
                        ::"r"(ref), \
                          "r"(mv), \
                          "r"(b_idx), \
                          "i"(d_idx+12), \
                          "i"(d_idx*4+48), \
                          "i"(d_idx*4+56) \
                    ); \
                } \
            } \
            __asm__ volatile( \
                "movd  12(%0,%1), %%mm1 \n" \
                "por  %a2(%0,%1), %%mm1 \n" /* nnz[b] || nnz[bn] */ \
                ::"r"(nnz), \
                  "r"(b_idx), \
                  "i"(d_idx+12) \
            ); \
            __asm__ volatile( \
                "pminub    %%mm7, %%mm1 \n" \
                "pminub    %%mm7, %%mm0 \n" \
                "psllw        $1, %%mm1 \n" \
                "pxor      %%mm2, %%mm2 \n" \
                "pmaxub    %%mm0, %%mm1 \n" \
                "punpcklbw %%mm2, %%mm1 \n" \
                "movq      %%mm1, %a1(%0,%2) \n" \
                ::"r"(bS), \
                  "i"(32*dir), \
                  "r"(b_idx) \
                :"memory" \
            ); \
        } \
    } while (0)
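
/* One iteration of the boundary-strength computation for one filtering
 * direction: edges with non-zero coefficients (nnz) get strength 2, edges
 * whose reference indices differ or whose motion-vector delta reaches the
 * limit get strength 1, and the result is stored into bS[dir][];
 * mm5..mm7 hold the thresholds set up by the caller below. */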

static void h264_loop_filter_strength_mmx2( int16_t bS[2][4][4], uint8_t nnz[40], int8_t ref[2][40], int16_t mv[2][40][2],
                                            int bidir, int edges, int step, int mask_mv0, int mask_mv1, int field ) {
    __asm__ volatile(
        "movq %0, %%mm7 \n"
        "movq %1, %%mm6 \n"
        ::"m"(ff_pb_1), "m"(ff_pb_3)
    );
    if(field)
        __asm__ volatile(
            "movq %0, %%mm6 \n"
            ::"m"(ff_pb_3_1)
        );
    __asm__ volatile(
        "movq  %%mm6, %%mm5 \n"
        "paddb %%mm5, %%mm5 \n"
    :);

    // could do a special case for dir==0 && edges==1, but it only reduces the
    // average filter time by 1.2%
    step  <<= 3;
    edges <<= 3;
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, edges, step, mask_mv1, 1, -8,  0);
    h264_loop_filter_strength_iteration_mmx2(bS, nnz, ref, mv, bidir, 32,    8, mask_mv0, 0, -1, -1);

    __asm__ volatile(
        "movq   (%0), %%mm0 \n\t"
        "movq  8(%0), %%mm1 \n\t"
        "movq 16(%0), %%mm2 \n\t"
        "movq 24(%0), %%mm3 \n\t"
        TRANSPOSE4(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4)
        "movq %%mm0,   (%0) \n\t"
        "movq %%mm3,  8(%0) \n\t"
        "movq %%mm4, 16(%0) \n\t"
        "movq %%mm2, 24(%0) \n\t"
        ::"r"(bS[0])
        :"memory"
    );
}

#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta, int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *pix, int stride, \
                                                                int alpha, int beta);
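/* e.g. LF_FUNC(v, luma, 8, sse2) declares
 *     void ff_deblock_v_luma_8_sse2(uint8_t *pix, int stride,
 *                                   int alpha, int beta, int8_t *tc0);
 * LF_IFUNC() declares the intra variants, which take no tc0 argument. */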

#define LF_FUNCS(type, depth)\
LF_FUNC (h,  chroma,       depth, mmxext)\
LF_IFUNC(h,  chroma_intra, depth, mmxext)\
LF_FUNC (v,  chroma,       depth, mmxext)\
LF_IFUNC(v,  chroma_intra, depth, mmxext)\
LF_FUNC (h,  luma,         depth, mmxext)\
LF_IFUNC(h,  luma_intra,   depth, mmxext)\
LF_FUNC (h,  luma,         depth, sse2)\
LF_IFUNC(h,  luma_intra,   depth, sse2)\
LF_FUNC (v,  luma,         depth, sse2)\
LF_IFUNC(v,  luma_intra,   depth, sse2)\
LF_FUNC (h,  chroma,       depth, sse2)\
LF_IFUNC(h,  chroma_intra, depth, sse2)\
LF_FUNC (v,  chroma,       depth, sse2)\
LF_IFUNC(v,  chroma_intra, depth, sse2)\
LF_FUNC (h,  luma,         depth, avx)\
LF_IFUNC(h,  luma_intra,   depth, avx)\
LF_FUNC (v,  luma,         depth, avx)\
LF_IFUNC(v,  luma_intra,   depth, avx)\
LF_FUNC (h,  chroma,       depth, avx)\
LF_IFUNC(h,  chroma_intra, depth, avx)\
LF_FUNC (v,  chroma,       depth, avx)\
LF_IFUNC(v,  chroma_intra, depth, avx)

LF_FUNCS( uint8_t,  8)
LF_FUNCS(uint16_t, 10)

#if ARCH_X86_32
LF_FUNC (v8, luma, 8, mmxext)
static void ff_deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    if((tc0[0] & tc0[1]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix+0, stride, alpha, beta, tc0);
    if((tc0[2] & tc0[3]) >= 0)
        ff_deblock_v8_luma_8_mmxext(pix+8, stride, alpha, beta, tc0+2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void ff_deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, int alpha, int beta)
{
    ff_deblock_v8_luma_intra_8_mmxext(pix+0, stride, alpha, beta);
    ff_deblock_v8_luma_intra_8_mmxext(pix+8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 */

LF_FUNC (v,  luma,       10, mmxext)
LF_IFUNC(v,  luma_intra, 10, mmxext)

/***********************************/
/* weighted prediction */

#define H264_WEIGHT(W, H, OPT) \
void ff_h264_weight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    int stride, int log2_denom, int weight, int offset);

#define H264_BIWEIGHT(W, H, OPT) \
void ff_h264_biweight_ ## W ## x ## H ## _ ## OPT(uint8_t *dst, \
    uint8_t *src, int stride, int log2_denom, int weightd, \
    int weights, int offset);
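/* e.g. H264_BIWEIGHT(16, 16, sse2) declares
 *     void ff_h264_biweight_16x16_sse2(uint8_t *dst, uint8_t *src, int stride,
 *                                      int log2_denom, int weightd,
 *                                      int weights, int offset); */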

#define H264_BIWEIGHT_MMX(W,H) \
H264_WEIGHT  (W, H, mmx2) \
H264_BIWEIGHT(W, H, mmx2)

#define H264_BIWEIGHT_MMX_SSE(W,H) \
H264_BIWEIGHT_MMX(W, H) \
H264_WEIGHT      (W, H, sse2) \
H264_BIWEIGHT    (W, H, sse2) \
H264_BIWEIGHT    (W, H, ssse3)

H264_BIWEIGHT_MMX_SSE(16, 16)
H264_BIWEIGHT_MMX_SSE(16,  8)
H264_BIWEIGHT_MMX_SSE( 8, 16)
H264_BIWEIGHT_MMX_SSE( 8,  8)
H264_BIWEIGHT_MMX_SSE( 8,  4)
H264_BIWEIGHT_MMX    ( 4,  8)
H264_BIWEIGHT_MMX    ( 4,  4)
H264_BIWEIGHT_MMX    ( 4,  2)

void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth)
{
    int mm_flags = av_get_cpu_flags();

    if (bit_depth == 8) {
        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->h264_loop_filter_strength= h264_loop_filter_strength_mmx2;
        }
#if HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_MMX) {
            c->h264_idct_dc_add         =
            c->h264_idct_add            = ff_h264_idct_add_mmx;
            c->h264_idct8_dc_add        =
            c->h264_idct8_add           = ff_h264_idct8_add_mmx;

            c->h264_idct_add16          = ff_h264_idct_add16_mmx;
            c->h264_idct8_add4          = ff_h264_idct8_add4_mmx;
            c->h264_idct_add8           = ff_h264_idct_add8_mmx;
            c->h264_idct_add16intra     = ff_h264_idct_add16intra_mmx;
            c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_mmx;

            if (mm_flags & AV_CPU_FLAG_MMX2) {
                c->h264_idct_dc_add    = ff_h264_idct_dc_add_mmx2;
                c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_mmx2;
                c->h264_idct_add16     = ff_h264_idct_add16_mmx2;
                c->h264_idct8_add4     = ff_h264_idct8_add4_mmx2;
                c->h264_idct_add8      = ff_h264_idct_add8_mmx2;
                c->h264_idct_add16intra= ff_h264_idct_add16intra_mmx2;

                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_8_mmxext;
                c->h264_h_loop_filter_chroma= ff_deblock_h_chroma_8_mmxext;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_8_mmxext;
                c->h264_h_loop_filter_chroma_intra= ff_deblock_h_chroma_intra_8_mmxext;
#if ARCH_X86_32
                c->h264_v_loop_filter_luma= ff_deblock_v_luma_8_mmxext;
                c->h264_h_loop_filter_luma= ff_deblock_h_luma_8_mmxext;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_mmxext;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif
                c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
                c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
                c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
                c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
                c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
                c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
                c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
                c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;

                c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
                c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
                c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
                c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
                c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
                c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
                c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
                c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;

                if (mm_flags&AV_CPU_FLAG_SSE2) {
                    c->h264_idct8_add           = ff_h264_idct8_add_sse2;
                    c->h264_idct8_add4          = ff_h264_idct8_add4_sse2;
                    c->h264_luma_dc_dequant_idct= ff_h264_luma_dc_dequant_idct_sse2;

                    c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_sse2;
                    c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_sse2;
                    c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_sse2;
                    c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_sse2;
                    c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_sse2;

                    c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_sse2;
                    c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_sse2;
                    c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_sse2;
                    c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_sse2;
                    c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_sse2;

#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
#endif
                    c->h264_idct_add16      = ff_h264_idct_add16_sse2;
                    c->h264_idct_add8       = ff_h264_idct_add8_sse2;
                    c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
                }
                if (mm_flags&AV_CPU_FLAG_SSSE3) {
                    c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_ssse3;
                    c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_ssse3;
                    c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_ssse3;
                    c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_ssse3;
                    c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_ssse3;
                }
                if (mm_flags&AV_CPU_FLAG_AVX) {
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
#endif
                }
            }
        }
#endif
    } else if (bit_depth == 10) {
#if HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_MMX) {
            if (mm_flags & AV_CPU_FLAG_MMX2) {
#if ARCH_X86_32
                c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_mmxext;
                c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_mmxext;
                c->h264_v_loop_filter_luma= ff_deblock_v_luma_10_mmxext;
                c->h264_h_loop_filter_luma= ff_deblock_h_luma_10_mmxext;
                c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
                c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif
                c->h264_idct_dc_add= ff_h264_idct_dc_add_10_mmx2;
                if (mm_flags&AV_CPU_FLAG_SSE2) {
                    c->h264_idct_add       = ff_h264_idct_add_10_sse2;
                    c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_sse2;
                    c->h264_idct8_add      = ff_h264_idct8_add_10_sse2;

                    c->h264_idct_add16     = ff_h264_idct_add16_10_sse2;
                    c->h264_idct8_add4     = ff_h264_idct8_add4_10_sse2;
                    c->h264_idct_add8      = ff_h264_idct_add8_10_sse2;
                    c->h264_idct_add16intra= ff_h264_idct_add16intra_10_sse2;

                    c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_sse2;
                    c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif
                }
#if HAVE_AVX
                if (mm_flags&AV_CPU_FLAG_AVX) {
                    c->h264_idct_dc_add    =
                    c->h264_idct_add       = ff_h264_idct_add_10_avx;
                    c->h264_idct8_add      = ff_h264_idct8_add_10_avx;
                    c->h264_idct8_dc_add   = ff_h264_idct8_dc_add_10_avx;

                    c->h264_idct_add16     = ff_h264_idct_add16_10_avx;
                    c->h264_idct8_add4     = ff_h264_idct8_add4_10_avx;
                    c->h264_idct_add8      = ff_h264_idct_add8_10_avx;
                    c->h264_idct_add16intra= ff_h264_idct_add16intra_10_avx;

                    c->h264_v_loop_filter_chroma= ff_deblock_v_chroma_10_avx;
                    c->h264_v_loop_filter_chroma_intra= ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
                    c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
                    c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
                    c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
                    c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif
                }
#endif /* HAVE_AVX */
            }
        }
#endif
    }
}
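
/* (For reference, a minimal sketch of the expected caller: the generic
 * ff_h264dsp_init() in libavcodec/h264dsp.c dispatches to this initializer
 * roughly as
 *     if (ARCH_X86) ff_h264dsp_init_x86(c, bit_depth);
 * the exact guard used by the caller may differ between versions.) */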