  1. /*
  2. * Copyright (c) 2000, 2001 Fabrice Bellard
  3. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "config.h"
  22. #include "libavutil/attributes.h"
  23. #include "libavutil/cpu.h"
  24. #include "libavutil/internal.h"
  25. #include "libavutil/x86/cpu.h"
  26. #include "libavcodec/avcodec.h"
  27. #include "libavcodec/dsputil.h"
  28. #include "libavcodec/pixels.h"
  29. #include "libavcodec/simple_idct.h"
  30. #include "libavcodec/version.h"
  31. #include "dsputil_x86.h"
  32. #include "fpel.h"
  33. #include "idct_xvid.h"
/*
 * Prototypes for external x86 assembly routines (implemented in .asm files).
 * Naming convention visible throughout this file: put_* stores the result,
 * avg_* averages it with the existing dst, *_no_rnd_* is the no-rounding
 * variant.  The *_l2 routines take two sources (src1 with its own stride,
 * src2) and combine them into dst.
 */
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
                                     uint8_t *src2, int dstStride,
                                     int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                              int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                               int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                      int dstStride, int src1Stride, int h);

/*
 * MPEG-4 quarter-pel lowpass filters (horizontal and vertical passes),
 * used as building blocks by the QPEL_OP macro below.  The horizontal
 * variants take an explicit row count h; the vertical ones do not.
 */
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride,
                                                 int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride,
                                                int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                          int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                 int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                         int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                                int dstStride, int srcStride);

/* Full-pel "no rounding" copies are identical to the rounding MMX copies. */
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_no_rnd_pixels8_mmxext  ff_put_pixels8_mmx

/* Integer dot products over int16 vectors of length `order`. */
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
                                      int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
                                    int order);
/* Fused scalar product + multiply-add variants; note v1 is non-const
 * (modified in place by the asm — TODO confirm against the .asm source). */
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
                                               const int16_t *v3,
                                               int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
                                             const int16_t *v3,
                                             int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
                                              const int16_t *v3,
                                              int order, int mul);

/* Byte-swap a buffer of w 32-bit words from src into dst. */
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);

/* Per the names: clamp len int32 values from src into [min, max], writing
 * to dst (NOTE(review): semantics to be confirmed in the .asm source). */
void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
                              int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
                                   int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
                               int32_t min, int32_t max, unsigned int len);
#if HAVE_YASM

/* The mmxext full-pel copy is just the plain MMX copy. */
#define ff_put_pixels16_mmxext ff_put_pixels16_mmx
#define ff_put_pixels8_mmxext  ff_put_pixels8_mmx

/*
 * QPEL_OP(OPNAME, RND, MMX) instantiates the 16 quarter-pel motion
 * compensation functions mc00..mc33 for 8x8 and 16x16 blocks.
 *
 *  - OPNAME picks the store operation (put_, avg_, put_no_rnd_);
 *  - RND picks the rounding variant of the intermediate lowpass/blend
 *    helpers (_ or _no_rnd_);
 *  - MMX is the cpu suffix (here always mmxext).
 *
 * In mcXY, X is the horizontal and Y the vertical quarter-pel phase:
 * mcX0 cases use only the horizontal lowpass, mc0Y only the vertical
 * one, and the mixed cases build intermediate half-pel planes (halfH:
 * horizontally filtered, halfHV: additionally vertically filtered) in
 * on-stack uint64_t scratch buffers, then blend with the *_l2 helpers.
 * Phase-3 cases (mc3Y / mcX3) differ from phase-1 only by blending
 * against src + 1 / src + stride / halfH + 8 (or + 16) instead.
 */
#define QPEL_OP(OPNAME, RND, MMX)                                           \
static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                  \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t temp[8];                                                       \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,            \
                                                   stride, 8);              \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                     \
                                        stride, stride, 8);                 \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,        \
                                                   stride, 8);              \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t temp[8];                                                       \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,            \
                                                   stride, 8);              \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,         \
                                        stride, 8);                         \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t temp[8];                                                       \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,               \
                                                   8, stride);              \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                     \
                                        stride, stride, 8);                 \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src,                \
                                                   stride, stride);         \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t temp[8];                                                       \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,               \
                                                   8, stride);              \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,    \
                                        stride, 8);                         \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH  = (uint8_t *) half + 64;                          \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,               \
                                        stride, 9);                         \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);    \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,                 \
                                        stride, 8, 8);                      \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH  = (uint8_t *) half + 64;                          \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,           \
                                        stride, 9);                         \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);    \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,                 \
                                        stride, 8, 8);                      \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH  = (uint8_t *) half + 64;                          \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,               \
                                        stride, 9);                         \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);    \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,             \
                                        stride, 8, 8);                      \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH  = (uint8_t *) half + 64;                          \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,           \
                                        stride, 9);                         \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);    \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,             \
                                        stride, 8, 8);                      \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH  = (uint8_t *) half + 64;                          \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);    \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,                 \
                                        stride, 8, 8);                      \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH  = (uint8_t *) half + 64;                          \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);    \
    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,             \
                                        stride, 8, 8);                      \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH = (uint8_t *) half;                                \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,                  \
                                        8, stride, 9);                      \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,              \
                                                   stride, 8);              \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[8 + 9];                                                   \
    uint8_t *const halfH = (uint8_t *) half;                                \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,           \
                                        stride, 9);                         \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,              \
                                                   stride, 8);              \
}                                                                           \
                                                                            \
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,        \
                                         ptrdiff_t stride)                  \
{                                                                           \
    uint64_t half[9];                                                       \
    uint8_t *const halfH = (uint8_t *) half;                                \
    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,           \
                                                   stride, 9);              \
    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,              \
                                                   stride, 8);              \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t temp[32];                                                      \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,          \
                                                    stride, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,            \
                                         stride, 16);                       \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,               \
                                                    stride, stride, 16);    \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t temp[32];                                                      \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,          \
                                                    stride, 16);            \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                \
                                         stride, stride, 16);               \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t temp[32];                                                      \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,          \
                                                    stride);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,            \
                                         stride, 16);                       \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src,               \
                                                    stride, stride);        \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t temp[32];                                                      \
    uint8_t *const half = (uint8_t *) temp;                                 \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,          \
                                                    stride);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + stride, half,           \
                                         stride, stride, 16);               \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[16 * 2 + 17 * 2];                                         \
    uint8_t *const halfH  = (uint8_t *) half + 256;                         \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,             \
                                         stride, 17);                       \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,          \
                                                    16, 16);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,                \
                                         stride, 16, 16);                   \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[16 * 2 + 17 * 2];                                         \
    uint8_t *const halfH  = (uint8_t *) half + 256;                         \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,         \
                                         stride, 17);                       \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,          \
                                                    16, 16);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,                \
                                         stride, 16, 16);                   \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[16 * 2 + 17 * 2];                                         \
    uint8_t *const halfH  = (uint8_t *) half + 256;                         \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,             \
                                         stride, 17);                       \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,          \
                                                    16, 16);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,           \
                                         stride, 16, 16);                   \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[16 * 2 + 17 * 2];                                         \
    uint8_t *const halfH  = (uint8_t *) half + 256;                         \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,         \
                                         stride, 17);                       \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,          \
                                                    16, 16);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,           \
                                         stride, 16, 16);                   \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[16 * 2 + 17 * 2];                                         \
    uint8_t *const halfH  = (uint8_t *) half + 256;                         \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,          \
                                                    16, 16);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,                \
                                         stride, 16, 16);                   \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[16 * 2 + 17 * 2];                                         \
    uint8_t *const halfH  = (uint8_t *) half + 256;                         \
    uint8_t *const halfHV = (uint8_t *) half;                               \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,          \
                                                    16, 16);                \
    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,           \
                                         stride, 16, 16);                   \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[17 * 2];                                                  \
    uint8_t *const halfH = (uint8_t *) half;                                \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,             \
                                         stride, 17);                       \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,             \
                                                    stride, 16);            \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[17 * 2];                                                  \
    uint8_t *const halfH = (uint8_t *) half;                                \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,         \
                                         stride, 17);                       \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,             \
                                                    stride, 16);            \
}                                                                           \
                                                                            \
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,       \
                                          ptrdiff_t stride)                 \
{                                                                           \
    uint64_t half[17 * 2];                                                  \
    uint8_t *const halfH = (uint8_t *) half;                                \
    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,         \
                                                    stride, 17);            \
    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,             \
                                                    stride, 16);            \
}

/* Instantiate put, avg and no-rounding-put variants for mmxext. */
QPEL_OP(put_,        _,        mmxext)
QPEL_OP(avg_,        _,        mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)

#endif /* HAVE_YASM */
/*
 * Fill one 16-entry quarter-pel function table in the DSPContext:
 * c->PFX_pixels_tab[IDX][i] gets PREFIX PFX SIZE _mcXY_ CPU, where the
 * table index encodes the quarter-pel phase as i = 4*Y + X (X from the
 * tens digit, Y from the units digit of mcXY below).
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                          \
    do {                                                                     \
        c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
        c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
    } while (0)
/**
 * Install MMX implementations into the DSPContext.
 * cpu_flags and avctx are unused here (kept for signature parity with
 * the other dsputil_init_* helpers in this file).
 */
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
                                     int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_MMX_INLINE
    c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
    c->add_pixels_clamped = ff_add_pixels_clamped_mmx;

    /* edge drawing only for 8-bit content */
    if (!high_bit_depth) {
        c->draw_edges = ff_draw_edges_mmx;
    }

/* MMX gmc is only used when the SSE version is unavailable:
 * on 32-bit builds or when yasm-built code is disabled. */
#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
    c->gmc = ff_gmc_mmx;
#endif
#endif /* HAVE_MMX_INLINE */

#if HAVE_MMX_EXTERNAL
    if (!high_bit_depth) {
        c->clear_block  = ff_clear_block_mmx;
        c->clear_blocks = ff_clear_blocks_mmx;
    }
    c->vector_clip_int32         = ff_vector_clip_int32_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
  526. static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
  527. int cpu_flags, unsigned high_bit_depth)
  528. {
  529. #if HAVE_MMXEXT_INLINE
  530. if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
  531. c->idct_put = ff_idct_xvid_mmxext_put;
  532. c->idct_add = ff_idct_xvid_mmxext_add;
  533. c->idct = ff_idct_xvid_mmxext;
  534. }
  535. #endif /* HAVE_MMXEXT_INLINE */
  536. #if HAVE_MMXEXT_EXTERNAL
  537. SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
  538. SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
  539. SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
  540. SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
  541. SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
  542. SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
  543. c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
  544. c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
  545. #endif /* HAVE_MMXEXT_EXTERNAL */
  546. }
/**
 * Install SSE implementations into the DSPContext.
 * cpu_flags and high_bit_depth gate individual entries below.
 */
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
                                     int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_YASM
#if HAVE_SSE_EXTERNAL
    c->vector_clipf = ff_vector_clipf_sse;

    /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
    if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
        return; /* NOTE: this also skips the ff_gmc_sse assignment below */

    if (!high_bit_depth) {
        c->clear_block  = ff_clear_block_sse;
        c->clear_blocks = ff_clear_blocks_sse;
    }
#endif
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
    c->gmc = ff_gmc_sse;
#endif
#endif /* HAVE_YASM */
}
  566. static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
  567. int cpu_flags, unsigned high_bit_depth)
  568. {
  569. #if HAVE_SSE2_INLINE
  570. if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
  571. c->idct_put = ff_idct_xvid_sse2_put;
  572. c->idct_add = ff_idct_xvid_sse2_add;
  573. c->idct = ff_idct_xvid_sse2;
  574. c->idct_permutation_type = FF_SSE2_IDCT_PERM;
  575. }
  576. #endif /* HAVE_SSE2_INLINE */
  577. #if HAVE_SSE2_EXTERNAL
  578. c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
  579. c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
  580. if (cpu_flags & AV_CPU_FLAG_ATOM) {
  581. c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
  582. } else {
  583. c->vector_clip_int32 = ff_vector_clip_int32_sse2;
  584. }
  585. c->bswap_buf = ff_bswap32_buf_sse2;
  586. c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
  587. #endif /* HAVE_SSE2_EXTERNAL */
  588. }
  589. static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
  590. int cpu_flags, unsigned high_bit_depth)
  591. {
  592. #if HAVE_SSSE3_EXTERNAL
  593. if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
  594. c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
  595. c->bswap_buf = ff_bswap32_buf_ssse3;
  596. #endif /* HAVE_SSSE3_EXTERNAL */
  597. }
/**
 * Install SSE4 implementations into the DSPContext.
 * avctx, cpu_flags and high_bit_depth are unused here (signature parity
 * with the other dsputil_init_* helpers).
 */
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
                                      int cpu_flags, unsigned high_bit_depth)
{
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
  605. av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
  606. unsigned high_bit_depth)
  607. {
  608. int cpu_flags = av_get_cpu_flags();
  609. if (X86_MMX(cpu_flags)) {
  610. #if HAVE_INLINE_ASM
  611. const int idct_algo = avctx->idct_algo;
  612. if (avctx->lowres == 0 && !high_bit_depth) {
  613. if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
  614. c->idct_put = ff_simple_idct_put_mmx;
  615. c->idct_add = ff_simple_idct_add_mmx;
  616. c->idct = ff_simple_idct_mmx;
  617. c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
  618. } else if (idct_algo == FF_IDCT_XVIDMMX) {
  619. c->idct_put = ff_idct_xvid_mmx_put;
  620. c->idct_add = ff_idct_xvid_mmx_add;
  621. c->idct = ff_idct_xvid_mmx;
  622. }
  623. }
  624. #endif /* HAVE_INLINE_ASM */
  625. dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth);
  626. }
  627. if (X86_MMXEXT(cpu_flags))
  628. dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth);
  629. if (X86_SSE(cpu_flags))
  630. dsputil_init_sse(c, avctx, cpu_flags, high_bit_depth);
  631. if (X86_SSE2(cpu_flags))
  632. dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth);
  633. if (EXTERNAL_SSSE3(cpu_flags))
  634. dsputil_init_ssse3(c, avctx, cpu_flags, high_bit_depth);
  635. if (EXTERNAL_SSE4(cpu_flags))
  636. dsputil_init_sse4(c, avctx, cpu_flags, high_bit_depth);
  637. if (CONFIG_ENCODERS)
  638. ff_dsputilenc_init_mmx(c, avctx, high_bit_depth);
  639. }