/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_mmx.h"

#if HAVE_INLINE_ASM

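/* get_pixels: convert an 8x8 block of unsigned 8-bit pixels into 16-bit
 * DCTELEMs in block[]; the MMX loop unpacks and stores two source rows
 * (32 bytes of coefficients) per iteration. */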
static void get_pixels_mmx(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "mov $-128, %%"REG_a" \n\t"
        "pxor %%mm7, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%0, %2), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "movq %%mm0, (%1, %%"REG_a") \n\t"
        "movq %%mm1, 8(%1, %%"REG_a") \n\t"
        "movq %%mm2, 16(%1, %%"REG_a") \n\t"
        "movq %%mm3, 24(%1, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add $32, %%"REG_a" \n\t"
        "js 1b \n\t"
        : "+r" (pixels)
        : "r" (block+64), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*2)
        : "%"REG_a
    );
}

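/* Same conversion as get_pixels_mmx, but using SSE2: four rows are unpacked
 * per pass with 128-bit stores, so the whole 8x8 block is handled without a
 * loop. */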
static void get_pixels_sse2(DCTELEM *block, const uint8_t *pixels, int line_size)
{
    __asm__ volatile(
        "pxor %%xmm4, %%xmm4 \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "lea (%0,%2,4), %0 \n\t"
        "punpcklbw %%xmm4, %%xmm0 \n\t"
        "punpcklbw %%xmm4, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm2 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm0, (%1) \n\t"
        "movdqa %%xmm1, 16(%1) \n\t"
        "movdqa %%xmm2, 32(%1) \n\t"
        "movdqa %%xmm3, 48(%1) \n\t"
        "movq (%0), %%xmm0 \n\t"
        "movq (%0, %2), %%xmm1 \n\t"
        "movq (%0, %2,2), %%xmm2 \n\t"
        "movq (%0, %3), %%xmm3 \n\t"
        "punpcklbw %%xmm4, %%xmm0 \n\t"
        "punpcklbw %%xmm4, %%xmm1 \n\t"
        "punpcklbw %%xmm4, %%xmm2 \n\t"
        "punpcklbw %%xmm4, %%xmm3 \n\t"
        "movdqa %%xmm0, 64(%1) \n\t"
        "movdqa %%xmm1, 80(%1) \n\t"
        "movdqa %%xmm2, 96(%1) \n\t"
        "movdqa %%xmm3, 112(%1) \n\t"
        : "+r" (pixels)
        : "r" (block), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3)
    );
}

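/* diff_pixels: write the 16-bit differences s1[i] - s2[i] of two 8x8 pixel
 * blocks into block[], one 8-pixel row per loop iteration. */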
static inline void diff_pixels_mmx(DCTELEM *block, const uint8_t *s1, const uint8_t *s2, int stride)
{
    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "mov $-128, %%"REG_a" \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        "movq (%0), %%mm0 \n\t"
        "movq (%1), %%mm2 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "psubw %%mm2, %%mm0 \n\t"
        "psubw %%mm3, %%mm1 \n\t"
        "movq %%mm0, (%2, %%"REG_a") \n\t"
        "movq %%mm1, 8(%2, %%"REG_a") \n\t"
        "add %3, %0 \n\t"
        "add %3, %1 \n\t"
        "add $16, %%"REG_a" \n\t"
        "jnz 1b \n\t"
        : "+r" (s1), "+r" (s2)
        : "r" (block+64), "r" ((x86_reg)stride)
        : "%"REG_a
    );
}

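/* pix_sum16: sum of all 256 pixels of a 16x16 block. Accumulation happens in
 * 16-bit lanes; the largest possible sum (255 * 256) still fits in 16 bits,
 * hence the final mask with 0xFFFF. */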
static int pix_sum16_mmx(uint8_t * pix, int line_size){
    const int h=16;
    int sum;
    x86_reg index= -line_size*h;

    __asm__ volatile(
        "pxor %%mm7, %%mm7 \n\t"
        "pxor %%mm6, %%mm6 \n\t"
        "1: \n\t"
        "movq (%2, %1), %%mm0 \n\t"
        "movq (%2, %1), %%mm1 \n\t"
        "movq 8(%2, %1), %%mm2 \n\t"
        "movq 8(%2, %1), %%mm3 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "punpckhbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm2 \n\t"
        "punpckhbw %%mm7, %%mm3 \n\t"
        "paddw %%mm0, %%mm1 \n\t"
        "paddw %%mm2, %%mm3 \n\t"
        "paddw %%mm1, %%mm3 \n\t"
        "paddw %%mm3, %%mm6 \n\t"
        "add %3, %1 \n\t"
        " js 1b \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $32, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movq %%mm6, %%mm5 \n\t"
        "psrlq $16, %%mm6 \n\t"
        "paddw %%mm5, %%mm6 \n\t"
        "movd %%mm6, %0 \n\t"
        "andl $0xFFFF, %0 \n\t"
        : "=&r" (sum), "+r" (index)
        : "r" (pix - index), "r" ((x86_reg)line_size)
    );
    return sum;
}

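/* pix_norm1: sum of squared pixel values over a 16x16 block, one 16-pixel
 * row per iteration, squared and accumulated with pmaddwd. */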
static int pix_norm1_mmx(uint8_t *pix, int line_size) {
    int tmp;
    __asm__ volatile (
        "movl $16,%%ecx\n"
        "pxor %%mm0,%%mm0\n"
        "pxor %%mm7,%%mm7\n"
        "1:\n"
        "movq (%0),%%mm2\n"       /* mm2 = pix[0-7] */
        "movq 8(%0),%%mm3\n"      /* mm3 = pix[8-15] */
        "movq %%mm2,%%mm1\n"      /* mm1 = mm2 = pix[0-7] */
        "punpckhbw %%mm0,%%mm1\n" /* mm1 = [pix4-7] */
        "punpcklbw %%mm0,%%mm2\n" /* mm2 = [pix0-3] */
        "movq %%mm3,%%mm4\n"      /* mm4 = mm3 = pix[8-15] */
        "punpckhbw %%mm0,%%mm3\n" /* mm3 = [pix12-15] */
        "punpcklbw %%mm0,%%mm4\n" /* mm4 = [pix8-11] */
        "pmaddwd %%mm1,%%mm1\n"   /* mm1 = (pix0^2+pix1^2,pix2^2+pix3^2) */
        "pmaddwd %%mm2,%%mm2\n"   /* mm2 = (pix4^2+pix5^2,pix6^2+pix7^2) */
        "pmaddwd %%mm3,%%mm3\n"
        "pmaddwd %%mm4,%%mm4\n"
        "paddd %%mm1,%%mm2\n"     /* mm2 = (pix0^2+pix1^2+pix4^2+pix5^2,
                                            pix2^2+pix3^2+pix6^2+pix7^2) */
        "paddd %%mm3,%%mm4\n"
        "paddd %%mm2,%%mm7\n"
        "add %2, %0\n"
        "paddd %%mm4,%%mm7\n"
        "dec %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"      /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%1\n"
        : "+r" (pix), "=r"(tmp) : "r" ((x86_reg)line_size) : "%ecx" );
    return tmp;
}

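/* sse8: sum of squared errors between two 8-pixel-wide blocks over h rows.
 * The absolute difference is built from two saturated subtractions plus por,
 * then squared and accumulated with pmaddwd; two rows are handled per
 * iteration. */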
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "shr $1,%%ecx\n"
        "pxor %%mm0,%%mm0\n"    /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"    /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"     /* mm1 = pix1[0][0-7] */
        "movq (%1),%%mm2\n"     /* mm2 = pix2[0][0-7] */
        "movq (%0,%3),%%mm3\n"  /* mm3 = pix1[1][0-7] */
        "movq (%1,%3),%%mm4\n"  /* mm4 = pix2[1][0-7] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"
        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"
        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"
        "lea (%0,%3,2), %0\n"   /* pix1 += 2*line_size */
        "lea (%1,%3,2), %1\n"   /* pix2 += 2*line_size */
        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"
        "decl %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

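/* sse16: same as sse8 but for 16-pixel-wide blocks, one row per iteration. */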
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm0,%%mm0\n"    /* mm0 = 0 */
        "pxor %%mm7,%%mm7\n"    /* mm7 holds the sum */
        "1:\n"
        "movq (%0),%%mm1\n"     /* mm1 = pix1[0-7] */
        "movq (%1),%%mm2\n"     /* mm2 = pix2[0-7] */
        "movq 8(%0),%%mm3\n"    /* mm3 = pix1[8-15] */
        "movq 8(%1),%%mm4\n"    /* mm4 = pix2[8-15] */
        /* todo: mm1-mm2, mm3-mm4 */
        /* algo: subtract mm1 from mm2 with saturation and vice versa */
        /* OR the results to get absolute difference */
        "movq %%mm1,%%mm5\n"
        "movq %%mm3,%%mm6\n"
        "psubusb %%mm2,%%mm1\n"
        "psubusb %%mm4,%%mm3\n"
        "psubusb %%mm5,%%mm2\n"
        "psubusb %%mm6,%%mm4\n"
        "por %%mm1,%%mm2\n"
        "por %%mm3,%%mm4\n"
        /* now convert to 16-bit vectors so we can square them */
        "movq %%mm2,%%mm1\n"
        "movq %%mm4,%%mm3\n"
        "punpckhbw %%mm0,%%mm2\n"
        "punpckhbw %%mm0,%%mm4\n"
        "punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
        "punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
        "pmaddwd %%mm2,%%mm2\n"
        "pmaddwd %%mm4,%%mm4\n"
        "pmaddwd %%mm1,%%mm1\n"
        "pmaddwd %%mm3,%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "paddd %%mm2,%%mm1\n"
        "paddd %%mm4,%%mm3\n"
        "paddd %%mm1,%%mm7\n"
        "paddd %%mm3,%%mm7\n"
        "decl %%ecx\n"
        "jnz 1b\n"
        "movq %%mm7,%%mm1\n"
        "psrlq $32, %%mm7\n"    /* shift hi dword to lo */
        "paddd %%mm7,%%mm1\n"
        "movd %%mm1,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}

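/* hf_noise8/hf_noise16: sum of absolute vertical differences of the
 * horizontal pixel gradient, i.e. a rough measure of high-frequency content.
 * Used below as the noise term of the NSSE comparison functions. */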
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq %%mm0, %%mm1\n"
        "psllq $8, %%mm0\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm0\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq %%mm4, %%mm1\n"
        "psllq $8, %%mm4\n"
        "psrlq $8, %%mm1\n"
        "psrlq $8, %%mm4\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp;
}

static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
    int tmp;
    uint8_t * pix= pix1;
    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm7,%%mm7\n"
        "pxor %%mm6,%%mm6\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "1:\n"

        "movq (%0),%%mm0\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm0, %%mm2\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm0\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm2\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm0\n"
        "psubw %%mm3, %%mm2\n"
        "psubw %%mm0, %%mm4\n"
        "psubw %%mm2, %%mm5\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm4, %%mm3\n\t"
        "pcmpgtw %%mm5, %%mm1\n\t"
        "pxor %%mm3, %%mm4\n"
        "pxor %%mm1, %%mm5\n"
        "psubw %%mm3, %%mm4\n"
        "psubw %%mm1, %%mm5\n"
        "paddw %%mm4, %%mm5\n"
        "paddw %%mm5, %%mm6\n"

        "add %2,%0\n"

        "movq (%0),%%mm4\n"
        "movq 1(%0),%%mm1\n"
        "movq %%mm4, %%mm5\n"
        "movq %%mm1, %%mm3\n"
        "punpcklbw %%mm7,%%mm4\n"
        "punpcklbw %%mm7,%%mm1\n"
        "punpckhbw %%mm7,%%mm5\n"
        "punpckhbw %%mm7,%%mm3\n"
        "psubw %%mm1, %%mm4\n"
        "psubw %%mm3, %%mm5\n"
        "psubw %%mm4, %%mm0\n"
        "psubw %%mm5, %%mm2\n"
        "pxor %%mm3, %%mm3\n"
        "pxor %%mm1, %%mm1\n"
        "pcmpgtw %%mm0, %%mm3\n\t"
        "pcmpgtw %%mm2, %%mm1\n\t"
        "pxor %%mm3, %%mm0\n"
        "pxor %%mm1, %%mm2\n"
        "psubw %%mm3, %%mm0\n"
        "psubw %%mm1, %%mm2\n"
        "paddw %%mm0, %%mm2\n"
        "paddw %%mm2, %%mm6\n"

        "add %2,%0\n"
        "subl $2, %%ecx\n"
        " jnz 1b\n"

        "movq %%mm6, %%mm0\n"
        "punpcklwd %%mm7,%%mm0\n"
        "punpckhwd %%mm7,%%mm6\n"
        "paddd %%mm0, %%mm6\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddd %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix1), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "g" (h-2)
        : "%ecx");
    return tmp + hf_noise8_mmx(pix+8, line_size, h);
}

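/* nsse16/nsse8: noise-preserving SSE. Combines the plain SSE score with the
 * absolute difference of the two blocks' high-frequency noise measures,
 * weighted by avctx->nsse_weight (or 8 when no context is available). */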
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1, score2;

    if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
    else  score1 = sse16_mmx(c, pix1, pix2, line_size, h);
    score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    MpegEncContext *c = p;
    int score1= sse8_mmx(c, pix1, pix2, line_size, h);
    int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

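/* vsad_intra16: sum of absolute differences between vertically adjacent rows
 * within a single 16-pixel-wide block (no reference block involved). */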
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), %%mm2\n"\
    "movq 8(%0), %%mm3\n"\
    "add %2,%0\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0xFFFF;
}
#undef SUM

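/* MMX2 version of vsad_intra16: psadbw performs the byte-wise absolute
 * difference and the horizontal accumulation in a single instruction. */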
static int vsad_intra16_mmx2(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
    int tmp;

    assert( (((int)pix) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0), " #out0 "\n"\
    "movq 8(%0), " #out1 "\n"\
    "add %2,%0\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %3,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pxor %%mm7,%%mm7\n"
        "movq (%0),%%mm0\n"
        "movq 8(%0),%%mm1\n"
        "add %2,%0\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%1\n"
        : "+r" (pix), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

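/* vsad16: vertical SAD of the difference signal pix1 - pix2. The byte
 * differences are biased by 0x80 (pxor with mm7 = 0x8080...) so that signed
 * values can be compared with the unsigned saturating subtractions below. */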
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0),%%mm2\n"\
    "movq (%1)," #out0 "\n"\
    "movq 8(%0),%%mm3\n"\
    "movq 8(%1)," #out1 "\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb " #out0 ", %%mm2\n"\
    "psubb " #out1 ", %%mm3\n"\
    "pxor %%mm7, %%mm2\n"\
    "pxor %%mm7, %%mm3\n"\
    "movq %%mm2, " #out0 "\n"\
    "movq %%mm3, " #out1 "\n"\
    "psubusb " #in0 ", %%mm2\n"\
    "psubusb " #in1 ", %%mm3\n"\
    "psubusb " #out0 ", " #in0 "\n"\
    "psubusb " #out1 ", " #in1 "\n"\
    "por %%mm2, " #in0 "\n"\
    "por %%mm3, " #in1 "\n"\
    "movq " #in0 ", %%mm2\n"\
    "movq " #in1 ", %%mm3\n"\
    "punpcklbw %%mm7, " #in0 "\n"\
    "punpcklbw %%mm7, " #in1 "\n"\
    "punpckhbw %%mm7, %%mm2\n"\
    "punpckhbw %%mm7, %%mm3\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw %%mm3, %%mm2\n"\
    "paddw %%mm2, " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movq %%mm6,%%mm0\n"
        "psrlq $32, %%mm6\n"
        "paddw %%mm6,%%mm0\n"
        "movq %%mm0,%%mm6\n"
        "psrlq $16, %%mm0\n"
        "paddw %%mm6,%%mm0\n"
        "movd %%mm0,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp & 0x7FFF;
}
#undef SUM

static int vsad16_mmx2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
    int tmp;

    assert( (((int)pix1) & 7) == 0);
    assert( (((int)pix2) & 7) == 0);
    assert((line_size &7) ==0);

#define SUM(in0, in1, out0, out1) \
    "movq (%0)," #out0 "\n"\
    "movq (%1),%%mm2\n"\
    "movq 8(%0)," #out1 "\n"\
    "movq 8(%1),%%mm3\n"\
    "add %3,%0\n"\
    "add %3,%1\n"\
    "psubb %%mm2, " #out0 "\n"\
    "psubb %%mm3, " #out1 "\n"\
    "pxor %%mm7, " #out0 "\n"\
    "pxor %%mm7, " #out1 "\n"\
    "psadbw " #out0 ", " #in0 "\n"\
    "psadbw " #out1 ", " #in1 "\n"\
    "paddw " #in1 ", " #in0 "\n"\
    "paddw " #in0 ", %%mm6\n"

    __asm__ volatile (
        "movl %4,%%ecx\n"
        "pxor %%mm6,%%mm6\n"
        "pcmpeqw %%mm7,%%mm7\n"
        "psllw $15, %%mm7\n"
        "packsswb %%mm7, %%mm7\n"
        "movq (%0),%%mm0\n"
        "movq (%1),%%mm2\n"
        "movq 8(%0),%%mm1\n"
        "movq 8(%1),%%mm3\n"
        "add %3,%0\n"
        "add %3,%1\n"
        "psubb %%mm2, %%mm0\n"
        "psubb %%mm3, %%mm1\n"
        "pxor %%mm7, %%mm0\n"
        "pxor %%mm7, %%mm1\n"
        "jmp 2f\n"
        "1:\n"

        SUM(%%mm4, %%mm5, %%mm0, %%mm1)
        "2:\n"
        SUM(%%mm0, %%mm1, %%mm4, %%mm5)

        "subl $2, %%ecx\n"
        "jnz 1b\n"

        "movd %%mm6,%2\n"
        : "+r" (pix1), "+r" (pix2), "=r"(tmp)
        : "r" ((x86_reg)line_size) , "m" (h)
        : "%ecx");
    return tmp;
}
#undef SUM

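/* diff_bytes: dst[i] = src1[i] - src2[i] for w bytes, 16 bytes per MMX
 * iteration with a scalar loop for the remainder. */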
static void diff_bytes_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    x86_reg i=0;
    __asm__ volatile(
        "1: \n\t"
        "movq (%2, %0), %%mm0 \n\t"
        "movq (%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, (%3, %0) \n\t"
        "movq 8(%2, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
        "psubb %%mm0, %%mm1 \n\t"
        "movq %%mm1, 8(%3, %0) \n\t"
        "add $16, %0 \n\t"
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

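/* sub_hfyu_median_prediction: subtract the HuffYUV median predictor
 * (median of left, top and left + top - topleft) from src2 and store the
 * residual in dst; the first pixel and the left/left_top state are handled
 * in C after the asm loop. */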
static void sub_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    x86_reg i=0;
    uint8_t l, lt;

    __asm__ volatile(
        "movq (%1, %0), %%mm0 \n\t" // LT
        "psllq $8, %%mm0 \n\t"
        "1: \n\t"
        "movq (%1, %0), %%mm1 \n\t" // T
        "movq -1(%2, %0), %%mm2 \n\t" // L
        "movq (%2, %0), %%mm3 \n\t" // X
        "movq %%mm2, %%mm4 \n\t" // L
        "psubb %%mm0, %%mm2 \n\t"
        "paddb %%mm1, %%mm2 \n\t" // L + T - LT
        "movq %%mm4, %%mm5 \n\t" // L
        "pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
        "pminub %%mm5, %%mm1 \n\t" // min(T, L)
        "pminub %%mm2, %%mm4 \n\t"
        "pmaxub %%mm1, %%mm4 \n\t"
        "psubb %%mm4, %%mm3 \n\t" // dst - pred
        "movq %%mm3, (%3, %0) \n\t"
        "add $8, %0 \n\t"
        "movq -1(%1, %0), %%mm0 \n\t" // LT
        "cmp %4, %0 \n\t"
        " jb 1b \n\t"
        : "+r" (i)
        : "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
    );

    l= *left;
    lt= *left_top;

    dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);

    *left_top= src1[w-1];
    *left = src2[w-1];
}

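/* MMABS_* compute the per-word absolute value of an MMX/SSE2 register using
 * the best instruction available for each CPU generation (plain MMX, pmaxsw
 * on MMX2, pabsw on SSSE3); MMABS_SUM additionally accumulates the result. */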
#define MMABS_MMX(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "pcmpgtw " #a ", " #z " \n\t"\
    "pxor " #z ", " #a " \n\t"\
    "psubw " #z ", " #a " \n\t"

#define MMABS_MMX2(a,z)\
    "pxor " #z ", " #z " \n\t"\
    "psubw " #a ", " #z " \n\t"\
    "pmaxsw " #z ", " #a " \n\t"

#define MMABS_SSSE3(a,z)\
    "pabsw " #a ", " #a " \n\t"

#define MMABS_SUM(a,z, sum)\
    MMABS(a,z)\
    "paddusw " #a ", " #sum " \n\t"

/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
 * about 100k on extreme inputs. But that's very unlikely to occur in natural video,
 * and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movq "#a", "#t" \n\t"\
    "psrlq $16, "#a" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_MMX2(a, t, dst)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshufw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define HSUM_SSE2(a, t, dst)\
    "movhlps "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x0E, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "pshuflw $0x01, "#a", "#t" \n\t"\
    "paddusw "#t", "#a" \n\t"\
    "movd "#a", "#dst" \n\t"\

#define DCT_SAD4(m,mm,o)\
    "mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
    "mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
    "mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
    "mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
    MMABS_SUM(mm##2, mm##6, mm##0)\
    MMABS_SUM(mm##3, mm##7, mm##1)\
    MMABS_SUM(mm##4, mm##6, mm##0)\
    MMABS_SUM(mm##5, mm##7, mm##1)\

#define DCT_SAD_MMX\
    "pxor %%mm0, %%mm0 \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    DCT_SAD4(q, %%mm, 0)\
    DCT_SAD4(q, %%mm, 8)\
    DCT_SAD4(q, %%mm, 64)\
    DCT_SAD4(q, %%mm, 72)\
    "paddusw %%mm1, %%mm0 \n\t"\
    HSUM(%%mm0, %%mm1, %0)

#define DCT_SAD_SSE2\
    "pxor %%xmm0, %%xmm0 \n\t"\
    "pxor %%xmm1, %%xmm1 \n\t"\
    DCT_SAD4(dqa, %%xmm, 0)\
    DCT_SAD4(dqa, %%xmm, 64)\
    "paddusw %%xmm1, %%xmm0 \n\t"\
    HSUM(%%xmm0, %%xmm1, %0)

#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(DCTELEM *block){\
    int sum;\
    __asm__ volatile(\
        DCT_SAD\
        :"=r"(sum)\
        :"r"(block)\
    );\
    return sum&0xFFFF;\
}

#define DCT_SAD       DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z)    MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM

#define HSUM(a,t,dst) HSUM_MMX2(a,t,dst)
#define MMABS(a,z)    MMABS_MMX2(a,z)
DCT_SAD_FUNC(mmx2)
#undef HSUM
#undef DCT_SAD

#define DCT_SAD       DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS

#if HAVE_SSSE3_INLINE
#define MMABS(a,z)    MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD

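/* ssd_int8_vs_int16: sum of squared differences between an int8_t array and
 * an int16_t array of the same size, 8 elements per iteration. */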
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
    int sum;
    x86_reg i=size;
    __asm__ volatile(
        "pxor %%mm4, %%mm4 \n"
        "1: \n"
        "sub $8, %0 \n"
        "movq (%2,%0), %%mm2 \n"
        "movq (%3,%0,2), %%mm0 \n"
        "movq 8(%3,%0,2), %%mm1 \n"
        "punpckhbw %%mm2, %%mm3 \n"
        "punpcklbw %%mm2, %%mm2 \n"
        "psraw $8, %%mm3 \n"
        "psraw $8, %%mm2 \n"
        "psubw %%mm3, %%mm1 \n"
        "psubw %%mm2, %%mm0 \n"
        "pmaddwd %%mm1, %%mm1 \n"
        "pmaddwd %%mm0, %%mm0 \n"
        "paddd %%mm1, %%mm4 \n"
        "paddd %%mm0, %%mm4 \n"
        "jg 1b \n"
        "movq %%mm4, %%mm3 \n"
        "psrlq $32, %%mm3 \n"
        "paddd %%mm3, %%mm4 \n"
        "movd %%mm4, %1 \n"
        :"+r"(i), "=r"(sum)
        :"r"(pix1), "r"(pix2)
    );
    return sum;
}

#define PHADDD(a, t)\
    "movq "#a", "#t" \n\t"\
    "psrlq $32, "#a" \n\t"\
    "paddd "#t", "#a" \n\t"

/*
   pmulhw:   dst[0-15]=(src[0-15]*dst[0-15])[16-31]
   pmulhrw:  dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
   pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
#define PMULHRW(x, y, s, o)\
    "pmulhw " #s ", "#x " \n\t"\
    "pmulhw " #s ", "#y " \n\t"\
    "paddw " #o ", "#x " \n\t"\
    "paddw " #o ", "#y " \n\t"\
    "psraw $1, "#x " \n\t"\
    "psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
    "pmulhrw " #s ", "#x " \n\t"\
    "pmulhrw " #s ", "#y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW

#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
    "pshufw $0x0E, "#a", "#t" \n\t"\
    "paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
    "pmulhrsw " #s ", "#x " \n\t"\
    "pmulhrsw " #s ", "#y " \n\t"

#include "dsputil_qns_template.c"

#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */

#endif /* HAVE_INLINE_ASM */

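/* The functions below are implemented in external assembly rather than
 * inline asm, so they remain available even when HAVE_INLINE_ASM is 0. */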
int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);

#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
                             int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
                              int stride, int h);

hadamard_func(mmx)
hadamard_func(mmx2)
hadamard_func(sse2)
hadamard_func(ssse3)

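/* Runtime dispatch: fill the DSPContext function pointers according to the
 * detected CPU flags and the codec settings (bit depth, bitexact mode, DCT
 * algorithm selection). */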
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

#if HAVE_INLINE_ASM
    int bit_depth = avctx->bits_per_raw_sample;

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int dct_algo = avctx->dct_algo;
        if (avctx->bits_per_raw_sample <= 8 &&
            (dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX)) {
            if(mm_flags & AV_CPU_FLAG_SSE2){
                c->fdct = ff_fdct_sse2;
            } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
                c->fdct = ff_fdct_mmx2;
            }else{
                c->fdct = ff_fdct_mmx;
            }
        }

        if (bit_depth <= 8)
            c->get_pixels = get_pixels_mmx;
        c->diff_pixels = diff_pixels_mmx;
        c->pix_sum = pix_sum16_mmx;
        c->diff_bytes= diff_bytes_mmx;
        c->sum_abs_dctelem= sum_abs_dctelem_mmx;

        c->pix_norm1 = pix_norm1_mmx;
        c->sse[0] = sse16_mmx;
        c->sse[1] = sse8_mmx;
        c->vsad[4]= vsad_intra16_mmx;

        c->nsse[0] = nsse16_mmx;
        c->nsse[1] = nsse8_mmx;
        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->vsad[0] = vsad16_mmx;
        }

        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->try_8x8basis= try_8x8basis_mmx;
        }
        c->add_8x8basis= add_8x8basis_mmx;

        c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;

        if (mm_flags & AV_CPU_FLAG_MMXEXT) {
            c->sum_abs_dctelem= sum_abs_dctelem_mmx2;
            c->vsad[4]= vsad_intra16_mmx2;

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->vsad[0] = vsad16_mmx2;
            }

            c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
        }

        if(mm_flags & AV_CPU_FLAG_SSE2){
            if (bit_depth <= 8)
                c->get_pixels = get_pixels_sse2;
            c->sum_abs_dctelem= sum_abs_dctelem_sse2;
        }

#if HAVE_SSSE3_INLINE
        if(mm_flags & AV_CPU_FLAG_SSSE3){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_ssse3;
            }
            c->add_8x8basis= add_8x8basis_ssse3;
            c->sum_abs_dctelem= sum_abs_dctelem_ssse3;
        }
#endif

        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->try_8x8basis= try_8x8basis_3dnow;
            }
            c->add_8x8basis= add_8x8basis_3dnow;
        }
    }
#endif /* HAVE_INLINE_ASM */

    if (EXTERNAL_MMX(mm_flags)) {
        c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
        c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;

        if (EXTERNAL_MMXEXT(mm_flags)) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_mmx2;
        }

        if (EXTERNAL_SSE2(mm_flags)) {
            c->sse[0] = ff_sse16_sse2;

#if HAVE_ALIGNED_STACK
            c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
            c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
        }

        if (EXTERNAL_SSSE3(mm_flags) && HAVE_ALIGNED_STACK) {
            c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
            c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
        }
    }

    ff_dsputil_init_pix_mmx(c, avctx);
}