You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

521 lines
16KB

  1. /*
  2. * The simplest mpeg encoder (well, it was the simplest!)
  3. * Copyright (c) 2000,2001 Fabrice Bellard.
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. *
  19. * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
  20. * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
  21. */
  22. #include "../dsputil.h"
  23. #include "../mpegvideo.h"
  24. #include "../avcodec.h"
  /* Lookup tables defined in another translation unit. None of them is
     referenced by the functions visible in this file; presumably they are
     used by the included mpegvideo_mmx_template.c -- TODO confirm. */
  25. extern uint8_t zigzag_direct_noperm[64];
  26. extern uint16_t inv_zigzag_direct16[64];
  27. extern uint32_t inverse[256];
  /* 8-byte aligned 64-bit constants: mm_wabs has every bit set, mm_wone holds
     the value 1 in each of its four 16-bit words. Neither is referenced by the
     functions visible in this file (presumably template users -- TODO confirm). */
  28. static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
  29. static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  /*
   * MMX inverse quantizer installed as s->dct_unquantize_h263.
   *
   * With qmul = 2 * qscale and qadd = (qscale - 1) | 1, the asm loop rewrites
   * every coefficient c of the processed range in place as
   *     c * qmul + qadd   when c > 0
   *     c * qmul - qadd   when c < 0
   *     0                 when c == 0
   * using wrapping 16-bit word arithmetic (pmullw / paddw).
   *
   * s       codec context; reads mb_intra, h263_aic, y_dc_scale, c_dc_scale,
   *         block_last_index[n] and inter_scantable.raster_end[]
   * block   coefficients, modified in place; the asm treats them as 16-bit
   *         words (byte offsets are 2 * index)
   * n       block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
   * qscale  quantizer scale
   *
   * Intra blocks: the DC value is computed in C before the loop (scaled by the
   * DC scale, or left unscaled with qadd forced to 0 when s->h263_aic is set)
   * and written back to block[0] after the loop, overriding what the loop
   * stored there. Intra blocks always process indices 0..63; inter blocks
   * process up to raster_end[block_last_index[n]], rounded up to a whole
   * group of eight words.
   */
  30. static void dct_unquantize_h263_mmx(MpegEncContext *s,
  31. DCTELEM *block, int n, int qscale)
  32. {
  33. int level, qmul, qadd, nCoeffs;
  34. qmul = qscale << 1;
  35. qadd = (qscale - 1) | 1;
  36. assert(s->block_last_index[n]>=0);
  37. if (s->mb_intra) {
  38. if (!s->h263_aic) {
  39. if (n < 4)
  40. level = block[0] * s->y_dc_scale;
  41. else
  42. level = block[0] * s->c_dc_scale;
  43. }else{
  44. qadd = 0;
  45. level= block[0];
  46. }
  47. nCoeffs=63;
  48. } else {
  49. nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
  50. level = 0;/* keep gcc quiet */
  51. }
  52. //printf("%d %d ", qmul, qadd);
      /*
       * Operands: %0 = block + nCoeffs (end pointer), %1 = qmul, %2 = qadd,
       * %3 = -2 * nCoeffs (negative byte offset counted up towards zero).
       * Registers: mm6 = qmul in all four words, mm7 = -qadd in all four
       * words, mm4 = 0.
       * NOTE(review): %3 is declared as an input operand but is modified by
       * the addl below, and the MMX registers are not listed as clobbers;
       * GCC documents that modified operands must be outputs -- consider
       * making %3 an in/out (+r) operand.
       */
  53. asm volatile(
  54. "movd %1, %%mm6 \n\t" //qmul
  55. "packssdw %%mm6, %%mm6 \n\t"
  56. "packssdw %%mm6, %%mm6 \n\t"
  57. "movd %2, %%mm5 \n\t" //qadd
  58. "pxor %%mm7, %%mm7 \n\t"
  59. "packssdw %%mm5, %%mm5 \n\t"
  60. "packssdw %%mm5, %%mm5 \n\t"
  61. "psubw %%mm5, %%mm7 \n\t"
  62. "pxor %%mm4, %%mm4 \n\t"
  63. ".balign 16\n\t"
  64. "1: \n\t"
      /* load eight words and multiply them by qmul */
  65. "movq (%0, %3), %%mm0 \n\t"
  66. "movq 8(%0, %3), %%mm1 \n\t"
  67. "pmullw %%mm6, %%mm0 \n\t"
  68. "pmullw %%mm6, %%mm1 \n\t"
      /* reload the originals to build a per-word mask of positive inputs */
  69. "movq (%0, %3), %%mm2 \n\t"
  70. "movq 8(%0, %3), %%mm3 \n\t"
  71. "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] > 0 ? -1 : 0
  72. "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] > 0 ? -1 : 0
      /* complement positive products, subtract qadd, complement back:
         positives end up as product + qadd, the rest as product - qadd */
  73. "pxor %%mm2, %%mm0 \n\t"
  74. "pxor %%mm3, %%mm1 \n\t"
  75. "paddw %%mm7, %%mm0 \n\t"
  76. "paddw %%mm7, %%mm1 \n\t"
  77. "pxor %%mm0, %%mm2 \n\t"
  78. "pxor %%mm1, %%mm3 \n\t"
      /* a zero input leaves exactly -qadd in mm0/mm1; use that to zero it */
  79. "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
  80. "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
  81. "pandn %%mm2, %%mm0 \n\t"
  82. "pandn %%mm3, %%mm1 \n\t"
  83. "movq %%mm0, (%0, %3) \n\t"
  84. "movq %%mm1, 8(%0, %3) \n\t"
      /* advance 16 bytes; keep looping while the offset is still <= 0 so
         that the group containing index nCoeffs is processed as well */
  85. "addl $16, %3 \n\t"
  86. "jng 1b \n\t"
  87. ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(-nCoeffs))
  88. : "memory"
  89. );
      /* restore the separately computed DC coefficient for intra blocks */
  90. if(s->mb_intra)
  91. block[0]= level;
  92. }
  93. /*
  94. NK:
  95. Note: looking at PARANOID:
  96. "enable all paranoid tests for rounding, overflows, etc..."
  97. #ifdef PARANOID
  98. if (level < -2048 || level > 2047)
  99. fprintf(stderr, "unquant error %d %d\n", i, level);
  100. #endif
  101. We can suppose that the result of the two multiplications can't be greater than 0xFFFF,
  102. i.e. it fits in 16 bits, so we use only the PMULLW instruction here and can avoid
  103. a complex multiplication.
  104. =====================================================
  105. Full formula for the multiplication of 2 integer numbers
  106. which are represented as high:low words:
  107. input: value1 = high1:low1
  108. value2 = high2:low2
  109. output: value3 = value1*value2
  110. value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
  111. this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
  112. but this algorithm will compute only 0x66cb0ce4;
  113. this is limited by the 16-bit size of the operands
  114. ---------------------------------
  115. tlow1 = high1*low2
  116. tlow2 = high2*low1
  117. tlow1 = tlow1 + tlow2
  118. high3:low3 = low1*low2
  119. high3 += tlow1
  120. */
  /*
   * MMX inverse quantizer installed as s->dct_unquantize_mpeg1.
   *
   * With q = qscale * quant_matrix[i] (low 16 bits of the product), every
   * nonzero coefficient c in the processed range is replaced in place by
   *     intra:  sign(c) * ((((|c| * q) >> 3) - 1) | 1)
   *     inter:  sign(c) * (((((2 * |c| + 1) * q) >> 4) - 1) | 1)
   * and zero coefficients stay zero. All arithmetic is wrapping 16-bit word
   * arithmetic; the right shifts are arithmetic (psraw).
   *
   * s       codec context; reads mb_intra, y_dc_scale, c_dc_scale,
   *         intra_matrix / inter_matrix, block_last_index[n] and
   *         intra_scantable.raster_end[]
   * block   coefficients, modified in place; the asm treats them as 16-bit
   *         words (byte offsets are 2 * index)
   * n       block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
   * qscale  quantizer scale
   *
   * Both paths process indices 0 .. raster_end[block_last_index[n]], rounded
   * up to a whole group of eight words. For intra blocks the DC value is
   * computed in C as block[0] times the DC scale and written back after the
   * loop, overriding what the loop stored in block[0].
   */
  121. static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
  122. DCTELEM *block, int n, int qscale)
  123. {
  124. int nCoeffs;
  125. const uint16_t *quant_matrix;
  126. assert(s->block_last_index[n]>=0);
       /* one past the last index to process (note the +1) */
  127. nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
  128. if (s->mb_intra) {
  129. int block0;
  130. if (n < 4)
  131. block0 = block[0] * s->y_dc_scale;
  132. else
  133. block0 = block[0] * s->c_dc_scale;
  134. /* XXX: only mpeg1 */
  135. quant_matrix = s->intra_matrix;
       /*
        * Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs,
        * %2 = qscale, %3 = -2 * nCoeffs (copied into eax, the running
        * negative byte offset).
        * Registers: mm7 = 1 in every word, mm6 = qscale in every word.
        */
  136. asm volatile(
  137. "pcmpeqw %%mm7, %%mm7 \n\t"
  138. "psrlw $15, %%mm7 \n\t"
  139. "movd %2, %%mm6 \n\t"
  140. "packssdw %%mm6, %%mm6 \n\t"
  141. "packssdw %%mm6, %%mm6 \n\t"
  142. "movl %3, %%eax \n\t"
  143. ".balign 16\n\t"
  144. "1: \n\t"
  145. "movq (%0, %%eax), %%mm0 \n\t"
  146. "movq 8(%0, %%eax), %%mm1 \n\t"
  147. "movq (%1, %%eax), %%mm4 \n\t"
  148. "movq 8(%1, %%eax), %%mm5 \n\t"
  149. "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
  150. "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
  151. "pxor %%mm2, %%mm2 \n\t"
  152. "pxor %%mm3, %%mm3 \n\t"
  153. "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
  154. "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
  155. "pxor %%mm2, %%mm0 \n\t"
  156. "pxor %%mm3, %%mm1 \n\t"
  157. "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
  158. "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
  159. "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
  160. "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
  161. "pxor %%mm4, %%mm4 \n\t"
  162. "pxor %%mm5, %%mm5 \n\t" // FIXME slow
  163. "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
  164. "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
  165. "psraw $3, %%mm0 \n\t"
  166. "psraw $3, %%mm1 \n\t"
       /* (x - 1) | 1 : force the magnitude to an odd value */
  167. "psubw %%mm7, %%mm0 \n\t"
  168. "psubw %%mm7, %%mm1 \n\t"
  169. "por %%mm7, %%mm0 \n\t"
  170. "por %%mm7, %%mm1 \n\t"
       /* re-apply the sign saved in mm2/mm3 */
  171. "pxor %%mm2, %%mm0 \n\t"
  172. "pxor %%mm3, %%mm1 \n\t"
  173. "psubw %%mm2, %%mm0 \n\t"
  174. "psubw %%mm3, %%mm1 \n\t"
       /* keep the result only where the input was nonzero */
  175. "pandn %%mm0, %%mm4 \n\t"
  176. "pandn %%mm1, %%mm5 \n\t"
  177. "movq %%mm4, (%0, %%eax) \n\t"
  178. "movq %%mm5, 8(%0, %%eax) \n\t"
       /* next eight words; loop while the offset is still negative */
  179. "addl $16, %%eax \n\t"
  180. "js 1b \n\t"
  181. ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
  182. : "%eax", "memory"
  183. );
       /* restore the separately scaled DC coefficient */
  184. block[0]= block0;
  185. } else {
  186. quant_matrix = s->inter_matrix;
       /* Same operand and register layout as the intra loop above; only
          the magnitude formula differs: (2 * abs + 1) * q, shifted by 4. */
  187. asm volatile(
  188. "pcmpeqw %%mm7, %%mm7 \n\t"
  189. "psrlw $15, %%mm7 \n\t"
  190. "movd %2, %%mm6 \n\t"
  191. "packssdw %%mm6, %%mm6 \n\t"
  192. "packssdw %%mm6, %%mm6 \n\t"
  193. "movl %3, %%eax \n\t"
  194. ".balign 16\n\t"
  195. "1: \n\t"
  196. "movq (%0, %%eax), %%mm0 \n\t"
  197. "movq 8(%0, %%eax), %%mm1 \n\t"
  198. "movq (%1, %%eax), %%mm4 \n\t"
  199. "movq 8(%1, %%eax), %%mm5 \n\t"
  200. "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
  201. "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
  202. "pxor %%mm2, %%mm2 \n\t"
  203. "pxor %%mm3, %%mm3 \n\t"
  204. "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
  205. "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
  206. "pxor %%mm2, %%mm0 \n\t"
  207. "pxor %%mm3, %%mm1 \n\t"
  208. "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
  209. "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
  210. "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
  211. "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
  212. "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
  213. "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
  214. "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
  215. "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
  216. "pxor %%mm4, %%mm4 \n\t"
  217. "pxor %%mm5, %%mm5 \n\t" // FIXME slow
  218. "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
  219. "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
  220. "psraw $4, %%mm0 \n\t"
  221. "psraw $4, %%mm1 \n\t"
       /* (x - 1) | 1 : force the magnitude to an odd value */
  222. "psubw %%mm7, %%mm0 \n\t"
  223. "psubw %%mm7, %%mm1 \n\t"
  224. "por %%mm7, %%mm0 \n\t"
  225. "por %%mm7, %%mm1 \n\t"
       /* re-apply the sign, then zero the lanes whose input was zero */
  226. "pxor %%mm2, %%mm0 \n\t"
  227. "pxor %%mm3, %%mm1 \n\t"
  228. "psubw %%mm2, %%mm0 \n\t"
  229. "psubw %%mm3, %%mm1 \n\t"
  230. "pandn %%mm0, %%mm4 \n\t"
  231. "pandn %%mm1, %%mm5 \n\t"
  232. "movq %%mm4, (%0, %%eax) \n\t"
  233. "movq %%mm5, 8(%0, %%eax) \n\t"
  234. "addl $16, %%eax \n\t"
  235. "js 1b \n\t"
  236. ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
  237. : "%eax", "memory"
  238. );
  239. }
  240. }
  /*
   * MMX inverse quantizer installed as s->dct_unquantize_mpeg2.
   *
   * With q = qscale * quant_matrix[i] (low 16 bits of the product), every
   * nonzero coefficient c in the processed range is replaced in place by
   *     intra:  sign(c) * ((|c| * q) >> 3)           (arithmetic shift)
   *     inter:  sign(c) * ((2 * |c| * q + q) >> 4)   (logical shift)
   * and zero coefficients stay zero. All arithmetic is wrapping 16-bit word
   * arithmetic. Unlike the mpeg1 variant there is no forcing to odd values.
   *
   * The inter path additionally XOR-accumulates every stored word and, after
   * the loop, flips the least significant bit of the word at byte offset 126
   * of block (index 63 with 16-bit elements) depending on the parity of that
   * accumulator. The intra path skips this step; see the mismatch-control
   * note in the intra branch below.
   *
   * s       codec context; reads alternate_scan, mb_intra, y_dc_scale,
   *         c_dc_scale, intra_matrix / inter_matrix, block_last_index[n]
   *         and intra_scantable.raster_end[]
   * block   coefficients, modified in place; treated as 16-bit words
   * n       block index; n < 4 selects y_dc_scale, otherwise c_dc_scale
   * qscale  quantizer scale
   */
  241. static void dct_unquantize_mpeg2_mmx(MpegEncContext *s,
  242. DCTELEM *block, int n, int qscale)
  243. {
  244. int nCoeffs;
  245. const uint16_t *quant_matrix;
  246. assert(s->block_last_index[n]>=0);
       /* last index to process: the whole block when alternate_scan is set */
  247. if(s->alternate_scan) nCoeffs= 63; //FIXME
  248. else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
  249. if (s->mb_intra) {
  250. int block0;
  251. if (n < 4)
  252. block0 = block[0] * s->y_dc_scale;
  253. else
  254. block0 = block[0] * s->c_dc_scale;
  255. quant_matrix = s->intra_matrix;
       /*
        * Operands: %0 = block + nCoeffs, %1 = quant_matrix + nCoeffs,
        * %2 = qscale, %3 = -2 * nCoeffs (copied into eax, the running
        * negative byte offset).
        * Registers: mm6 = qscale in every word. mm7 is set to 1 in every
        * word but is not used inside this loop.
        */
  256. asm volatile(
  257. "pcmpeqw %%mm7, %%mm7 \n\t"
  258. "psrlw $15, %%mm7 \n\t"
  259. "movd %2, %%mm6 \n\t"
  260. "packssdw %%mm6, %%mm6 \n\t"
  261. "packssdw %%mm6, %%mm6 \n\t"
  262. "movl %3, %%eax \n\t"
  263. ".balign 16\n\t"
  264. "1: \n\t"
  265. "movq (%0, %%eax), %%mm0 \n\t"
  266. "movq 8(%0, %%eax), %%mm1 \n\t"
  267. "movq (%1, %%eax), %%mm4 \n\t"
  268. "movq 8(%1, %%eax), %%mm5 \n\t"
  269. "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
  270. "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
  271. "pxor %%mm2, %%mm2 \n\t"
  272. "pxor %%mm3, %%mm3 \n\t"
  273. "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
  274. "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
  275. "pxor %%mm2, %%mm0 \n\t"
  276. "pxor %%mm3, %%mm1 \n\t"
  277. "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
  278. "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
  279. "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
  280. "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
  281. "pxor %%mm4, %%mm4 \n\t"
  282. "pxor %%mm5, %%mm5 \n\t" // FIXME slow
  283. "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
  284. "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
  285. "psraw $3, %%mm0 \n\t"
  286. "psraw $3, %%mm1 \n\t"
       /* re-apply the sign, then zero the lanes whose input was zero */
  287. "pxor %%mm2, %%mm0 \n\t"
  288. "pxor %%mm3, %%mm1 \n\t"
  289. "psubw %%mm2, %%mm0 \n\t"
  290. "psubw %%mm3, %%mm1 \n\t"
  291. "pandn %%mm0, %%mm4 \n\t"
  292. "pandn %%mm1, %%mm5 \n\t"
  293. "movq %%mm4, (%0, %%eax) \n\t"
  294. "movq %%mm5, 8(%0, %%eax) \n\t"
       /* loop while the offset is <= 0 so the group holding index
          nCoeffs is processed too (nCoeffs is inclusive here) */
  295. "addl $16, %%eax \n\t"
  296. "jng 1b \n\t"
  297. ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "g" (-2*nCoeffs)
  298. : "%eax", "memory"
  299. );
       /* restore the separately scaled DC coefficient */
  300. block[0]= block0;
  301. //Note, we dont do mismatch control for intra as errors cannot accumulate
  302. } else {
  303. quant_matrix = s->inter_matrix;
       /*
        * Same operand layout as the intra loop, except that %3 must live
        * in a register because it is used again in the addressing after
        * the loop.
        * Registers: mm6 = qscale in every word; mm7 starts as 0xFFFF in
        * its lowest word only and XOR-accumulates every stored result.
        */
  304. asm volatile(
  305. "pcmpeqw %%mm7, %%mm7 \n\t"
  306. "psrlq $48, %%mm7 \n\t"
  307. "movd %2, %%mm6 \n\t"
  308. "packssdw %%mm6, %%mm6 \n\t"
  309. "packssdw %%mm6, %%mm6 \n\t"
  310. "movl %3, %%eax \n\t"
  311. ".balign 16\n\t"
  312. "1: \n\t"
  313. "movq (%0, %%eax), %%mm0 \n\t"
  314. "movq 8(%0, %%eax), %%mm1 \n\t"
  315. "movq (%1, %%eax), %%mm4 \n\t"
  316. "movq 8(%1, %%eax), %%mm5 \n\t"
  317. "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
  318. "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
  319. "pxor %%mm2, %%mm2 \n\t"
  320. "pxor %%mm3, %%mm3 \n\t"
  321. "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
  322. "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
  323. "pxor %%mm2, %%mm0 \n\t"
  324. "pxor %%mm3, %%mm1 \n\t"
  325. "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
  326. "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
  327. "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
  328. "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
  329. "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
  330. "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
  331. "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
  332. "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
  333. "pxor %%mm4, %%mm4 \n\t"
  334. "pxor %%mm5, %%mm5 \n\t" // FIXME slow
  335. "pcmpeqw (%0, %%eax), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
  336. "pcmpeqw 8(%0, %%eax), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
  337. "psrlw $4, %%mm0 \n\t"
  338. "psrlw $4, %%mm1 \n\t"
       /* re-apply the sign, then zero the lanes whose input was zero */
  339. "pxor %%mm2, %%mm0 \n\t"
  340. "pxor %%mm3, %%mm1 \n\t"
  341. "psubw %%mm2, %%mm0 \n\t"
  342. "psubw %%mm3, %%mm1 \n\t"
  343. "pandn %%mm0, %%mm4 \n\t"
  344. "pandn %%mm1, %%mm5 \n\t"
       /* fold the eight results into the running XOR accumulator */
  345. "pxor %%mm4, %%mm7 \n\t"
  346. "pxor %%mm5, %%mm7 \n\t"
  347. "movq %%mm4, (%0, %%eax) \n\t"
  348. "movq %%mm5, 8(%0, %%eax) \n\t"
  349. "addl $16, %%eax \n\t"
  350. "jng 1b \n\t"
       /*
        * Tail: load the 32 bits at byte offset 124 from the start of
        * block (%0 + %3 is the block start), reduce the accumulator so
        * that bit 0 is the XOR of bit 0 of all four word lanes, move that
        * bit to bit 16 and XOR it into the loaded value, i.e. into the
        * least significant bit of the upper of the two words, then store
        * the 32 bits back.
        */
  351. "movd 124(%0, %3), %%mm0 \n\t"
  352. "movq %%mm7, %%mm6 \n\t"
  353. "psrlq $32, %%mm7 \n\t"
  354. "pxor %%mm6, %%mm7 \n\t"
  355. "movq %%mm7, %%mm6 \n\t"
  356. "psrlq $16, %%mm7 \n\t"
  357. "pxor %%mm6, %%mm7 \n\t"
  358. "pslld $31, %%mm7 \n\t"
  359. "psrlq $15, %%mm7 \n\t"
  360. "pxor %%mm7, %%mm0 \n\t"
  361. "movd %%mm0, 124(%0, %3) \n\t"
  362. ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (-2*nCoeffs)
  363. : "%eax", "memory"
  364. );
  365. }
  366. }
  367. /* draw the edges of width 'w' of an image of size width, height
  368. this mmx version can only handle w==8 || w==16 */
  /*
   * Installed into the global draw_edges pointer by MPV_common_init_mmx.
   *
   * buf     address of the top-left pixel of the image
   * wrap    distance in bytes between the starts of two consecutive rows
   * width   row length in bytes
   * height  number of rows
   * w       border width; the first branch handles w == 8, the else branch
   *         writes 16 bytes on each side
   *
   * Pass 1 replicates the first and last byte of every row into the w bytes
   * to the left and right of it. Pass 2 copies the first row (starting w
   * bytes left of buf, so including the freshly written side borders) into
   * the w rows above it, and the last row into the w rows below it.
   * The memory around the image must therefore be writable for w bytes / rows
   * in every direction.
   *
   * NOTE(review): the asm uses 32-bit addl / cmpl on pointers and the C code
   * casts pointers to int, so this assumes 32-bit pointers; the asm statements
   * also declare no memory or MMX register clobbers.
   */
  369. static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w)
  370. {
  371. uint8_t *ptr, *last_line;
  372. int i;
  373. last_line = buf + (height - 1) * wrap;
  374. /* left and right */
  375. ptr = buf;
  376. if(w==8)
  377. {
       /* %0 = current row, %1 = wrap, %2 = width, %3 = end address
          (buf + wrap * height); one row per iteration */
  378. asm volatile(
  379. "1: \n\t"
       /* broadcast the first byte of the row to 8 bytes, store at row - 8 */
  380. "movd (%0), %%mm0 \n\t"
  381. "punpcklbw %%mm0, %%mm0 \n\t"
  382. "punpcklwd %%mm0, %%mm0 \n\t"
  383. "punpckldq %%mm0, %%mm0 \n\t"
  384. "movq %%mm0, -8(%0) \n\t"
       /* broadcast the last byte of the row to 8 bytes, store at row + width */
  385. "movq -8(%0, %2), %%mm1 \n\t"
  386. "punpckhbw %%mm1, %%mm1 \n\t"
  387. "punpckhwd %%mm1, %%mm1 \n\t"
  388. "punpckhdq %%mm1, %%mm1 \n\t"
  389. "movq %%mm1, (%0, %2) \n\t"
       /* next row; loop while below the end address (unsigned compare) */
  390. "addl %1, %0 \n\t"
  391. "cmpl %3, %0 \n\t"
  392. " jb 1b \n\t"
  393. : "+r" (ptr)
  394. : "r" (wrap), "r" (width), "r" (ptr + wrap*height)
  395. );
  396. }
  397. else
  398. {
       /* same as above, but each broadcast value is stored twice so that
          16 bytes are written on each side */
  399. asm volatile(
  400. "1: \n\t"
  401. "movd (%0), %%mm0 \n\t"
  402. "punpcklbw %%mm0, %%mm0 \n\t"
  403. "punpcklwd %%mm0, %%mm0 \n\t"
  404. "punpckldq %%mm0, %%mm0 \n\t"
  405. "movq %%mm0, -8(%0) \n\t"
  406. "movq %%mm0, -16(%0) \n\t"
  407. "movq -8(%0, %2), %%mm1 \n\t"
  408. "punpckhbw %%mm1, %%mm1 \n\t"
  409. "punpckhwd %%mm1, %%mm1 \n\t"
  410. "punpckhdq %%mm1, %%mm1 \n\t"
  411. "movq %%mm1, (%0, %2) \n\t"
  412. "movq %%mm1, 8(%0, %2) \n\t"
  413. "addl %1, %0 \n\t"
  414. "cmpl %3, %0 \n\t"
  415. " jb 1b \n\t"
  416. : "+r" (ptr)
  417. : "r" (wrap), "r" (width), "r" (ptr + wrap*height)
  418. );
  419. }
       /* each iteration of this loop fills four rows above and four below */
  420. for(i=0;i<w;i+=4) {
  421. /* top and bottom (and hopefully also the corners) */
  422. ptr= buf - (i + 1) * wrap - w;
       /* %0 = destination in row -(i+1), starting w bytes left of buf;
          %1 = offset such that %1 + %0 walks the first image row from
          buf - w; %2 = -wrap, %3 = -3 * wrap; %4 = end of the destination
          row (start + width + 2 * w). Each 8-byte chunk of the source row
          is stored to the destination row and the three rows above it. */
  423. asm volatile(
  424. "1: \n\t"
  425. "movq (%1, %0), %%mm0 \n\t"
  426. "movq %%mm0, (%0) \n\t"
  427. "movq %%mm0, (%0, %2) \n\t"
  428. "movq %%mm0, (%0, %2, 2) \n\t"
  429. "movq %%mm0, (%0, %3) \n\t"
  430. "addl $8, %0 \n\t"
  431. "cmpl %4, %0 \n\t"
  432. " jb 1b \n\t"
  433. : "+r" (ptr)
  434. : "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w)
  435. );
       /* mirror image of the block above: source is the last image row,
          destinations are row +(i+1) below it and the three rows after */
  436. ptr= last_line + (i + 1) * wrap - w;
  437. asm volatile(
  438. "1: \n\t"
  439. "movq (%1, %0), %%mm0 \n\t"
  440. "movq %%mm0, (%0) \n\t"
  441. "movq %%mm0, (%0, %2) \n\t"
  442. "movq %%mm0, (%0, %2, 2) \n\t"
  443. "movq %%mm0, (%0, %3) \n\t"
  444. "addl $8, %0 \n\t"
  445. "cmpl %4, %0 \n\t"
  446. " jb 1b \n\t"
  447. : "+r" (ptr)
  448. : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w)
  449. );
  450. }
  451. }
  /* The template is included twice: first with HAVE_MMX2 undefined and
     RENAME(a) expanding to a_MMX, then with HAVE_MMX2 defined and RENAME(a)
     expanding to a_MMX2. The dct_quantize_MMX and dct_quantize_MMX2 functions
     referenced by MPV_common_init_mmx are presumably the two resulting
     instantiations -- the template itself is not visible here. */
  452. #undef HAVE_MMX2
  453. #define RENAME(a) a ## _MMX
  454. #include "mpegvideo_mmx_template.c"
  455. #define HAVE_MMX2
  456. #undef RENAME
  457. #define RENAME(a) a ## _MMX2
  458. #include "mpegvideo_mmx_template.c"
  459. void MPV_common_init_mmx(MpegEncContext *s)
  460. {
  461. if (mm_flags & MM_MMX) {
  462. const int dct_algo = s->avctx->dct_algo;
  463. s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
  464. s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
  465. s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
  466. draw_edges = draw_edges_mmx;
  467. if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
  468. if(mm_flags & MM_MMXEXT){
  469. s->dct_quantize= dct_quantize_MMX2;
  470. } else {
  471. s->dct_quantize= dct_quantize_MMX;
  472. }
  473. }
  474. }
  475. }