You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

455 lines
13KB

  /*
   * The simplest mpeg encoder (well, it was the simplest!)
   * Copyright (c) 2000,2001 Gerard Lantau.
   *
   * This program is free software; you can redistribute it and/or modify
   * it under the terms of the GNU General Public License as published by
   * the Free Software Foundation; either version 2 of the License, or
   * (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   * GNU General Public License for more details.
   *
   * You should have received a copy of the GNU General Public License
   * along with this program; if not, write to the Free Software
   * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
   *
   * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
   * h263 dequantizer by Michael Niedermayer <michaelni@gmx.at>
   */
  22. #include "../dsputil.h"
  23. #include "../mpegvideo.h"
  24. #include "../avcodec.h"
  25. #include "../mangle.h"
  26. extern UINT8 zigzag_end[64];
  27. extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
  28. extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale);
  29. extern UINT8 zigzag_direct_noperm[64];
  30. extern UINT16 inv_zigzag_direct16[64];
  31. extern UINT32 inverse[256];
  #if 0
  /* XXX: GL: I don't understand why this function needs optimization
     (it is called only once per frame!), so I disabled it */
  /*
   * Disabled (compiled out by the surrounding #if 0) ia32 version of
   * MPV_frame_start().
   *
   * Each picture is addressed as three consecutive 32-bit words (offsets 0, 4
   * and 8) -- presumably the three plane pointers; TODO confirm.
   *
   *  - B picture:  current_picture <- aux_picture
   *  - otherwise:  last_picture and next_picture are swapped, and
   *                current_picture receives the new next_picture
   *                (i.e. the old last_picture).
   */
  void MPV_frame_start(MpegEncContext *s)
  {
      if (s->pict_type == B_TYPE) {
          /* copy the three words of aux_picture into current_picture */
          __asm __volatile(
              "movl (%1), %%eax\n\t"
              "movl 4(%1), %%edx\n\t"
              "movl 8(%1), %%ecx\n\t"
              "movl %%eax, (%0)\n\t"
              "movl %%edx, 4(%0)\n\t"
              "movl %%ecx, 8(%0)\n\t"
              :
              :"r"(s->current_picture), "r"(s->aux_picture)
              :"eax","edx","ecx","memory");
      } else {
          /* swap next and last */
          __asm __volatile(
              /* registers = next_picture[0..2] */
              "movl (%1), %%eax\n\t"
              "movl 4(%1), %%edx\n\t"
              "movl 8(%1), %%ecx\n\t"
              /* last_picture <- old next; registers <- old last */
              "xchgl (%0), %%eax\n\t"
              "xchgl 4(%0), %%edx\n\t"
              "xchgl 8(%0), %%ecx\n\t"
              /* next_picture <- old last */
              "movl %%eax, (%1)\n\t"
              "movl %%edx, 4(%1)\n\t"
              "movl %%ecx, 8(%1)\n\t"
              /* current_picture <- old last (== new next) */
              "movl %%eax, (%2)\n\t"
              "movl %%edx, 4(%2)\n\t"
              "movl %%ecx, 8(%2)\n\t"
              :
              :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture)
              :"eax","edx","ecx","memory");
      }
  }
  #endif
  /* 8-byte aligned 64-bit constants loaded into MMX registers by
     dct_unquantize_mpeg1_mmx(). */

  /* All bits set, i.e. -1 in each of the four 16-bit words. */
  static const unsigned long long int mm_wabs __attribute__ ((aligned(8))) = 0xffffffffffffffffULL;
  /* 1 in each of the four 16-bit words. */
  static const unsigned long long int mm_wone __attribute__ ((aligned(8))) = 0x0001000100010001ULL;
  71. static void dct_unquantize_h263_mmx(MpegEncContext *s,
  72. DCTELEM *block, int n, int qscale)
  73. {
  74. int i, level, qmul, qadd, nCoeffs;
  75. qmul = s->qscale << 1;
  76. if (s->h263_aic && s->mb_intra)
  77. qadd = 0;
  78. else
  79. qadd = (s->qscale - 1) | 1;
  80. if (s->mb_intra) {
  81. if (!s->h263_aic) {
  82. if (n < 4)
  83. block[0] = block[0] * s->y_dc_scale;
  84. else
  85. block[0] = block[0] * s->c_dc_scale;
  86. }
  87. for(i=1; i<8; i++) {
  88. level = block[i];
  89. if (level) {
  90. if (level < 0) {
  91. level = level * qmul - qadd;
  92. } else {
  93. level = level * qmul + qadd;
  94. }
  95. block[i] = level;
  96. }
  97. }
  98. nCoeffs=64;
  99. } else {
  100. i = 0;
  101. nCoeffs= zigzag_end[ s->block_last_index[n] ];
  102. }
  103. //printf("%d %d ", qmul, qadd);
  104. asm volatile(
  105. "movd %1, %%mm6 \n\t" //qmul
  106. "packssdw %%mm6, %%mm6 \n\t"
  107. "packssdw %%mm6, %%mm6 \n\t"
  108. "movd %2, %%mm5 \n\t" //qadd
  109. "pxor %%mm7, %%mm7 \n\t"
  110. "packssdw %%mm5, %%mm5 \n\t"
  111. "packssdw %%mm5, %%mm5 \n\t"
  112. "psubw %%mm5, %%mm7 \n\t"
  113. "pxor %%mm4, %%mm4 \n\t"
  114. ".balign 16\n\t"
  115. "1: \n\t"
  116. "movq (%0, %3), %%mm0 \n\t"
  117. "movq 8(%0, %3), %%mm1 \n\t"
  118. "pmullw %%mm6, %%mm0 \n\t"
  119. "pmullw %%mm6, %%mm1 \n\t"
  120. "movq (%0, %3), %%mm2 \n\t"
  121. "movq 8(%0, %3), %%mm3 \n\t"
  122. "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
  123. "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
  124. "pxor %%mm2, %%mm0 \n\t"
  125. "pxor %%mm3, %%mm1 \n\t"
  126. "paddw %%mm7, %%mm0 \n\t"
  127. "paddw %%mm7, %%mm1 \n\t"
  128. "pxor %%mm0, %%mm2 \n\t"
  129. "pxor %%mm1, %%mm3 \n\t"
  130. "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
  131. "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
  132. "pandn %%mm2, %%mm0 \n\t"
  133. "pandn %%mm3, %%mm1 \n\t"
  134. "movq %%mm0, (%0, %3) \n\t"
  135. "movq %%mm1, 8(%0, %3) \n\t"
  136. "addl $16, %3 \n\t"
  137. "js 1b \n\t"
  138. ::"r" (block+nCoeffs), "g"(qmul), "g" (qadd), "r" (2*(i-nCoeffs))
  139. : "memory"
  140. );
  141. }
  /*
    NK:
    Note: looking at PARANOID:
    "enable all paranoid tests for rounding, overflows, etc..."
    #ifdef PARANOID
      if (level < -2048 || level > 2047)
        fprintf(stderr, "unquant error %d %d\n", i, level);
    #endif
    We can suppose that the result of the two multiplications can't be greater
    than 0xFFFF, i.e. it fits in 16 bits, so we use only the PMULLW instruction
    here and can avoid a complex multiplication.
    =====================================================
    Full formula for the multiplication of 2 integer numbers
    which are represented as high:low words:
    input:  value1 = high1:low1
            value2 = high2:low2
    output: value3 = value1*value2
            value3 = high3:low3 (on overflow: modulus 2^32 wrap-around)
    This means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
    but this algorithm will compute only 0x66cb0ce4;
    this is limited by the 16-bit size of the operands.
    ---------------------------------
    tlow1 = high1*low2
    tlow2 = high2*low1
    tlow1 = tlow1 + tlow2
    high3:low3 = low1*low2
    high3 += tlow1
  */
  170. static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
  171. DCTELEM *block, int n, int qscale)
  172. {
  173. int i, level, nCoeffs;
  174. const UINT16 *quant_matrix;
  175. if(s->alternate_scan) nCoeffs= 64;
  176. else nCoeffs= nCoeffs= zigzag_end[ s->block_last_index[n] ];
  177. if (s->mb_intra) {
  178. if (n < 4)
  179. block[0] = block[0] * s->y_dc_scale;
  180. else
  181. block[0] = block[0] * s->c_dc_scale;
  182. /* isnt used anymore (we have a h263 unquantizer since some time)
  183. if (s->out_format == FMT_H263) {
  184. i = 1;
  185. goto unquant_even;
  186. }*/
  187. /* XXX: only mpeg1 */
  188. quant_matrix = s->intra_matrix;
  189. i=1;
  190. /* Align on 4 elements boundary */
  191. while(i&3)
  192. {
  193. level = block[i];
  194. if (level) {
  195. if (level < 0) level = -level;
  196. level = (int)(level * qscale * quant_matrix[i]) >> 3;
  197. level = (level - 1) | 1;
  198. if (block[i] < 0) level = -level;
  199. block[i] = level;
  200. }
  201. i++;
  202. }
  203. __asm __volatile(
  204. "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */
  205. "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */
  206. "movq %2, %%mm4\n\t"
  207. "movq %%mm6, %%mm7\n\t"
  208. "movq %1, %%mm5\n\t"
  209. "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */
  210. "pxor %%mm6, %%mm6\n\t"
  211. ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory");
  212. for(;i<nCoeffs;i+=4) {
  213. __asm __volatile(
  214. "movq %1, %%mm0\n\t"
  215. "movq %%mm7, %%mm1\n\t"
  216. "movq %%mm0, %%mm2\n\t"
  217. "movq %%mm0, %%mm3\n\t"
  218. "pcmpgtw %%mm6, %%mm2\n\t"
  219. "pmullw %2, %%mm1\n\t"
  220. "pandn %%mm4, %%mm2\n\t"
  221. "por %%mm5, %%mm2\n\t"
  222. "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */
  223. "pcmpeqw %%mm6, %%mm3\n\t"
  224. "pmullw %%mm0, %%mm1\n\t"
  225. "psraw $3, %%mm1\n\t"
  226. "psubw %%mm5, %%mm1\n\t" /* block[i] --; */
  227. "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */
  228. "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */
  229. "pmullw %%mm2, %%mm1\n\t" /* change signs again */
  230. "pand %%mm3, %%mm1\n\t" /* nullify if was zero */
  231. "movq %%mm1, %0"
  232. :"=m"(block[i])
  233. :"m"(block[i]), "m"(quant_matrix[i])
  234. :"memory");
  235. }
  236. } else {
  237. i = 0;
  238. // unquant_even:
  239. quant_matrix = s->non_intra_matrix;
  240. /* Align on 4 elements boundary */
  241. while(i&7)
  242. {
  243. level = block[i];
  244. if (level) {
  245. if (level < 0) level = -level;
  246. level = (((level << 1) + 1) * qscale *
  247. ((int) quant_matrix[i])) >> 4;
  248. level = (level - 1) | 1;
  249. if(block[i] < 0) level = -level;
  250. block[i] = level;
  251. }
  252. i++;
  253. }
  254. asm volatile(
  255. "pcmpeqw %%mm7, %%mm7 \n\t"
  256. "psrlw $15, %%mm7 \n\t"
  257. "movd %2, %%mm6 \n\t"
  258. "packssdw %%mm6, %%mm6 \n\t"
  259. "packssdw %%mm6, %%mm6 \n\t"
  260. ".balign 16\n\t"
  261. "1: \n\t"
  262. "movq (%0, %3), %%mm0 \n\t"
  263. "movq 8(%0, %3), %%mm1 \n\t"
  264. "movq (%1, %3), %%mm4 \n\t"
  265. "movq 8(%1, %3), %%mm5 \n\t"
  266. "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
  267. "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
  268. "pxor %%mm2, %%mm2 \n\t"
  269. "pxor %%mm3, %%mm3 \n\t"
  270. "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
  271. "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
  272. "pxor %%mm2, %%mm0 \n\t"
  273. "pxor %%mm3, %%mm1 \n\t"
  274. "psubw %%mm2, %%mm0 \n\t" // abs(block[i])
  275. "psubw %%mm3, %%mm1 \n\t" // abs(block[i])
  276. "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
  277. "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
  278. "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
  279. "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
  280. "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
  281. "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
  282. "pxor %%mm4, %%mm4 \n\t"
  283. "pxor %%mm5, %%mm5 \n\t" // FIXME slow
  284. "pcmpeqw (%0, %3), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
  285. "pcmpeqw 8(%0, %3), %%mm5 \n\t" // block[i] == 0 ? -1 : 0
  286. "psraw $4, %%mm0 \n\t"
  287. "psraw $4, %%mm1 \n\t"
  288. "psubw %%mm7, %%mm0 \n\t"
  289. "psubw %%mm7, %%mm1 \n\t"
  290. "por %%mm7, %%mm0 \n\t"
  291. "por %%mm7, %%mm1 \n\t"
  292. "pxor %%mm2, %%mm0 \n\t"
  293. "pxor %%mm3, %%mm1 \n\t"
  294. "psubw %%mm2, %%mm0 \n\t"
  295. "psubw %%mm3, %%mm1 \n\t"
  296. "pandn %%mm0, %%mm4 \n\t"
  297. "pandn %%mm1, %%mm5 \n\t"
  298. "movq %%mm4, (%0, %3) \n\t"
  299. "movq %%mm5, 8(%0, %3) \n\t"
  300. "addl $16, %3 \n\t"
  301. "js 1b \n\t"
  302. ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "g" (qscale), "r" (2*(i-nCoeffs))
  303. : "memory"
  304. );
  305. }
  306. }
  307. /* draw the edges of width 'w' of an image of size width, height
  308. this mmx version can only handle w==8 || w==16 */
  309. static void draw_edges_mmx(UINT8 *buf, int wrap, int width, int height, int w)
  310. {
  311. UINT8 *ptr, *last_line;
  312. int i;
  313. last_line = buf + (height - 1) * wrap;
  314. /* left and right */
  315. ptr = buf;
  316. if(w==8)
  317. {
  318. asm volatile(
  319. "1: \n\t"
  320. "movd (%0), %%mm0 \n\t"
  321. "punpcklbw %%mm0, %%mm0 \n\t"
  322. "punpcklwd %%mm0, %%mm0 \n\t"
  323. "punpckldq %%mm0, %%mm0 \n\t"
  324. "movq %%mm0, -8(%0) \n\t"
  325. "movq -8(%0, %2), %%mm1 \n\t"
  326. "punpckhbw %%mm1, %%mm1 \n\t"
  327. "punpckhwd %%mm1, %%mm1 \n\t"
  328. "punpckhdq %%mm1, %%mm1 \n\t"
  329. "movq %%mm1, (%0, %2) \n\t"
  330. "addl %1, %0 \n\t"
  331. "cmpl %3, %0 \n\t"
  332. " jb 1b \n\t"
  333. : "+r" (ptr)
  334. : "r" (wrap), "r" (width), "r" (ptr + wrap*height)
  335. );
  336. }
  337. else
  338. {
  339. asm volatile(
  340. "1: \n\t"
  341. "movd (%0), %%mm0 \n\t"
  342. "punpcklbw %%mm0, %%mm0 \n\t"
  343. "punpcklwd %%mm0, %%mm0 \n\t"
  344. "punpckldq %%mm0, %%mm0 \n\t"
  345. "movq %%mm0, -8(%0) \n\t"
  346. "movq %%mm0, -16(%0) \n\t"
  347. "movq -8(%0, %2), %%mm1 \n\t"
  348. "punpckhbw %%mm1, %%mm1 \n\t"
  349. "punpckhwd %%mm1, %%mm1 \n\t"
  350. "punpckhdq %%mm1, %%mm1 \n\t"
  351. "movq %%mm1, (%0, %2) \n\t"
  352. "movq %%mm1, 8(%0, %2) \n\t"
  353. "addl %1, %0 \n\t"
  354. "cmpl %3, %0 \n\t"
  355. " jb 1b \n\t"
  356. : "+r" (ptr)
  357. : "r" (wrap), "r" (width), "r" (ptr + wrap*height)
  358. );
  359. }
  360. for(i=0;i<w;i+=4) {
  361. /* top and bottom (and hopefully also the corners) */
  362. ptr= buf - (i + 1) * wrap - w;
  363. asm volatile(
  364. "1: \n\t"
  365. "movq (%1, %0), %%mm0 \n\t"
  366. "movq %%mm0, (%0) \n\t"
  367. "movq %%mm0, (%0, %2) \n\t"
  368. "movq %%mm0, (%0, %2, 2) \n\t"
  369. "movq %%mm0, (%0, %3) \n\t"
  370. "addl $8, %0 \n\t"
  371. "cmpl %4, %0 \n\t"
  372. " jb 1b \n\t"
  373. : "+r" (ptr)
  374. : "r" ((int)buf - (int)ptr - w), "r" (-wrap), "r" (-wrap*3), "r" (ptr+width+2*w)
  375. );
  376. ptr= last_line + (i + 1) * wrap - w;
  377. asm volatile(
  378. "1: \n\t"
  379. "movq (%1, %0), %%mm0 \n\t"
  380. "movq %%mm0, (%0) \n\t"
  381. "movq %%mm0, (%0, %2) \n\t"
  382. "movq %%mm0, (%0, %2, 2) \n\t"
  383. "movq %%mm0, (%0, %3) \n\t"
  384. "addl $8, %0 \n\t"
  385. "cmpl %4, %0 \n\t"
  386. " jb 1b \n\t"
  387. : "+r" (ptr)
  388. : "r" ((int)last_line - (int)ptr - w), "r" (wrap), "r" (wrap*3), "r" (ptr+width+2*w)
  389. );
  390. }
  391. }
  /* Scratch variable.  It is not referenced by the functions written out in
     this file; presumably the inline asm in mpegvideo_mmx_template.c uses it
     -- TODO confirm. */
  static volatile int esp_temp;

  /* Touches esp_temp so that the compiler does not report it as an unused
     static variable. */
  void unused_var_warning_killer(void)
  {
      esp_temp++;
  }
  396. #undef HAVE_MMX2
  397. #define RENAME(a) a ## _MMX
  398. #include "mpegvideo_mmx_template.c"
  399. #define HAVE_MMX2
  400. #undef RENAME
  401. #define RENAME(a) a ## _MMX2
  402. #include "mpegvideo_mmx_template.c"
  403. void MPV_common_init_mmx(MpegEncContext *s)
  404. {
  405. if (mm_flags & MM_MMX) {
  406. s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
  407. s->dct_unquantize_mpeg = dct_unquantize_mpeg1_mmx;
  408. draw_edges = draw_edges_mmx;
  409. if(mm_flags & MM_MMXEXT){
  410. dct_quantize= dct_quantize_MMX2;
  411. }else{
  412. dct_quantize= dct_quantize_MMX;
  413. }
  414. }
  415. }