You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

228 lines
7.0KB

  1. /*
  2. * The simplest mpeg encoder (well, it was the simplest!)
  3. * Copyright (c) 2000,2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. *
  19. * Optimized for ia32 cpus by Nick Kurshev <nickols_k@mail.ru>
  20. */
  21. #include "../dsputil.h"
  22. #include "../mpegvideo.h"
  23. #if 0
  24. /* XXX: GL: I don't understand why this function needs optimization
  25. (it is called only once per frame!), so I disabled it */
  26. void MPV_frame_start(MpegEncContext *s)
  27. {
  28. if (s->pict_type == B_TYPE) {
  29. __asm __volatile(
  30. "movl (%1), %%eax\n\t"
  31. "movl 4(%1), %%edx\n\t"
  32. "movl 8(%1), %%ecx\n\t"
  33. "movl %%eax, (%0)\n\t"
  34. "movl %%edx, 4(%0)\n\t"
  35. "movl %%ecx, 8(%0)\n\t"
  36. :
  37. :"r"(s->current_picture), "r"(s->aux_picture)
  38. :"eax","edx","ecx","memory");
  39. } else {
  40. /* swap next and last */
  41. __asm __volatile(
  42. "movl (%1), %%eax\n\t"
  43. "movl 4(%1), %%edx\n\t"
  44. "movl 8(%1), %%ecx\n\t"
  45. "xchgl (%0), %%eax\n\t"
  46. "xchgl 4(%0), %%edx\n\t"
  47. "xchgl 8(%0), %%ecx\n\t"
  48. "movl %%eax, (%1)\n\t"
  49. "movl %%edx, 4(%1)\n\t"
  50. "movl %%ecx, 8(%1)\n\t"
  51. "movl %%eax, (%2)\n\t"
  52. "movl %%edx, 4(%2)\n\t"
  53. "movl %%ecx, 8(%2)\n\t"
  54. :
  55. :"r"(s->last_picture), "r"(s->next_picture), "r"(s->current_picture)
  56. :"eax","edx","ecx","memory");
  57. }
  58. }
  59. #endif
  /*
   * 64-bit MMX constants, 8-byte aligned so they can be used directly as
   * `movq` memory operands.
   *
   * mm_wabs: every bit set, i.e. 0xFFFF (-1 as a signed word) in each of the
   *          four 16-bit lanes.  Used as the all-ones mask for `pandn`.
   * mm_wone: the value 1 in each of the four 16-bit lanes.
   */
  static const unsigned long long mm_wabs __attribute__((aligned(8))) = ~0ULL;
  static const unsigned long long mm_wone __attribute__((aligned(8))) = 0x0001000100010001ULL;
  62. /*
  63. NK:
  64. Note: looking at PARANOID:
  65. "enable all paranoid tests for rounding, overflows, etc..."
  66. #ifdef PARANOID
  67. if (level < -2048 || level > 2047)
  68. fprintf(stderr, "unquant error %d %d\n", i, level);
  69. #endif
  70. We can assume that the result of the two multiplications can't be greater
  71. than 0xFFFF, i.e. it fits in 16 bits, so we use only the PMULLW instruction
  72. here and can avoid a complex multiplication.
  73. =====================================================
  74. Full formula for the multiplication of 2 integer numbers
  75. which are represented as high:low words:
  76. input: value1 = high1:low1
  77. value2 = high2:low2
  78. output: value3 = value1*value2
  79. value3=high3:low3 (on overflow: modulus 2^32 wrap-around)
  80. this means that for 0x123456 * 0x123456 the correct result is 0x14b66cb0ce4
  81. but this algorithm will compute only 0x66cb0ce4;
  82. this is limited by the 16-bit size of the operands
  83. ---------------------------------
  84. tlow1 = high1*low2
  85. tlow2 = high2*low1
  86. tlow1 = tlow1 + tlow2
  87. high3:low3 = low1*low2
  88. high3 += tlow1
  89. */
  90. static void dct_unquantize_mpeg1_mmx(MpegEncContext *s,
  91. DCTELEM *block, int n, int qscale)
  92. {
  93. int i, level;
  94. const UINT16 *quant_matrix;
  95. if (s->mb_intra) {
  96. if (n < 4)
  97. block[0] = block[0] * s->y_dc_scale;
  98. else
  99. block[0] = block[0] * s->c_dc_scale;
  100. if (s->out_format == FMT_H263) {
  101. i = 1;
  102. goto unquant_even;
  103. }
  104. /* XXX: only mpeg1 */
  105. quant_matrix = s->intra_matrix;
  106. i=1;
  107. /* Align on 4 elements boundary */
  108. while(i&3)
  109. {
  110. level = block[i];
  111. if (level) {
  112. if (level < 0) level = -level;
  113. level = (int)(level * qscale * quant_matrix[i]) >> 3;
  114. level = (level - 1) | 1;
  115. if (block[i] < 0) level = -level;
  116. block[i] = level;
  117. }
  118. i++;
  119. }
  120. __asm __volatile(
  121. "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */
  122. "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */
  123. "movq %2, %%mm4\n\t"
  124. "movq %%mm6, %%mm7\n\t"
  125. "movq %1, %%mm5\n\t"
  126. "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */
  127. "pxor %%mm6, %%mm6\n\t"
  128. ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory");
  129. for(;i<64;i+=4) {
  130. __asm __volatile(
  131. "movq %1, %%mm0\n\t"
  132. "movq %%mm7, %%mm1\n\t"
  133. "movq %%mm0, %%mm2\n\t"
  134. "movq %%mm0, %%mm3\n\t"
  135. "pcmpgtw %%mm6, %%mm2\n\t"
  136. "pmullw %2, %%mm1\n\t"
  137. "pandn %%mm4, %%mm2\n\t"
  138. "por %%mm5, %%mm2\n\t"
  139. "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */
  140. "pcmpeqw %%mm6, %%mm3\n\t"
  141. "pmullw %%mm0, %%mm1\n\t"
  142. "psraw $3, %%mm1\n\t"
  143. "psubw %%mm5, %%mm1\n\t" /* block[i] --; */
  144. "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */
  145. "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */
  146. "pmullw %%mm2, %%mm1\n\t" /* change signs again */
  147. "pand %%mm3, %%mm1\n\t" /* nullify if was zero */
  148. "movq %%mm1, %0"
  149. :"=m"(block[i])
  150. :"m"(block[i]), "m"(quant_matrix[i])
  151. :"memory");
  152. }
  153. } else {
  154. i = 0;
  155. unquant_even:
  156. quant_matrix = s->non_intra_matrix;
  157. /* Align on 4 elements boundary */
  158. while(i&3)
  159. {
  160. level = block[i];
  161. if (level) {
  162. if (level < 0) level = -level;
  163. level = (((level << 1) + 1) * qscale *
  164. ((int) quant_matrix[i])) >> 4;
  165. level = (level - 1) | 1;
  166. if(block[i] < 0) level = -level;
  167. block[i] = level;
  168. }
  169. i++;
  170. }
  171. __asm __volatile(
  172. "movd %0, %%mm6\n\t" /* mm6 = qscale | 0 */
  173. "punpckldq %%mm6, %%mm6\n\t" /* mm6 = qscale | qscale */
  174. "movq %2, %%mm4\n\t"
  175. "movq %%mm6, %%mm7\n\t"
  176. "movq %1, %%mm5\n\t"
  177. "packssdw %%mm6, %%mm7\n\t" /* mm7 = qscale | qscale | qscale | qscale */
  178. "pxor %%mm6, %%mm6\n\t"
  179. ::"g"(qscale),"m"(mm_wone),"m"(mm_wabs):"memory");
  180. for(;i<64;i+=4) {
  181. __asm __volatile(
  182. "movq %1, %%mm0\n\t"
  183. "movq %%mm7, %%mm1\n\t"
  184. "movq %%mm0, %%mm2\n\t"
  185. "movq %%mm0, %%mm3\n\t"
  186. "pcmpgtw %%mm6, %%mm2\n\t"
  187. "pmullw %2, %%mm1\n\t"
  188. "pandn %%mm4, %%mm2\n\t"
  189. "por %%mm5, %%mm2\n\t"
  190. "pmullw %%mm2, %%mm0\n\t" /* mm0 = abs(block[i]). */
  191. "psllw $1, %%mm0\n\t" /* block[i] <<= 1 */
  192. "paddw %%mm5, %%mm0\n\t" /* block[i] ++ */
  193. "pmullw %%mm0, %%mm1\n\t"
  194. "psraw $4, %%mm1\n\t"
  195. "pcmpeqw %%mm6, %%mm3\n\t"
  196. "psubw %%mm5, %%mm1\n\t" /* block[i] --; */
  197. "pandn %%mm4, %%mm3\n\t" /* fake of pcmpneqw : mm0 != 0 then mm1 = -1 */
  198. "por %%mm5, %%mm1\n\t" /* block[i] |= 1 */
  199. "pmullw %%mm2, %%mm1\n\t" /* change signs again */
  200. "pand %%mm3, %%mm1\n\t" /* nullify if was zero */
  201. "movq %%mm1, %0"
  202. :"=m"(block[i])
  203. :"m"(block[i]), "m"(quant_matrix[i])
  204. :"memory");
  205. }
  206. }
  207. }
  208. void MPV_common_init_mmx(MpegEncContext *s)
  209. {
  210. if (mm_flags & MM_MMX) {
  211. /* XXX: should include h263 optimization too. It would go even
  212. faster! */
  213. s->dct_unquantize = dct_unquantize_mpeg1_mmx;
  214. }
  215. }