You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

225 lines
7.9KB

  1. /*
  2. Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
  3. This program is free software; you can redistribute it and/or modify
  4. it under the terms of the GNU General Public License as published by
  5. the Free Software Foundation; either version 2 of the License, or
  6. (at your option) any later version.
  7. This program is distributed in the hope that it will be useful,
  8. but WITHOUT ANY WARRANTY; without even the implied warranty of
  9. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  10. GNU General Public License for more details.
  11. You should have received a copy of the GNU General Public License
  12. along with this program; if not, write to the Free Software
  13. Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  14. */
  /*
   * Helper macros that expand to inline-asm text for the quantizer below.
   *
   * SPREADW(reg)  - replicate the low 16-bit word of an MMX register into all
   *                 four words of that same register.
   * PMAXW(a, b)   - leave the per-word maximum of a and b in b.
   *
   * With HAVE_MMX2 each is a single instruction (pshufw / pmaxsw, the latter
   * being a signed maximum).  Without it, SPREADW uses two punpcklwd steps and
   * PMAXW uses a saturating subtract followed by an add, which gives the
   * maximum when the words are treated as unsigned.
   */
  #undef SPREADW
  #undef PMAXW

  #ifndef HAVE_MMX2

  /* Plain MMX fallbacks: two instructions each. */
  #define SPREADW(a) \
      "punpcklwd " #a ", " #a " \n\t"\
      "punpcklwd " #a ", " #a " \n\t"

  #define PMAXW(a,b) \
      "psubusw " #a ", " #b " \n\t"\
      "paddw " #a ", " #b " \n\t"

  #else /* HAVE_MMX2 */

  /* MMX2 single-instruction forms. */
  #define SPREADW(a) "pshufw $0, " #a ", " #a " \n\t"
  #define PMAXW(a,b) "pmaxsw " #a ", " #b " \n\t"

  #endif /* HAVE_MMX2 */
  28. static int RENAME(dct_quantize)(MpegEncContext *s,
  29. DCTELEM *block, int n,
  30. int qscale, int *overflow)
  31. {
  32. int level=0, last_non_zero_p1, q; //=0 is cuz gcc says uninitalized ...
  33. const UINT16 *qmat, *bias;
  34. static __align8 INT16 temp_block[64];
  35. av_fdct (block);
  36. if (s->mb_intra) {
  37. int dummy;
  38. if (n < 4)
  39. q = s->y_dc_scale;
  40. else
  41. q = s->c_dc_scale;
  42. /* note: block[0] is assumed to be positive */
  43. if (!s->h263_aic) {
  44. #if 1
  45. asm volatile (
  46. "xorl %%edx, %%edx \n\t"
  47. "mul %%ecx \n\t"
  48. : "=d" (level), "=a"(dummy)
  49. : "a" (block[0] + (q >> 1)), "c" (inverse[q])
  50. );
  51. #else
  52. asm volatile (
  53. "xorl %%edx, %%edx \n\t"
  54. "divw %%cx \n\t"
  55. "movzwl %%ax, %%eax \n\t"
  56. : "=a" (level)
  57. : "a" (block[0] + (q >> 1)), "c" (q)
  58. : "%edx"
  59. );
  60. #endif
  61. } else
  62. /* For AIC we skip quant/dequant of INTRADC */
  63. level = block[0];
  64. block[0]=0; //avoid fake overflow
  65. // temp_block[0] = (block[0] + (q >> 1)) / q;
  66. last_non_zero_p1 = 1;
  67. bias = s->q_intra_matrix16_bias[qscale];
  68. qmat = s->q_intra_matrix16[qscale];
  69. } else {
  70. last_non_zero_p1 = 0;
  71. bias = s->q_inter_matrix16_bias[qscale];
  72. qmat = s->q_inter_matrix16[qscale];
  73. }
  74. if(s->out_format == FMT_H263){
  75. asm volatile(
  76. "movd %%eax, %%mm3 \n\t" // last_non_zero_p1
  77. SPREADW(%%mm3)
  78. "pxor %%mm7, %%mm7 \n\t" // 0
  79. "pxor %%mm4, %%mm4 \n\t" // 0
  80. "movq (%2), %%mm5 \n\t" // qmat[0]
  81. "pxor %%mm6, %%mm6 \n\t"
  82. "psubw (%3), %%mm6 \n\t" // -bias[0]
  83. "movl $-128, %%eax \n\t"
  84. ".balign 16 \n\t"
  85. "1: \n\t"
  86. "pxor %%mm1, %%mm1 \n\t" // 0
  87. "movq (%1, %%eax), %%mm0 \n\t" // block[i]
  88. "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
  89. "pxor %%mm1, %%mm0 \n\t"
  90. "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
  91. "psubusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
  92. "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
  93. "por %%mm0, %%mm4 \n\t"
  94. "pxor %%mm1, %%mm0 \n\t"
  95. "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
  96. "movq %%mm0, (%5, %%eax) \n\t"
  97. "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
  98. "movq (%4, %%eax), %%mm1 \n\t"
  99. "movq %%mm7, (%1, %%eax) \n\t" // 0
  100. "pandn %%mm1, %%mm0 \n\t"
  101. PMAXW(%%mm0, %%mm3)
  102. "addl $8, %%eax \n\t"
  103. " js 1b \n\t"
  104. "movq %%mm3, %%mm0 \n\t"
  105. "psrlq $32, %%mm3 \n\t"
  106. PMAXW(%%mm0, %%mm3)
  107. "movq %%mm3, %%mm0 \n\t"
  108. "psrlq $16, %%mm3 \n\t"
  109. PMAXW(%%mm0, %%mm3)
  110. "movd %%mm3, %%eax \n\t"
  111. "movzbl %%al, %%eax \n\t" // last_non_zero_p1
  112. : "+a" (last_non_zero_p1)
  113. : "r" (block+64), "r" (qmat), "r" (bias),
  114. "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
  115. );
  116. // note the asm is split cuz gcc doesnt like that many operands ...
  117. asm volatile(
  118. "movd %1, %%mm1 \n\t" // max_qcoeff
  119. SPREADW(%%mm1)
  120. "psubusw %%mm1, %%mm4 \n\t"
  121. "packuswb %%mm4, %%mm4 \n\t"
  122. "movd %%mm4, %0 \n\t" // *overflow
  123. : "=g" (*overflow)
  124. : "g" (s->max_qcoeff)
  125. );
  126. }else{ // FMT_H263
  127. asm volatile(
  128. "movd %%eax, %%mm3 \n\t" // last_non_zero_p1
  129. SPREADW(%%mm3)
  130. "pxor %%mm7, %%mm7 \n\t" // 0
  131. "pxor %%mm4, %%mm4 \n\t" // 0
  132. "movl $-128, %%eax \n\t"
  133. ".balign 16 \n\t"
  134. "1: \n\t"
  135. "pxor %%mm1, %%mm1 \n\t" // 0
  136. "movq (%1, %%eax), %%mm0 \n\t" // block[i]
  137. "pcmpgtw %%mm0, %%mm1 \n\t" // block[i] <= 0 ? 0xFF : 0x00
  138. "pxor %%mm1, %%mm0 \n\t"
  139. "psubw %%mm1, %%mm0 \n\t" // ABS(block[i])
  140. "movq (%3, %%eax), %%mm6 \n\t" // bias[0]
  141. "paddusw %%mm6, %%mm0 \n\t" // ABS(block[i]) + bias[0]
  142. "movq (%2, %%eax), %%mm5 \n\t" // qmat[i]
  143. "pmulhw %%mm5, %%mm0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
  144. "por %%mm0, %%mm4 \n\t"
  145. "pxor %%mm1, %%mm0 \n\t"
  146. "psubw %%mm1, %%mm0 \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
  147. "movq %%mm0, (%5, %%eax) \n\t"
  148. "pcmpeqw %%mm7, %%mm0 \n\t" // out==0 ? 0xFF : 0x00
  149. "movq (%4, %%eax), %%mm1 \n\t"
  150. "movq %%mm7, (%1, %%eax) \n\t" // 0
  151. "pandn %%mm1, %%mm0 \n\t"
  152. PMAXW(%%mm0, %%mm3)
  153. "addl $8, %%eax \n\t"
  154. " js 1b \n\t"
  155. "movq %%mm3, %%mm0 \n\t"
  156. "psrlq $32, %%mm3 \n\t"
  157. PMAXW(%%mm0, %%mm3)
  158. "movq %%mm3, %%mm0 \n\t"
  159. "psrlq $16, %%mm3 \n\t"
  160. PMAXW(%%mm0, %%mm3)
  161. "movd %%mm3, %%eax \n\t"
  162. "movzbl %%al, %%eax \n\t" // last_non_zero_p1
  163. : "+a" (last_non_zero_p1)
  164. : "r" (block+64), "r" (qmat+64), "r" (bias+64),
  165. "r" (inv_zigzag_direct16+64), "r" (temp_block+64)
  166. );
  167. // note the asm is split cuz gcc doesnt like that many operands ...
  168. asm volatile(
  169. "movd %1, %%mm1 \n\t" // max_qcoeff
  170. SPREADW(%%mm1)
  171. "psubusw %%mm1, %%mm4 \n\t"
  172. "packuswb %%mm4, %%mm4 \n\t"
  173. "movd %%mm4, %0 \n\t" // *overflow
  174. : "=g" (*overflow)
  175. : "g" (s->max_qcoeff)
  176. );
  177. }
  178. if(s->mb_intra) temp_block[0]= level; //FIXME move afer permute
  179. // last_non_zero_p1=64;
  180. /* permute for IDCT */
  181. asm volatile(
  182. "movl %0, %%eax \n\t"
  183. "pushl %%ebp \n\t"
  184. "movl %%esp, " MANGLE(esp_temp) "\n\t"
  185. "1: \n\t"
  186. "movzbl (%1, %%eax), %%ebx \n\t"
  187. "movzbl 1(%1, %%eax), %%ebp \n\t"
  188. "movw (%2, %%ebx, 2), %%cx \n\t"
  189. "movw (%2, %%ebp, 2), %%sp \n\t"
  190. "movzbl " MANGLE(permutation) "(%%ebx), %%ebx\n\t"
  191. "movzbl " MANGLE(permutation) "(%%ebp), %%ebp\n\t"
  192. "movw %%cx, (%3, %%ebx, 2) \n\t"
  193. "movw %%sp, (%3, %%ebp, 2) \n\t"
  194. "addl $2, %%eax \n\t"
  195. " js 1b \n\t"
  196. "movl " MANGLE(esp_temp) ", %%esp\n\t"
  197. "popl %%ebp \n\t"
  198. :
  199. : "g" (-last_non_zero_p1), "d" (zigzag_direct_noperm+last_non_zero_p1), "S" (temp_block), "D" (block)
  200. : "%eax", "%ebx", "%ecx"
  201. );
  202. /*
  203. for(i=0; i<last_non_zero_p1; i++)
  204. {
  205. int j= zigzag_direct_noperm[i];
  206. block[block_permute_op(j)]= temp_block[j];
  207. }
  208. */
  209. //block_permute(block);
  210. return last_non_zero_p1 - 1;
  211. }