/*
 * Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */
/**
 * MMX optimized version of (put|avg)_h264_chroma_mc8.
 * H264_CHROMA_MC8_TMPL must be defined to the desired function name and
 * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg.
 */
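/*
 * Typical instantiation from the including file (a sketch based on the
 * usual FFmpeg template pattern; the exact macro bodies and function
 * names in the including .c file may differ):
 *
 *     #define H264_CHROMA_OP(S, D)
 *     #define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
 *     #include "dsputil_h264_template_mmx.c"
 *     #undef H264_CHROMA_OP
 *     #undef H264_CHROMA_MC8_TMPL
 *
 *     #define H264_CHROMA_OP(S, D) "pavgb " #S ", " #D "\n\t"
 *     #define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
 *     #include "dsputil_h264_template_mmx.c"
 */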
static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
{
    uint64_t AA __align8;
    uint64_t DD __align8;
    unsigned long srcos = (long)src & 7;
    uint64_t sh1 __align8 = srcos * 8;
    uint64_t sh2 __align8 = 56 - sh1;
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);
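
    /*
     * H.264 chroma interpolation is bilinear with 1/8-pel weights:
     *     A = (8-x)*(8-y)   B = x*(8-y)   C = (8-x)*y   D = x*y
     * so A+B+C+D = 64 and each output pixel is
     *     dst[i] = (A*src[i] + B*src[i+1]
     *               + C*src[i+stride] + D*src[i+stride+1] + 32) >> 6
     * The block below expands A as xy - (8x+8y) + 64, which reuses the
     * products already computed for B, C and D.
     */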
    asm volatile("movd %1, %%mm4\n\t"
                 "movd %2, %%mm6\n\t"
                 "punpcklwd %%mm4, %%mm4\n\t"
                 "punpcklwd %%mm6, %%mm6\n\t"
                 "punpckldq %%mm4, %%mm4\n\t" /* mm4 = x words */
                 "punpckldq %%mm6, %%mm6\n\t" /* mm6 = y words */
                 "movq %%mm4, %%mm5\n\t"
                 "pmullw %%mm6, %%mm4\n\t" /* mm4 = x * y */
                 "psllw $3, %%mm5\n\t"
                 "psllw $3, %%mm6\n\t"
                 "movq %%mm5, %%mm7\n\t"
                 "paddw %%mm6, %%mm7\n\t"
                 "movq %%mm4, %0\n\t" /* DD = x * y */
                 "psubw %%mm4, %%mm5\n\t" /* mm5 = B = 8x - xy */
                 "psubw %%mm4, %%mm6\n\t" /* mm6 = C = 8y - xy */
                 "paddw %3, %%mm4\n\t"
                 "psubw %%mm7, %%mm4\n\t" /* mm4 = A = xy - (8x+8y) + 64 */
                 "pxor %%mm7, %%mm7\n\t"
                 : "=m" (DD) : "rm" (x), "rm" (y), "m" (ff_pw_64));
    asm volatile("movq %%mm4, %0" : "=m" (AA));
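
    /*
     * A and D live in memory (AA, DD) rather than in registers: the
     * per-row loop uses mm0/mm1 for the source windows, mm2/mm3 as
     * accumulators, mm4 as scratch, mm5/mm6 for B and C, and mm7 as the
     * zero constant, leaving no MMX register free for them.
     *
     * src may be unaligned, so round it down to an 8-byte boundary and
     * remember the byte offset.  The load blocks below then synthesize
     * the unaligned windows src[0..7] (mm0) and src[1..8] (mm1) from two
     * aligned movq loads combined with psrlq/psllq/por; sh1/sh2 are the
     * matching bit-shift counts.
     */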
    src -= srcos;
    asm volatile(
            /* mm0 = src[0..7], mm1 = src[1..8] */
            "movq %0, %%mm1\n\t"
            "movq %1, %%mm0\n\t"
            "psrlq %2, %%mm1\n\t"
            "psllq %3, %%mm0\n\t"
            "movq %%mm0, %%mm4\n\t"
            "psllq $8, %%mm0\n\t"
            "por %%mm1, %%mm0\n\t"
            "psrlq $8, %%mm1\n\t"
            "por %%mm4, %%mm1\n\t"
            : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2));
    for(i=0; i<h; i++) {
        asm volatile(
                /* [mm2,mm3] = A * src[0..7] */
                "movq %%mm0, %%mm2\n\t"
                "punpcklbw %%mm7, %%mm2\n\t"
                "pmullw %0, %%mm2\n\t"
                "movq %%mm0, %%mm3\n\t"
                "punpckhbw %%mm7, %%mm3\n\t"
                "pmullw %0, %%mm3\n\t"
                /* [mm2,mm3] += B * src[1..8] */
                "movq %%mm1, %%mm0\n\t"
                "punpcklbw %%mm7, %%mm0\n\t"
                "pmullw %%mm5, %%mm0\n\t"
                "punpckhbw %%mm7, %%mm1\n\t"
                "pmullw %%mm5, %%mm1\n\t"
                "paddw %%mm0, %%mm2\n\t"
                "paddw %%mm1, %%mm3\n\t"
                : : "m" (AA));
        src += stride;
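        /*
         * Load the next row into mm0/mm1.  It supplies the C and D terms
         * below, then carries over as the A and B input of the next
         * iteration, so each source row is read only once.
         */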
        asm volatile(
                /* mm0 = src[0..7], mm1 = src[1..8] */
                "movq %0, %%mm1\n\t"
                "movq %1, %%mm0\n\t"
                "psrlq %2, %%mm1\n\t"
                "psllq %3, %%mm0\n\t"
                "movq %%mm0, %%mm4\n\t"
                "psllq $8, %%mm0\n\t"
                "por %%mm1, %%mm0\n\t"
                "psrlq $8, %%mm1\n\t"
                "por %%mm4, %%mm1\n\t"
                : : "m" (src[0]), "m" (src[8]), "m" (sh1), "m" (sh2));
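        /* The next statement has no operand list, so it is basic (not
           extended) asm and registers are written with a single '%'. */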
        asm volatile(
                /* [mm2,mm3] += C * src[0..7] */
                "movq %mm0, %mm4\n\t"
                "punpcklbw %mm7, %mm4\n\t"
                "pmullw %mm6, %mm4\n\t"
                "paddw %mm4, %mm2\n\t"
                "movq %mm0, %mm4\n\t"
                "punpckhbw %mm7, %mm4\n\t"
                "pmullw %mm6, %mm4\n\t"
                "paddw %mm4, %mm3\n\t");
        asm volatile(
                /* [mm2,mm3] += D * src[1..8] */
                "movq %%mm1, %%mm4\n\t"
                "punpcklbw %%mm7, %%mm4\n\t"
                "pmullw %0, %%mm4\n\t"
                "paddw %%mm4, %%mm2\n\t"
                "movq %%mm1, %%mm4\n\t"
                "punpckhbw %%mm7, %%mm4\n\t"
                "pmullw %0, %%mm4\n\t"
                "paddw %%mm4, %%mm3\n\t"
                : : "m" (DD));
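        /* The weights sum to 64, so adding 32 before the >>6 rounds the
           result to nearest. */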
        asm volatile(
                /* dst[0..7] = pack(([mm2,mm3] + 32) >> 6) */
                "paddw %1, %%mm2\n\t"
                "paddw %1, %%mm3\n\t"
                "psrlw $6, %%mm2\n\t"
                "psrlw $6, %%mm3\n\t"
                "packuswb %%mm3, %%mm2\n\t"
                H264_CHROMA_OP(%0, %%mm2)
                "movq %%mm2, %0\n\t"
                : "=m" (dst[0]) : "m" (ff_pw_32));
        dst += stride;
    }
}
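
#if 0
/*
 * For reference, a plain-C sketch of what the "put" instantiation of this
 * template computes (illustrative only, not part of the original file;
 * the function name is hypothetical, and x, y are in [0,8) as asserted
 * above).
 */
static void put_h264_chroma_mc8_c_ref(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y)
{
    const int A = (8 - x) * (8 - y);
    const int B = x * (8 - y);
    const int C = (8 - x) * y;
    const int D = x * y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            /* bilinear blend of the four neighbours, rounded to nearest */
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      32) >> 6;
        }
        src += stride;
        dst += stride;
    }
}
#endif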