You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

180 lines
7.5KB

  1. /*
  2. * DSP utils mmx functions are compiled twice for rnd/no_rnd
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  7. * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
  8. * and improved by Zdenek Kabelac <kabi@users.sf.net>
  9. *
  10. * This file is part of Libav.
  11. *
  12. * Libav is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU Lesser General Public
  14. * License as published by the Free Software Foundation; either
  15. * version 2.1 of the License, or (at your option) any later version.
  16. *
  17. * Libav is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  20. * Lesser General Public License for more details.
  21. *
  22. * You should have received a copy of the GNU Lesser General Public
  23. * License along with Libav; if not, write to the Free Software
  24. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25. */
  26. // put_pixels
  /*
   * 8-pixel-wide "xy2" put: every stored byte is built from a 2x2 group of
   * source bytes,
   *     block[x] = (pixels[x] + pixels[x + 1] +
   *                 pixels[x + line_size] + pixels[x + line_size + 1] + R) >> 2
   * where R is the per-word constant that SET_RND() leaves in mm6.
   *
   * block     - destination; 8 bytes are stored per row, rows line_size apart
   * pixels    - source; 9 bytes are read per row (8-byte loads at offsets 0
   *             and 1) from h + 1 consecutive rows
   * line_size - byte distance between rows, applied to both block and pixels
   * h         - number of rows to produce; each loop pass writes two rows and
   *             the loop ends only when h, reduced by 2 per pass, reaches
   *             exactly 0, so h has to be a positive even number
   *
   * NOTE(review): DEF, MOVQ_ZERO, SET_RND and REG_a are macros defined outside
   * this block. The unpack steps zero-extend bytes to words only if MOVQ_ZERO
   * really clears mm7, and mm6/mm7 must survive from those macros into the asm
   * statement below - confirm against their definitions.
   */
  27. static void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  28. {
  29. MOVQ_ZERO(mm7); // NOTE(review): presumably clears mm7; mm7 is the second operand of every unpack below
  30. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  31. __asm__ volatile(
  32. "movq (%1), %%mm0 \n\t" // mm0 = 8 bytes of the first source row
  33. "movq 1(%1), %%mm4 \n\t" // mm4 = the same row starting one byte further right
  34. "movq %%mm0, %%mm1 \n\t"
  35. "movq %%mm4, %%mm5 \n\t"
  36. "punpcklbw %%mm7, %%mm0 \n\t" // low 4 bytes of each copy interleaved with mm7 -> 4 words
  37. "punpcklbw %%mm7, %%mm4 \n\t"
  38. "punpckhbw %%mm7, %%mm1 \n\t" // high 4 bytes of each copy interleaved with mm7 -> 4 words
  39. "punpckhbw %%mm7, %%mm5 \n\t"
  40. "paddusw %%mm0, %%mm4 \n\t" // mm4 = row[x] + row[x+1] for pixels 0..3
  41. "paddusw %%mm1, %%mm5 \n\t" // mm5 = row[x] + row[x+1] for pixels 4..7
  42. "xor %%"REG_a", %%"REG_a" \n\t" // REG_a = byte offset of the current output row, starts at 0
  43. "add %3, %1 \n\t" // move pixels down one row: (%1, REG_a) is now the row below output row (%2, REG_a)
  44. ".p2align 3 \n\t"
  45. "1: \n\t" // loop top: mm4/mm5 hold the horizontal pair sums of the upper row
  46. "movq (%1, %%"REG_a"), %%mm0 \n\t" // load the lower row ...
  47. "movq 1(%1, %%"REG_a"), %%mm2 \n\t" // ... and its copy shifted by one byte
  48. "movq %%mm0, %%mm1 \n\t"
  49. "movq %%mm2, %%mm3 \n\t"
  50. "punpcklbw %%mm7, %%mm0 \n\t"
  51. "punpcklbw %%mm7, %%mm2 \n\t"
  52. "punpckhbw %%mm7, %%mm1 \n\t"
  53. "punpckhbw %%mm7, %%mm3 \n\t"
  54. "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = horizontal pair sums of the lower row;
  55. "paddusw %%mm3, %%mm1 \n\t" // they are reused as the upper row of the next output row
  56. "paddusw %%mm6, %%mm4 \n\t" // add the rounding constant to the upper-row sums
  57. "paddusw %%mm6, %%mm5 \n\t"
  58. "paddusw %%mm0, %%mm4 \n\t" // add the lower-row sums: four source bytes + R per word
  59. "paddusw %%mm1, %%mm5 \n\t"
  60. "psrlw $2, %%mm4 \n\t" // divide each word by 4
  61. "psrlw $2, %%mm5 \n\t"
  62. "packuswb %%mm5, %%mm4 \n\t" // narrow the 8 words back to 8 bytes
  63. "movq %%mm4, (%2, %%"REG_a") \n\t" // store the output row
  64. "add %3, %%"REG_a" \n\t" // step to the next row
  65. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 (second row of the pass: mm0/mm1 are now the carried upper-row sums, mm4/mm5 receive the new row's sums)
  66. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  67. "movq %%mm2, %%mm3 \n\t"
  68. "movq %%mm4, %%mm5 \n\t"
  69. "punpcklbw %%mm7, %%mm2 \n\t"
  70. "punpcklbw %%mm7, %%mm4 \n\t"
  71. "punpckhbw %%mm7, %%mm3 \n\t"
  72. "punpckhbw %%mm7, %%mm5 \n\t"
  73. "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = horizontal pair sums of the new lower row,
  74. "paddusw %%mm3, %%mm5 \n\t" // left in place for the next pass through label 1
  75. "paddusw %%mm6, %%mm0 \n\t" // rounding constant onto the carried sums
  76. "paddusw %%mm6, %%mm1 \n\t"
  77. "paddusw %%mm4, %%mm0 \n\t" // four source bytes + R per word
  78. "paddusw %%mm5, %%mm1 \n\t"
  79. "psrlw $2, %%mm0 \n\t" // divide each word by 4
  80. "psrlw $2, %%mm1 \n\t"
  81. "packuswb %%mm1, %%mm0 \n\t" // narrow to 8 bytes
  82. "movq %%mm0, (%2, %%"REG_a") \n\t" // store the second output row of this pass
  83. "add %3, %%"REG_a" \n\t"
  84. "subl $2, %0 \n\t" // two rows were written in this pass
  85. "jnz 1b \n\t" // repeat until h is exactly 0
  86. :"+g"(h), "+S"(pixels) // %0 = h, %1 = pixels; both are modified by the asm
  87. :"D"(block), "r"((x86_reg)line_size) // %2 = block, %3 = line_size
  88. :REG_a, "memory"); // REG_a is used as scratch; "memory" because block is written through %2
  89. }
  90. // avg_pixels
  91. // this routine is 'slightly' suboptimal but mostly unused
  /*
   * 8-pixel-wide "xy2" avg: for every output byte the value
   *     v = (pixels[x] + pixels[x + 1] +
   *          pixels[x + line_size] + pixels[x + line_size + 1] + R) >> 2
   * is computed (R = the per-word constant SET_RND() leaves in mm6), then
   * combined through OP_AVG with the byte already stored in block, and the
   * OP_AVG result is written back to block.
   *
   * block     - destination, read and then overwritten; 8 bytes per row,
   *             rows line_size apart
   * pixels    - source; 9 bytes are read per row (8-byte loads at offsets 0
   *             and 1) from h + 1 consecutive rows
   * line_size - byte distance between rows, applied to both block and pixels
   * h         - number of rows to produce; each loop pass writes two rows and
   *             the loop ends only when h, reduced by 2 per pass, reaches
   *             exactly 0, so h has to be a positive even number
   *
   * NOTE(review): DEF, MOVQ_ZERO, SET_RND, OP_AVG and REG_a are macros defined
   * outside this block. The unpack steps zero-extend bytes to words only if
   * MOVQ_ZERO really clears mm7, and mm6/mm7 must survive from those macros
   * into the asm statement below - confirm against their definitions.
   */
  92. static void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  93. {
  94. MOVQ_ZERO(mm7); // NOTE(review): presumably clears mm7; mm7 is the second operand of every unpack below
  95. SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
  96. __asm__ volatile(
  97. "movq (%1), %%mm0 \n\t" // mm0 = 8 bytes of the first source row
  98. "movq 1(%1), %%mm4 \n\t" // mm4 = the same row starting one byte further right
  99. "movq %%mm0, %%mm1 \n\t"
  100. "movq %%mm4, %%mm5 \n\t"
  101. "punpcklbw %%mm7, %%mm0 \n\t" // low 4 bytes of each copy interleaved with mm7 -> 4 words
  102. "punpcklbw %%mm7, %%mm4 \n\t"
  103. "punpckhbw %%mm7, %%mm1 \n\t" // high 4 bytes of each copy interleaved with mm7 -> 4 words
  104. "punpckhbw %%mm7, %%mm5 \n\t"
  105. "paddusw %%mm0, %%mm4 \n\t" // mm4 = row[x] + row[x+1] for pixels 0..3
  106. "paddusw %%mm1, %%mm5 \n\t" // mm5 = row[x] + row[x+1] for pixels 4..7
  107. "xor %%"REG_a", %%"REG_a" \n\t" // REG_a = byte offset of the current output row, starts at 0
  108. "add %3, %1 \n\t" // move pixels down one row: (%1, REG_a) is now the row below output row (%2, REG_a)
  109. ".p2align 3 \n\t"
  110. "1: \n\t" // loop top: mm4/mm5 hold the horizontal pair sums of the upper row
  111. "movq (%1, %%"REG_a"), %%mm0 \n\t" // load the lower row ...
  112. "movq 1(%1, %%"REG_a"), %%mm2 \n\t" // ... and its copy shifted by one byte
  113. "movq %%mm0, %%mm1 \n\t"
  114. "movq %%mm2, %%mm3 \n\t"
  115. "punpcklbw %%mm7, %%mm0 \n\t"
  116. "punpcklbw %%mm7, %%mm2 \n\t"
  117. "punpckhbw %%mm7, %%mm1 \n\t"
  118. "punpckhbw %%mm7, %%mm3 \n\t"
  119. "paddusw %%mm2, %%mm0 \n\t" // mm0/mm1 = horizontal pair sums of the lower row;
  120. "paddusw %%mm3, %%mm1 \n\t" // they are reused as the upper row of the next output row
  121. "paddusw %%mm6, %%mm4 \n\t" // add the rounding constant to the upper-row sums
  122. "paddusw %%mm6, %%mm5 \n\t"
  123. "paddusw %%mm0, %%mm4 \n\t" // add the lower-row sums: four source bytes + R per word
  124. "paddusw %%mm1, %%mm5 \n\t"
  125. "psrlw $2, %%mm4 \n\t" // divide each word by 4
  126. "psrlw $2, %%mm5 \n\t"
  127. "movq (%2, %%"REG_a"), %%mm3 \n\t" // mm3 = the 8 bytes currently stored in this block row
  128. "packuswb %%mm5, %%mm4 \n\t" // mm4 = the 8 newly computed bytes
  129. "pcmpeqd %%mm2, %%mm2 \n\t" // mm2 = all bits set
  130. "paddb %%mm2, %%mm2 \n\t" // each byte 0xFF + 0xFF wraps to 0xFE
  131. OP_AVG(%%mm3, %%mm4, %%mm5, %%mm2) // NOTE(review): macro defined elsewhere; presumably mm5 = average of mm3 and mm4 with mm2 as the 0xFE mask - confirm
  132. "movq %%mm5, (%2, %%"REG_a") \n\t" // store the OP_AVG result as the output row
  133. "add %3, %%"REG_a" \n\t" // step to the next row
  134. "movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 (second row of the pass: mm0/mm1 are now the carried upper-row sums, mm4/mm5 receive the new row's sums)
  135. "movq 1(%1, %%"REG_a"), %%mm4 \n\t"
  136. "movq %%mm2, %%mm3 \n\t"
  137. "movq %%mm4, %%mm5 \n\t"
  138. "punpcklbw %%mm7, %%mm2 \n\t"
  139. "punpcklbw %%mm7, %%mm4 \n\t"
  140. "punpckhbw %%mm7, %%mm3 \n\t"
  141. "punpckhbw %%mm7, %%mm5 \n\t"
  142. "paddusw %%mm2, %%mm4 \n\t" // mm4/mm5 = horizontal pair sums of the new lower row,
  143. "paddusw %%mm3, %%mm5 \n\t" // left in place for the next pass through label 1
  144. "paddusw %%mm6, %%mm0 \n\t" // rounding constant onto the carried sums
  145. "paddusw %%mm6, %%mm1 \n\t"
  146. "paddusw %%mm4, %%mm0 \n\t" // four source bytes + R per word
  147. "paddusw %%mm5, %%mm1 \n\t"
  148. "psrlw $2, %%mm0 \n\t" // divide each word by 4
  149. "psrlw $2, %%mm1 \n\t"
  150. "movq (%2, %%"REG_a"), %%mm3 \n\t" // mm3 = the 8 bytes currently stored in this block row
  151. "packuswb %%mm1, %%mm0 \n\t" // mm0 = the 8 newly computed bytes
  152. "pcmpeqd %%mm2, %%mm2 \n\t" // rebuild the 0xFE-per-byte constant (mm2 was reused for pixel data above)
  153. "paddb %%mm2, %%mm2 \n\t"
  154. OP_AVG(%%mm3, %%mm0, %%mm1, %%mm2) // NOTE(review): presumably mm1 = average of mm3 and mm0 - confirm against the macro
  155. "movq %%mm1, (%2, %%"REG_a") \n\t" // store the second output row of this pass
  156. "add %3, %%"REG_a" \n\t"
  157. "subl $2, %0 \n\t" // two rows were written in this pass
  158. "jnz 1b \n\t" // repeat until h is exactly 0
  159. :"+g"(h), "+S"(pixels) // %0 = h, %1 = pixels; both are modified by the asm
  160. :"D"(block), "r"((x86_reg)line_size) // %2 = block, %3 = line_size
  161. :REG_a, "memory"); // REG_a is used as scratch; "memory" because block is read and written through %2
  162. }
  163. //FIXME optimize
  164. static void DEF(put, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
  165. DEF(put, pixels8_xy2)(block , pixels , line_size, h);
  166. DEF(put, pixels8_xy2)(block+8, pixels+8, line_size, h);
  167. }
  168. static void DEF(avg, pixels16_xy2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h){
  169. DEF(avg, pixels8_xy2)(block , pixels , line_size, h);
  170. DEF(avg, pixels8_xy2)(block+8, pixels+8, line_size, h);
  171. }