You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

224 lines
6.1KB

  1. /*
  2. * Alpha optimized DSP utils
  3. * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include "asm.h"
  20. #include "../dsputil.h"
  21. void simple_idct_axp(DCTELEM *block);
  22. static void put_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
  23. int line_size)
  24. {
  25. int i = 8;
  26. do {
  27. UINT64 shorts;
  28. shorts = ldq(block);
  29. shorts = maxsw4(shorts, 0);
  30. shorts = minsw4(shorts, WORD_VEC(0x00ff));
  31. stl(pkwb(shorts), pixels);
  32. shorts = ldq(block + 4);
  33. shorts = maxsw4(shorts, 0);
  34. shorts = minsw4(shorts, WORD_VEC(0x00ff));
  35. stl(pkwb(shorts), pixels + 4);
  36. pixels += line_size;
  37. block += 8;
  38. } while (--i);
  39. }
  40. static void add_pixels_clamped_axp(const DCTELEM *block, UINT8 *pixels,
  41. int line_size)
  42. {
  43. int i = 8;
  44. do {
  45. UINT64 shorts;
  46. shorts = ldq(block);
  47. shorts &= ~WORD_VEC(0x8000); /* clear highest bit to avoid overflow */
  48. shorts += unpkbw(ldl(pixels));
  49. shorts &= ~WORD_VEC(0x8000); /* hibit would be set for e. g. -2 + 3 */
  50. shorts = minuw4(shorts, WORD_VEC(0x4000)); /* set neg. to 0x4000 */
  51. shorts &= ~WORD_VEC(0x4000); /* ...and zap them */
  52. shorts = minsw4(shorts, WORD_VEC(0x00ff)); /* clamp to 255 */
  53. stl(pkwb(shorts), pixels);
  54. /* next 4 */
  55. shorts = ldq(block + 4);
  56. shorts &= ~WORD_VEC(0x8000);
  57. shorts += unpkbw(ldl(pixels + 4));
  58. shorts &= ~WORD_VEC(0x8000);
  59. shorts = minuw4(shorts, WORD_VEC(0x4000));
  60. shorts &= ~WORD_VEC(0x4000);
  61. shorts = minsw4(shorts, WORD_VEC(0x00ff));
  62. stl(pkwb(shorts), pixels + 4);
  63. pixels += line_size;
  64. block += 8;
  65. } while (--i);
  66. }
  67. /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
  68. Since the immediate result could be greater than 255, we do the
  69. shift first. The result is too low by one if the bytes were both
  70. odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
  71. static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
  72. {
  73. UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
  74. l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
  75. l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
  76. return l1 + l2 + correction;
  77. }
  78. /* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
  79. The '1' only has an effect when one byte is even and the other odd,
  80. i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
  81. Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
  82. static inline UINT64 avg2(UINT64 l1, UINT64 l2)
  83. {
  84. UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
  85. l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
  86. l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
  87. return l1 + l2 + correction;
  88. }
  89. static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
  90. {
  91. UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
  92. + ((l2 & ~BYTE_VEC(0x03)) >> 2)
  93. + ((l3 & ~BYTE_VEC(0x03)) >> 2)
  94. + ((l4 & ~BYTE_VEC(0x03)) >> 2);
  95. UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
  96. + (l2 & BYTE_VEC(0x03))
  97. + (l3 & BYTE_VEC(0x03))
  98. + (l4 & BYTE_VEC(0x03))
  99. + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
  100. return r1 + r2;
  101. }
  102. static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
  103. {
  104. UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
  105. + ((l2 & ~BYTE_VEC(0x03)) >> 2)
  106. + ((l3 & ~BYTE_VEC(0x03)) >> 2)
  107. + ((l4 & ~BYTE_VEC(0x03)) >> 2);
  108. UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
  109. + (l2 & BYTE_VEC(0x03))
  110. + (l3 & BYTE_VEC(0x03))
  111. + (l4 & BYTE_VEC(0x03))
  112. + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
  113. return r1 + r2;
  114. }
  115. #define PIXOPNAME(suffix) put ## suffix
  116. #define BTYPE UINT8
  117. #define AVG2 avg2
  118. #define AVG4 avg4
  119. #define STORE(l, b) stq(l, b)
  120. #include "pixops.h"
  121. #undef PIXOPNAME
  122. #undef BTYPE
  123. #undef AVG2
  124. #undef AVG4
  125. #undef STORE
  126. #define PIXOPNAME(suffix) put_no_rnd ## suffix
  127. #define BTYPE UINT8
  128. #define AVG2 avg2_no_rnd
  129. #define AVG4 avg4_no_rnd
  130. #define STORE(l, b) stq(l, b)
  131. #include "pixops.h"
  132. #undef PIXOPNAME
  133. #undef BTYPE
  134. #undef AVG2
  135. #undef AVG4
  136. #undef STORE
  /* The following functions are untested. */
  #if 0

  /* Family "avg": rounding averages; STORE averages the new quadword with
     the one already at the destination before writing it back.  The macro
     deliberately carries no trailing semicolon, like every other STORE
     definition in this file, so that "STORE(x, y);" expands to exactly one
     statement. */
  #define PIXOPNAME(suffix) avg ## suffix
  #define BTYPE UINT8
  #define AVG2 avg2
  #define AVG4 avg4
  #define STORE(l, b) stq(AVG2(l, ldq(b)), b)
  #include "pixops.h"
  #undef PIXOPNAME
  #undef BTYPE
  #undef AVG2
  #undef AVG4
  #undef STORE

  /* Family "avg_no_rnd": as "avg", but using the no_rnd averages, also for
     the merge with the destination. */
  #define PIXOPNAME(suffix) avg_no_rnd ## suffix
  #define BTYPE UINT8
  #define AVG2 avg2_no_rnd
  #define AVG4 avg4_no_rnd
  #define STORE(l, b) stq(AVG2(l, ldq(b)), b)
  #include "pixops.h"
  #undef PIXOPNAME
  #undef BTYPE
  #undef AVG2
  #undef AVG4
  #undef STORE

  /* Family "sub": the destination is an array of DCTELEM; STORE subtracts
     each of the eight bytes of the quadword l, lowest byte first, from the
     corresponding element block[0..7]. */
  #define PIXOPNAME(suffix) sub ## suffix
  #define BTYPE DCTELEM
  #define AVG2 avg2
  #define AVG4 avg4
  #define STORE(l, block) do { \
      UINT64 xxx = (l); \
      (block)[0] -= (xxx >> 0) & 0xff; \
      (block)[1] -= (xxx >> 8) & 0xff; \
      (block)[2] -= (xxx >> 16) & 0xff; \
      (block)[3] -= (xxx >> 24) & 0xff; \
      (block)[4] -= (xxx >> 32) & 0xff; \
      (block)[5] -= (xxx >> 40) & 0xff; \
      (block)[6] -= (xxx >> 48) & 0xff; \
      (block)[7] -= (xxx >> 56) & 0xff; \
  } while (0)
  #include "pixops.h"
  #undef PIXOPNAME
  #undef BTYPE
  #undef AVG2
  #undef AVG4
  #undef STORE

  #endif
  183. void dsputil_init_alpha(void)
  184. {
  185. put_pixels_tab[0] = put_pixels_axp;
  186. put_pixels_tab[1] = put_pixels_x2_axp;
  187. put_pixels_tab[2] = put_pixels_y2_axp;
  188. put_pixels_tab[3] = put_pixels_xy2_axp;
  189. put_no_rnd_pixels_tab[0] = put_pixels_axp;
  190. put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
  191. put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
  192. put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
  193. /* amask clears all bits that correspond to present features. */
  194. if (amask(AMASK_MVI) == 0) {
  195. fprintf(stderr, "MVI extension detected\n");
  196. put_pixels_clamped = put_pixels_clamped_axp;
  197. add_pixels_clamped = add_pixels_clamped_axp;
  198. }
  199. }