You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

288 lines
8.1KB

  1. /*
  2. * Alpha optimized DSP utils
  3. * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. */
  19. #include "asm.h"
  20. #include "../dsputil.h"
  21. void simple_idct_axp(DCTELEM *block);
  22. void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
  23. int line_size, int h);
  24. void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
  25. int line_size);
  26. void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
  27. int line_size);
  28. #if 0
  29. /* These functions were the base for the optimized assembler routines,
  30. and remain here for documentation purposes. */
/* Store an 8x8 block of DCT coefficients as 8-bit pixels, clamping
   each value to 0..255.  Reference C version of
   put_pixels_clamped_mvi_asm; compiled out (#if 0), kept for
   documentation. */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    /* zap clears the bytes selected by mask 0xaa, leaving a vector of
       0x00ff words used as the per-word upper clamp. */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);                 /* first four 16-bit coefficients */
        shorts0 = maxsw4(shorts0, 0);         /* clamp below at 0 */
        shorts0 = minsw4(shorts0, clampmask); /* clamp above at 255 */
        stl(pkwb(shorts0), pixels);           /* pack words to bytes, store 4 */

        shorts1 = ldq(block + 4);             /* next four coefficients */
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}
/* Add an 8x8 block of DCT coefficients to existing 8-bit pixels,
   clamping the result to 0..255.  Reference C version of
   add_pixels_clamped_mvi_asm; compiled out (#if 0), kept for
   documentation. */
void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask = zap(-1, 0x33);
    signmask ^= signmask >> 1; /* 0x8000800080008000 */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);          /* first four coefficients */
        shorts1 = ldq(block + 4);      /* next four */
        pix0 = unpkbw(ldl(pixels));    /* widen 4 pixel bytes to 16-bit words */

        /* Signed subword add (MMX paddw): strip the sign bits, add,
           then restore signs with xor so carries cannot cross words. */
        signs0 = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;

        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4. */
        pix1 = unpkbw(ldl(pixels + 4));
        signs1 = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);    /* pack back to bytes and store */
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
  89. #endif
  90. static void clear_blocks_axp(DCTELEM *blocks) {
  91. uint64_t *p = (uint64_t *) blocks;
  92. int n = sizeof(DCTELEM) * 6 * 64;
  93. do {
  94. p[0] = 0;
  95. p[1] = 0;
  96. p[2] = 0;
  97. p[3] = 0;
  98. p[4] = 0;
  99. p[5] = 0;
  100. p[6] = 0;
  101. p[7] = 0;
  102. p += 8;
  103. n -= 8 * 8;
  104. } while (n);
  105. }
  106. static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
  107. {
  108. return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
  109. }
  110. static inline uint64_t avg2(uint64_t a, uint64_t b)
  111. {
  112. return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
  113. }
  114. static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
  115. {
  116. uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
  117. + ((l2 & ~BYTE_VEC(0x03)) >> 2)
  118. + ((l3 & ~BYTE_VEC(0x03)) >> 2)
  119. + ((l4 & ~BYTE_VEC(0x03)) >> 2);
  120. uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
  121. + (l2 & BYTE_VEC(0x03))
  122. + (l3 & BYTE_VEC(0x03))
  123. + (l4 & BYTE_VEC(0x03))
  124. + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
  125. return r1 + r2;
  126. }
  127. static inline uint64_t avg4_no_rnd(uint64_t l1, uint64_t l2,
  128. uint64_t l3, uint64_t l4)
  129. {
  130. uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
  131. + ((l2 & ~BYTE_VEC(0x03)) >> 2)
  132. + ((l3 & ~BYTE_VEC(0x03)) >> 2)
  133. + ((l4 & ~BYTE_VEC(0x03)) >> 2);
  134. uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
  135. + (l2 & BYTE_VEC(0x03))
  136. + (l3 & BYTE_VEC(0x03))
  137. + (l4 & BYTE_VEC(0x03))
  138. + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
  139. return r1 + r2;
  140. }
/* The OP* macros implement the loop body of one put/avg variant each;
   `pixels`, `line_size`, `block` and `h` are locals of the generated
   function (see MAKE_OP).  LOAD is ldq (aligned) or uldq (unaligned),
   STORE writes or averages into the destination, INCR advances the
   destination pointer per row. */

/* Full-pel: straight 8-byte copy per line. */
#define OP(LOAD, STORE, INCR)       \
    do {                            \
        STORE(LOAD(pixels), block); \
        pixels += line_size;        \
        block += INCR;              \
    } while (--h)

/* Horizontal half-pel: average each 8-byte group with itself shifted
   one pixel right; the ninth source byte is fetched separately. */
#define OP_X2(LOAD, STORE, INCR)                             \
    do {                                                     \
        uint64_t pix1, pix2;                                 \
                                                             \
        pix1 = LOAD(pixels);                                 \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);     \
        STORE(AVG2(pix1, pix2), block);                      \
        pixels += line_size;                                 \
        block += INCR;                                       \
    } while (--h)

/* Vertical half-pel: average each line with the next; every source
   line is loaded once and carried into the following iteration. */
#define OP_Y2(LOAD, STORE, INCR)               \
    do {                                       \
        uint64_t pix = LOAD(pixels);           \
        do {                                   \
            uint64_t next_pix;                 \
                                               \
            pixels += line_size;               \
            next_pix = LOAD(pixels);           \
            STORE(AVG2(pix, next_pix), block); \
            block += INCR;                     \
            pix = next_pix;                    \
        } while (--h);                         \
    } while (0)

/* Diagonal half-pel: four-way average of the current and the next
   line, each in original and one-pixel-shifted position. */
#define OP_XY2(LOAD, STORE, INCR)                                      \
    do {                                                               \
        uint64_t pix1 = LOAD(pixels);                                  \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);      \
                                                                       \
        do {                                                           \
            uint64_t next_pix1, next_pix2;                             \
                                                                       \
            pixels += line_size;                                       \
            next_pix1 = LOAD(pixels);                                  \
            next_pix2 = next_pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
                                                                       \
            STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);      \
                                                                       \
            block += INCR;                                             \
            pix1 = next_pix1;                                          \
            pix2 = next_pix2;                                          \
        } while (--h);                                                 \
    } while (0)
/* MAKE_OP expands to one complete pixel routine.  Source alignment is
   tested at run time: unaligned sources dispatch to uldq, aligned
   ones use the cheaper ldq. */
#define MAKE_OP(BTYPE, OPNAME, SUFF, OPKIND, STORE, INCR)            \
static void OPNAME ## _pixels ## SUFF ## _axp(BTYPE *block,          \
                                              const uint8_t *pixels, \
                                              int line_size, int h)  \
{                                                                    \
    if ((size_t) pixels & 0x7) {                                     \
        OPKIND(uldq, STORE, INCR);                                   \
    } else {                                                         \
        OPKIND(ldq, STORE, INCR);                                    \
    }                                                                \
}

/* Generate the full set of variants: full-pel, x, y and xy half-pel. */
#define PIXOP(BTYPE, OPNAME, STORE, INCR)                \
    MAKE_OP(BTYPE, OPNAME, , OP, STORE, INCR);           \
    MAKE_OP(BTYPE, OPNAME, _x2, OP_X2, STORE, INCR);     \
    MAKE_OP(BTYPE, OPNAME, _y2, OP_Y2, STORE, INCR);     \
    MAKE_OP(BTYPE, OPNAME, _xy2, OP_XY2, STORE, INCR);
/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
/* put: plain store. */
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put, STORE, line_size);

#undef STORE
/* avg: average the new value with what is already at the destination. */
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg, STORE, line_size);

/* Not rounding primitives. */
#undef AVG2
#undef AVG4
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put_no_rnd, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg_no_rnd, STORE, line_size);
/* Install the Alpha-optimized routines into the global dsputil
   function-pointer tables (declared in ../dsputil.h). */
void dsputil_init_alpha(void)
{
    /* NOTE(review): table indices presumably mean 0 = full-pel,
       1 = x half-pel, 2 = y half-pel, 3 = xy half-pel, matching the
       _x2/_y2/_xy2 suffixes — confirm against dsputil.h. */
    put_pixels_tab[0] = put_pixels_axp_asm;
    put_pixels_tab[1] = put_pixels_x2_axp;
    put_pixels_tab[2] = put_pixels_y2_axp;
    put_pixels_tab[3] = put_pixels_xy2_axp;

    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

    avg_pixels_tab[0] = avg_pixels_axp;
    avg_pixels_tab[1] = avg_pixels_x2_axp;
    avg_pixels_tab[2] = avg_pixels_y2_axp;
    avg_pixels_tab[3] = avg_pixels_xy2_axp;

    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;

    clear_blocks = clear_blocks_axp;

    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        /* CPU has MVI: use the MVI assembler clamp routines. */
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
}