/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "asm.h"
#include "../dsputil.h"

void simple_idct_axp(DCTELEM *block);

void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
                        int line_size, int h);
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes. */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-). */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0 = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw). */
        signs0 = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4. */
        pix1 = unpkbw(ldl(pixels + 4));
        signs1 = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
#endif
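
/* Clear all six 64-coefficient blocks; n counts the remaining bytes and
   every iteration zeroes eight quadwords (64 bytes). */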
static void clear_blocks_axp(DCTELEM *blocks) {
    uint64_t *p = (uint64_t *) blocks;
    int n = sizeof(DCTELEM) * 6 * 64;

    do {
        p[0] = 0;
        p[1] = 0;
        p[2] = 0;
        p[3] = 0;
        p[4] = 0;
        p[5] = 0;
        p[6] = 0;
        p[7] = 0;
        p += 8;
        n -= 8 * 8;
    } while (n);
}
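
/* avg2 and avg2_no_rnd average eight packed bytes at once using the
   carry-free identity a + b = 2 * (a & b) + (a ^ b):
       (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1)  ==  (a + b) >> 1      (truncated)
       (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1)  ==  (a + b + 1) >> 1  (rounded)
   per byte. The BYTE_VEC(0xfe) mask clears each byte's low bit before the
   shift, so no bit crosses into the neighbouring byte lane. */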
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
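
#if 0
/* Scalar, single-byte reference for the helpers above; a sketch kept for
   documentation purposes in the spirit of the other #if 0 blocks in this
   file, not part of the build. The _ref names are illustrative only. */
static uint8_t avg2_ref(uint8_t a, uint8_t b)
{
    return (a + b + 1) >> 1;    /* rounded, matches avg2 */
}

static uint8_t avg2_no_rnd_ref(uint8_t a, uint8_t b)
{
    return (a + b) >> 1;        /* truncated, matches avg2_no_rnd */
}
#endif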

#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif

#define OP(LOAD, STORE)                     \
    do {                                    \
        STORE(LOAD(pixels), block);         \
        pixels += line_size;                \
        block += line_size;                 \
    } while (--h)
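
/* In the _x2 and _xy2 op kinds below, the second operand is the same row
   shifted right by one byte with pixels[8] refilled into the top byte,
   i.e. pixels 1..8; this layout depends on Alpha's little-endian byte
   order. */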
#define OP_X2(LOAD, STORE)                                  \
    do {                                                    \
        uint64_t pix1, pix2;                                \
                                                            \
        pix1 = LOAD(pixels);                                \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);    \
        STORE(AVG2(pix1, pix2), block);                     \
        pixels += line_size;                                \
        block += line_size;                                 \
    } while (--h)

#define OP_Y2(LOAD, STORE)                          \
    do {                                            \
        uint64_t pix = LOAD(pixels);                \
        do {                                        \
            uint64_t next_pix;                      \
                                                    \
            pixels += line_size;                    \
            next_pix = LOAD(pixels);                \
            STORE(AVG2(pix, next_pix), block);      \
            block += line_size;                     \
            pix = next_pix;                         \
        } while (--h);                              \
    } while (0)

#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1 = LOAD(pixels);                                       \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);           \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1 = LOAD(pixels);                                           \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56);              \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h;                                           \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l = npix_l;                                                 \
            pix_h = npix_h;                                                 \
        } while (--h);                                                      \
    } while (0)
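
/* MAKE_OP selects the load primitive at run time: uldq for sources that
   are not 8-byte aligned, plain ldq when they are. */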
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                        \
static void OPNAME ## _pixels ## SUFF ## _axp                       \
        (uint8_t *restrict block, const uint8_t *restrict pixels,   \
         int line_size, int h)                                      \
{                                                                   \
    if ((size_t) pixels & 0x7) {                                    \
        OPKIND(uldq, STORE);                                        \
    } else {                                                        \
        OPKIND(ldq, STORE);                                         \
    }                                                               \
}

#define PIXOP(OPNAME, STORE)                \
    MAKE_OP(OPNAME, ,     OP,     STORE)    \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)    \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)    \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
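
/* Each PIXOP invocation below thus instantiates four functions, e.g.
   PIXOP(put, STORE) defines put_pixels_axp, put_pixels_x2_axp,
   put_pixels_y2_axp and put_pixels_xy2_axp. */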

/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);

/* Non-rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);

void dsputil_init_alpha(void)
{
    put_pixels_tab[0] = put_pixels_axp_asm;
    put_pixels_tab[1] = put_pixels_x2_axp;
    put_pixels_tab[2] = put_pixels_y2_axp;
    put_pixels_tab[3] = put_pixels_xy2_axp;

    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

    avg_pixels_tab[0] = avg_pixels_axp;
    avg_pixels_tab[1] = avg_pixels_x2_axp;
    avg_pixels_tab[2] = avg_pixels_y2_axp;
    avg_pixels_tab[3] = avg_pixels_xy2_axp;

    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;

    clear_blocks = clear_blocks_axp;

    /* amask clears all bits that correspond to present features. */
    if (amask(AMASK_MVI) == 0) {
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
}