/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/attributes.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_alpha.h"
#include "asm.h"
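
/* SWAR (SIMD within a register) byte-wise averages of eight packed
 * pixels.  Since a + b == (a & b) * 2 + (a ^ b), the truncating average
 * ((a + b) >> 1 per byte) can be computed without a ninth carry bit;
 * masking the XOR with 0xfe keeps the bit shifted out of each byte from
 * leaking into the byte below it. */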
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
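
/* Rounding variant: a + b == (a | b) * 2 - (a ^ b), so this computes
 * (a + b + 1) >> 1 per byte.  E.g. per-byte avg2(0x03, 0x04) == 0x04,
 * while avg2_no_rnd() gives 0x03. */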
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
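
/* The OP* macros below expand to the loop body for one half-pel case:
 * OP copies whole lines, OP_X2 averages each pixel with its right
 * neighbour, OP_Y2 with the pixel one line down, and OP_XY2 with all
 * four surrounding pixels.  LOAD is an aligned (ldq) or unaligned
 * (uldq) 8-byte load; STORE either writes the result or averages it
 * into the destination. */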
#define OP(LOAD, STORE)                         \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block  += line_size;                    \
    } while (--h)
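
/* Alpha is little-endian: shifting the quadword right by 8 bits and
 * inserting pixels[8] at the top reproduces the same eight pixels
 * offset one to the right, so the horizontal case costs only one
 * extra byte load per line. */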
#define OP_X2(LOAD, STORE)                                      \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block  += line_size;                                    \
    } while (--h)
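
/* The vertical case carries the previous line's pixels across loop
 * iterations, so each output line costs one load instead of two. */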
#define OP_Y2(LOAD, STORE)                      \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += line_size;                 \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)
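
/* Four-pixel average along the lines of the disabled avg4() above:
 * each line's horizontal pair sum is kept split into the low two bits
 * (pix_l) and the pre-shifted high six bits (pix_h) of every byte, so
 * adding the corresponding sums of two lines cannot carry between
 * bytes.  The previous line's partial sums are reused, halving the
 * work per line. */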
#define OP_XY2(LOAD, STORE)                                                 \
    do {                                                                    \
        uint64_t pix1  = LOAD(pixels);                                      \
        uint64_t pix2  = pix1 >> 8 | ((uint64_t) pixels[8] << 56);          \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
                       + (pix2 & BYTE_VEC(0x03));                           \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
                                                                            \
        do {                                                                \
            uint64_t npix1, npix2;                                          \
            uint64_t npix_l, npix_h;                                        \
            uint64_t avg;                                                   \
                                                                            \
            pixels += line_size;                                            \
            npix1  = LOAD(pixels);                                          \
            npix2  = npix1 >> 8 | ((uint64_t) pixels[8] << 56);             \
            npix_l = (npix1 & BYTE_VEC(0x03))                               \
                   + (npix2 & BYTE_VEC(0x03));                              \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                       \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                      \
            avg    = (((pix_l + npix_l + AVG4_ROUNDER) >> 2)                \
                      & BYTE_VEC(0x03))                                     \
                   + pix_h + npix_h;                                        \
            STORE(avg, block);                                              \
                                                                            \
            block += line_size;                                             \
            pix_l  = npix_l;                                                \
            pix_h  = npix_h;                                                \
        } while (--h);                                                      \
    } while (0)
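
/* Instantiate an 8-pixel-wide function that dispatches at run time
 * between unaligned (uldq) and aligned (ldq) loads, plus a
 * 16-pixel-wide wrapper that processes the two 8-pixel halves
 * separately. */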
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
static void OPNAME ## _pixels ## SUFF ## _axp                               \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    if ((size_t) pixels & 0x7) {                                            \
        OPKIND(uldq, STORE);                                                \
    } else {                                                                \
        OPKIND(ldq, STORE);                                                 \
    }                                                                       \
}                                                                           \
                                                                            \
static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
         ptrdiff_t line_size, int h)                                        \
{                                                                           \
    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}
#define PIXOP(OPNAME, STORE)                    \
    MAKE_OP(OPNAME,     , OP,     STORE)        \
    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)

/* Rounding primitives. */
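/* With these definitions the x2/y2 cases compute (a + b + 1) >> 1 and
 * the xy2 case (a + b + c + d + 2) >> 2; the no-rounding variants
 * below use (a + b) >> 1 and (a + b + c + d + 1) >> 2 instead. */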
#define AVG2         avg2
#define AVG4         avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b)  stq(l, b)
PIXOP(put, STORE);

#undef STORE
#define STORE(l, b)  stq(AVG2(l, ldq(b)), b)
PIXOP(avg, STORE);

/* Non-rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2         avg2_no_rnd
#define AVG4         avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b)  stq(l, b)
PIXOP(put_no_rnd, STORE);

#undef STORE
#define STORE(l, b)  stq(AVG2(l, ldq(b)), b)
PIXOP(avg_no_rnd, STORE);
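
/* put_pixels_axp_asm() is the hand-written assembly version of the
 * plain 8-pixel copy, declared in hpeldsp_alpha.h; widen it to
 * 16 pixels by running it on both halves. */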
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    put_pixels_axp_asm(block,     pixels,     line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}
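
/* Table layout: the first index selects the block width (0 = 16
 * pixels, 1 = 8 pixels) and the second the half-pel position
 * (0 = full pel, 1 = x2, 2 = y2, 3 = xy2); avg_no_rnd_pixels_tab only
 * exists for 16-pixel blocks. */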
av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
{
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;

    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;

    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;

    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;

    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;

    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;

    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
}