You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

263 lines
9.9KB

  1. /*
  2. * ARMv4L optimized DSP utils
  3. * Copyright (c) 2001 Lionel Ulmer.
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "../dsputil.h"
  22. #ifdef HAVE_IPP
  23. #include "ipp.h"
  24. #endif
  /*
   * Routines with no definition in the visible part of this file.
   *
   * dsputil_init_iwmmxt() is called at the end of dsputil_init_armv4l() when
   * HAVE_IWMMXT is defined.
   */
  25. extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
  /*
   * IDCTs that take only the coefficient block.  This file installs them as
   * c->idct, and its put/add wrappers call them on a block and then pass the
   * same block on to the clamped pixel routines.
   */
  26. extern void j_rev_dct_ARM(DCTELEM *data);
  27. extern void simple_idct_ARM(DCTELEM *data);
  28. extern void simple_idct_armv5te(DCTELEM *data);
  /*
   * ARMv5TE variants that take the destination directly; installed as
   * c->idct_put and c->idct_add when HAVE_ARMV5TE is defined.
   */
  29. extern void simple_idct_put_armv5te(uint8_t *dest, int line_size,
  30. DCTELEM *data);
  31. extern void simple_idct_add_armv5te(uint8_t *dest, int line_size,
  32. DCTELEM *data);
  33. /* XXX: local hack:
   * file-scope copies of c->put_pixels_clamped and c->add_pixels_clamped.
   * dsputil_init_armv4l() assigns them on every call, and the put/add IDCT
   * wrappers in this file call through them, so they must be set before any
   * of those wrappers runs. */
  34. static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
  35. static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
  /*
   * Prototypes for pixel routines that are not defined in the visible part of
   * this file (presumably implemented in a separate assembly source; confirm).
   * All share the signature (block, pixels, line_size, h).
   */
  36. void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  37. void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  38. void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  39. void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  40. void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  41. void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  42. void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  43. void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  /*
   * CALL_2X_PIXELS is not defined in this file.  The first argument of each
   * use names a function that dsputil_init_armv4l() later stores (or has
   * commented out) in the pixel tables.
   * NOTE(review): presumably the macro defines that 16-pixel-wide function by
   * calling the 8-pixel-wide one twice, the second time at an offset of 8;
   * confirm against the macro definition.
   */
  44. CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8)
  45. CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8)
  46. CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8)
  47. CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8)
  48. CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8)
  49. CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8)
  /*
   * Add an 8x8 block of signed 16-bit coefficients to 8-bit pixels in dest,
   * clamping each result, using inline ARM assembly.
   *
   * block:     coefficients; 8 are consumed per row and the pointer advances
   *            by 16 bytes per row, for 8 rows.
   * dest:      pixels; each row is read and written back as two 32-bit words
   *            (offsets 0 and 4).
   * line_size: byte offset added to dest after each row.
   *
   * Per pixel: sum = pixel + coefficient.  If bit 8 of the sum is set, the
   * result is replaced by (~coefficient) >> 24 (logical shift).  Because
   * ldrsh sign-extends, that value is 0x00 for a negative coefficient and
   * 0xFF otherwise.
   *
   * NOTE(review): only bit 8 of the sum is tested, so sums outside roughly
   * -256..511 would not be caught and their extra bits would be OR-ed into
   * neighbouring byte lanes; this appears to rely on the coefficient range,
   * confirm with the callers.
   * NOTE(review): the low byte lane of each loaded word is paired with the
   * lowest-indexed coefficient, which matches pixel order only on a
   * little-endian target; confirm.
   *
   * The "[A]" .. "[F]" markers show where an instruction logically belongs;
   * the matching "moved from [X]" instruction was placed earlier
   * (presumably for instruction scheduling).
   *
   * Clobbers r4-r10, the condition flags and memory.
   */
  50. static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
  51. {
  52. asm volatile (
  53. "mov r10, #8 \n\t" /* r10 = rows remaining */
  54. "1: \n\t" /* top of the per-row loop */
  55. /* load dest */
  56. "ldr r4, [%1] \n\t" /* r4 = first four pixels of the row */
  57. /* block[0] and block[1]*/
  58. "ldrsh r5, [%0] \n\t"
  59. "ldrsh r7, [%0, #2] \n\t"
  60. "and r6, r4, #0xFF \n\t"
  61. "and r8, r4, #0xFF00 \n\t"
  62. "add r6, r5, r6 \n\t"
  63. "add r8, r7, r8, lsr #8 \n\t"
  64. "mvn r5, r5 \n\t" /* ~coefficient, used only if clamping is needed */
  65. "mvn r7, r7 \n\t"
  66. "tst r6, #0x100 \n\t" /* bit 8 set: sum left the 0..255 range */
  67. "movne r6, r5, lsr #24 \n\t" /* replace with 0x00 or 0xFF */
  68. "tst r8, #0x100 \n\t"
  69. "movne r8, r7, lsr #24 \n\t"
  70. "mov r9, r6 \n\t" /* r9 accumulates the packed output word */
  71. "ldrsh r5, [%0, #4] \n\t" /* moved from [A] */
  72. "orr r9, r9, r8, lsl #8 \n\t"
  73. /* block[2] and block[3] */
  74. /* [A] */
  75. "ldrsh r7, [%0, #6] \n\t"
  76. "and r6, r4, #0xFF0000 \n\t"
  77. "and r8, r4, #0xFF000000 \n\t"
  78. "add r6, r5, r6, lsr #16 \n\t"
  79. "add r8, r7, r8, lsr #24 \n\t"
  80. "mvn r5, r5 \n\t"
  81. "mvn r7, r7 \n\t"
  82. "tst r6, #0x100 \n\t"
  83. "movne r6, r5, lsr #24 \n\t"
  84. "tst r8, #0x100 \n\t"
  85. "movne r8, r7, lsr #24 \n\t"
  86. "orr r9, r9, r6, lsl #16 \n\t"
  87. "ldr r4, [%1, #4] \n\t" /* moved from [B] */
  88. "orr r9, r9, r8, lsl #24 \n\t"
  89. /* store dest */
  90. "ldrsh r5, [%0, #8] \n\t" /* moved from [C] */
  91. "str r9, [%1] \n\t" /* write back the first four pixels */
  92. /* load dest */
  93. /* [B] */
  94. /* block[4] and block[5] */
  95. /* [C] */
  96. "ldrsh r7, [%0, #10] \n\t"
  97. "and r6, r4, #0xFF \n\t"
  98. "and r8, r4, #0xFF00 \n\t"
  99. "add r6, r5, r6 \n\t"
  100. "add r8, r7, r8, lsr #8 \n\t"
  101. "mvn r5, r5 \n\t"
  102. "mvn r7, r7 \n\t"
  103. "tst r6, #0x100 \n\t"
  104. "movne r6, r5, lsr #24 \n\t"
  105. "tst r8, #0x100 \n\t"
  106. "movne r8, r7, lsr #24 \n\t"
  107. "mov r9, r6 \n\t"
  108. "ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
  109. "orr r9, r9, r8, lsl #8 \n\t"
  110. /* block[6] and block[7] */
  111. /* [D] */
  112. "ldrsh r7, [%0, #14] \n\t"
  113. "and r6, r4, #0xFF0000 \n\t"
  114. "and r8, r4, #0xFF000000 \n\t"
  115. "add r6, r5, r6, lsr #16 \n\t"
  116. "add r8, r7, r8, lsr #24 \n\t"
  117. "mvn r5, r5 \n\t"
  118. "mvn r7, r7 \n\t"
  119. "tst r6, #0x100 \n\t"
  120. "movne r6, r5, lsr #24 \n\t"
  121. "tst r8, #0x100 \n\t"
  122. "movne r8, r7, lsr #24 \n\t"
  123. "orr r9, r9, r6, lsl #16 \n\t"
  124. "add %0, %0, #16 \n\t" /* moved from [E] */
  125. "orr r9, r9, r8, lsl #24 \n\t"
  /* The flags set by this subs are consumed by the bne below; the str and
   * add in between do not update the flags. */
  126. "subs r10, r10, #1 \n\t" /* moved from [F] */
  127. /* store dest */
  128. "str r9, [%1, #4] \n\t" /* write back the last four pixels */
  129. /* [E] */
  130. /* [F] */
  131. "add %1, %1, %2 \n\t" /* advance dest by line_size */
  132. "bne 1b \n\t"
  133. : "+r"(block),
  134. "+r"(dest)
  135. : "r"(line_size)
  136. : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
  137. }
  138. /* XXX: these functions should be removed as soon as all IDCTs are
  139. converted */
  140. static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
  141. {
  142. j_rev_dct_ARM (block);
  143. ff_put_pixels_clamped(block, dest, line_size);
  144. }
  145. static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
  146. {
  147. j_rev_dct_ARM (block);
  148. ff_add_pixels_clamped(block, dest, line_size);
  149. }
  150. static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
  151. {
  152. simple_idct_ARM (block);
  153. ff_put_pixels_clamped(block, dest, line_size);
  154. }
  155. static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
  156. {
  157. simple_idct_ARM (block);
  158. ff_add_pixels_clamped(block, dest, line_size);
  159. }
  160. #ifdef HAVE_IPP
  161. static void simple_idct_ipp(DCTELEM *block)
  162. {
  163. ippiDCT8x8Inv_Video_16s_C1I(block);
  164. }
  165. static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
  166. {
  167. ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
  168. }
  169. void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
  170. static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
  171. {
  172. ippiDCT8x8Inv_Video_16s_C1I(block);
  173. #ifdef HAVE_IWMMXT
  174. add_pixels_clamped_iwmmxt(block, dest, line_size);
  175. #else
  176. add_pixels_clamped_ARM(block, dest, line_size);
  177. #endif
  178. }
  179. #endif
  180. void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
  181. {
  182. int idct_algo= avctx->idct_algo;
  183. ff_put_pixels_clamped = c->put_pixels_clamped;
  184. ff_add_pixels_clamped = c->add_pixels_clamped;
  185. if(idct_algo == FF_IDCT_AUTO){
  186. #if defined(HAVE_IPP)
  187. idct_algo = FF_IDCT_IPP;
  188. #elif defined(HAVE_ARMV5TE)
  189. idct_algo = FF_IDCT_SIMPLEARMV5TE;
  190. #else
  191. idct_algo = FF_IDCT_ARM;
  192. #endif
  193. }
  194. if(idct_algo==FF_IDCT_ARM){
  195. c->idct_put= j_rev_dct_ARM_put;
  196. c->idct_add= j_rev_dct_ARM_add;
  197. c->idct = j_rev_dct_ARM;
  198. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
  199. } else if (idct_algo==FF_IDCT_SIMPLEARM){
  200. c->idct_put= simple_idct_ARM_put;
  201. c->idct_add= simple_idct_ARM_add;
  202. c->idct = simple_idct_ARM;
  203. c->idct_permutation_type= FF_NO_IDCT_PERM;
  204. #ifdef HAVE_ARMV5TE
  205. } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
  206. c->idct_put= simple_idct_put_armv5te;
  207. c->idct_add= simple_idct_add_armv5te;
  208. c->idct = simple_idct_armv5te;
  209. c->idct_permutation_type = FF_NO_IDCT_PERM;
  210. #endif
  211. #ifdef HAVE_IPP
  212. } else if (idct_algo==FF_IDCT_IPP){
  213. c->idct_put= simple_idct_ipp_put;
  214. c->idct_add= simple_idct_ipp_add;
  215. c->idct = simple_idct_ipp;
  216. c->idct_permutation_type= FF_NO_IDCT_PERM;
  217. #endif
  218. }
  219. /* c->put_pixels_tab[0][0] = put_pixels16_arm; */ // NG!
  220. c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
  221. c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
  222. /* c->put_pixels_tab[0][3] = put_pixels16_xy2_arm; /\* NG *\/ */
  223. /* c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm; */
  224. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
  225. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
  226. /* c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm; //NG */
  227. c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
  228. c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
  229. /* c->put_pixels_tab[1][2] = put_pixels8_y2_arm; //NG */
  230. /* c->put_pixels_tab[1][3] = put_pixels8_xy2_arm; //NG */
  231. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
  232. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
  233. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
  234. /* c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;//NG */
  235. #ifdef HAVE_IWMMXT
  236. dsputil_init_iwmmxt(c, avctx);
  237. #endif
  238. }