You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

306 lines
11KB

  1. /*
  2. * ARMv4L optimized DSP utils
  3. * Copyright (c) 2001 Lionel Ulmer.
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavcodec/dsputil.h"
  22. #ifdef HAVE_IPP
  23. #include <ipp.h>
  24. #endif
  25. extern void dsputil_init_iwmmxt(DSPContext* c, AVCodecContext *avctx);
  26. extern void ff_float_init_arm_vfp(DSPContext* c, AVCodecContext *avctx);
  27. extern void j_rev_dct_ARM(DCTELEM *data);
  28. extern void simple_idct_ARM(DCTELEM *data);
  29. extern void simple_idct_armv5te(DCTELEM *data);
  30. extern void simple_idct_put_armv5te(uint8_t *dest, int line_size,
  31. DCTELEM *data);
  32. extern void simple_idct_add_armv5te(uint8_t *dest, int line_size,
  33. DCTELEM *data);
  34. extern void ff_simple_idct_armv6(DCTELEM *data);
  35. extern void ff_simple_idct_put_armv6(uint8_t *dest, int line_size,
  36. DCTELEM *data);
  37. extern void ff_simple_idct_add_armv6(uint8_t *dest, int line_size,
  38. DCTELEM *data);
  39. /* XXX: local hack */
  40. static void (*ff_put_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
  41. static void (*ff_add_pixels_clamped)(const DCTELEM *block, uint8_t *pixels, int line_size);
  42. void put_pixels8_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  43. void put_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  44. void put_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  45. void put_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  46. void put_no_rnd_pixels8_x2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  47. void put_no_rnd_pixels8_y2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  48. void put_no_rnd_pixels8_xy2_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  49. void put_pixels16_arm(uint8_t *block, const uint8_t *pixels, int line_size, int h);
  50. CALL_2X_PIXELS(put_pixels16_x2_arm , put_pixels8_x2_arm , 8)
  51. CALL_2X_PIXELS(put_pixels16_y2_arm , put_pixels8_y2_arm , 8)
  52. CALL_2X_PIXELS(put_pixels16_xy2_arm, put_pixels8_xy2_arm, 8)
  53. CALL_2X_PIXELS(put_no_rnd_pixels16_x2_arm , put_no_rnd_pixels8_x2_arm , 8)
  54. CALL_2X_PIXELS(put_no_rnd_pixels16_y2_arm , put_no_rnd_pixels8_y2_arm , 8)
  55. CALL_2X_PIXELS(put_no_rnd_pixels16_xy2_arm, put_no_rnd_pixels8_xy2_arm, 8)
  /*
   * Add an 8x8 block of 16-bit coefficients to the pixels at dest and store
   * the sums back, saturating each byte.
   *
   * The loop runs 8 times (counter in r10).  Each pass reads 8 coefficients
   * (byte offsets 0..14 from block, then block advances by 16 bytes) and
   * 8 destination bytes (two 32-bit words at dest and dest + 4), then steps
   * dest by line_size.
   *
   * For every pixel: sum = coefficient + pixel.  If bit 8 of the sum is set,
   * the result is replaced by (~coefficient) >> 24 (logical shift), which is
   * 0xFF for a non-negative coefficient and 0x00 for a negative one.
   *
   * The "[A]".."[F]" markers show where an instruction originally sat before
   * it was moved to the place tagged "moved from [X]".
   *
   * NOTE(review): only bit 8 of the sum is tested, so this appears to rely on
   * coefficients staying within about [-256, 255] -- verify against callers.
   * NOTE(review): dest is accessed as 32-bit words with the lowest byte paired
   * with block[0]; presumably this needs a word-aligned dest and a
   * little-endian layout -- confirm.
   */
  56. static void add_pixels_clamped_ARM(short *block, unsigned char *dest, int line_size)
  57. {
  58. __asm__ volatile (
  59. "mov r10, #8 \n\t" /* r10 = rows left to process */
  60. "1: \n\t"
  61. /* load dest */
  62. "ldr r4, [%1] \n\t" /* r4 = first four destination bytes */
  63. /* block[0] and block[1]*/
  64. "ldrsh r5, [%0] \n\t"
  65. "ldrsh r7, [%0, #2] \n\t"
  66. "and r6, r4, #0xFF \n\t"
  67. "and r8, r4, #0xFF00 \n\t"
  68. "add r6, r5, r6 \n\t"
  69. "add r8, r7, r8, lsr #8 \n\t"
  70. "mvn r5, r5 \n\t" /* ~coefficient: top byte gives the saturated value */
  71. "mvn r7, r7 \n\t"
  72. "tst r6, #0x100 \n\t" /* bit 8 set => sum left the 0..255 range */
  73. "movne r6, r5, lsr #24 \n\t"
  74. "tst r8, #0x100 \n\t"
  75. "movne r8, r7, lsr #24 \n\t"
  76. "mov r9, r6 \n\t" /* r9 accumulates the packed output word */
  77. "ldrsh r5, [%0, #4] \n\t" /* moved from [A] */
  78. "orr r9, r9, r8, lsl #8 \n\t"
  79. /* block[2] and block[3] */
  80. /* [A] */
  81. "ldrsh r7, [%0, #6] \n\t"
  82. "and r6, r4, #0xFF0000 \n\t"
  83. "and r8, r4, #0xFF000000 \n\t"
  84. "add r6, r5, r6, lsr #16 \n\t"
  85. "add r8, r7, r8, lsr #24 \n\t"
  86. "mvn r5, r5 \n\t"
  87. "mvn r7, r7 \n\t"
  88. "tst r6, #0x100 \n\t"
  89. "movne r6, r5, lsr #24 \n\t"
  90. "tst r8, #0x100 \n\t"
  91. "movne r8, r7, lsr #24 \n\t"
  92. "orr r9, r9, r6, lsl #16 \n\t"
  93. "ldr r4, [%1, #4] \n\t" /* moved from [B] */
  94. "orr r9, r9, r8, lsl #24 \n\t"
  95. /* store dest */
  96. "ldrsh r5, [%0, #8] \n\t" /* moved from [C] */
  97. "str r9, [%1] \n\t"
  98. /* load dest */
  99. /* [B] */
  100. /* block[4] and block[5] */
  101. /* [C] */
  102. "ldrsh r7, [%0, #10] \n\t"
  103. "and r6, r4, #0xFF \n\t"
  104. "and r8, r4, #0xFF00 \n\t"
  105. "add r6, r5, r6 \n\t"
  106. "add r8, r7, r8, lsr #8 \n\t"
  107. "mvn r5, r5 \n\t"
  108. "mvn r7, r7 \n\t"
  109. "tst r6, #0x100 \n\t"
  110. "movne r6, r5, lsr #24 \n\t"
  111. "tst r8, #0x100 \n\t"
  112. "movne r8, r7, lsr #24 \n\t"
  113. "mov r9, r6 \n\t"
  114. "ldrsh r5, [%0, #12] \n\t" /* moved from [D] */
  115. "orr r9, r9, r8, lsl #8 \n\t"
  116. /* block[6] and block[7] */
  117. /* [D] */
  118. "ldrsh r7, [%0, #14] \n\t"
  119. "and r6, r4, #0xFF0000 \n\t"
  120. "and r8, r4, #0xFF000000 \n\t"
  121. "add r6, r5, r6, lsr #16 \n\t"
  122. "add r8, r7, r8, lsr #24 \n\t"
  123. "mvn r5, r5 \n\t"
  124. "mvn r7, r7 \n\t"
  125. "tst r6, #0x100 \n\t"
  126. "movne r6, r5, lsr #24 \n\t"
  127. "tst r8, #0x100 \n\t"
  128. "movne r8, r7, lsr #24 \n\t"
  129. "orr r9, r9, r6, lsl #16 \n\t"
  130. "add %0, %0, #16 \n\t" /* moved from [E] */
  131. "orr r9, r9, r8, lsl #24 \n\t"
  132. "subs r10, r10, #1 \n\t" /* moved from [F] */
  133. /* store dest */
  134. "str r9, [%1, #4] \n\t"
  135. /* [E] */
  136. /* [F] */
  137. "add %1, %1, %2 \n\t" /* advance dest by line_size; flags from subs are untouched */
  138. "bne 1b \n\t"
  139. : "+r"(block),
  140. "+r"(dest)
  141. : "r"(line_size)
  142. : "r4", "r5", "r6", "r7", "r8", "r9", "r10", "cc", "memory" );
  143. }
  144. /* XXX: these wrapper functions should be removed as soon as all IDCTs are
  145. converted */
  146. static void j_rev_dct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
  147. {
  148. j_rev_dct_ARM (block);
  149. ff_put_pixels_clamped(block, dest, line_size);
  150. }
  151. static void j_rev_dct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
  152. {
  153. j_rev_dct_ARM (block);
  154. ff_add_pixels_clamped(block, dest, line_size);
  155. }
  156. static void simple_idct_ARM_put(uint8_t *dest, int line_size, DCTELEM *block)
  157. {
  158. simple_idct_ARM (block);
  159. ff_put_pixels_clamped(block, dest, line_size);
  160. }
  161. static void simple_idct_ARM_add(uint8_t *dest, int line_size, DCTELEM *block)
  162. {
  163. simple_idct_ARM (block);
  164. ff_add_pixels_clamped(block, dest, line_size);
  165. }
  166. #ifdef HAVE_IPP
  167. static void simple_idct_ipp(DCTELEM *block)
  168. {
  169. ippiDCT8x8Inv_Video_16s_C1I(block);
  170. }
  171. static void simple_idct_ipp_put(uint8_t *dest, int line_size, DCTELEM *block)
  172. {
  173. ippiDCT8x8Inv_Video_16s8u_C1R(block, dest, line_size);
  174. }
  175. void add_pixels_clamped_iwmmxt(const DCTELEM *block, uint8_t *pixels, int line_size);
  176. static void simple_idct_ipp_add(uint8_t *dest, int line_size, DCTELEM *block)
  177. {
  178. ippiDCT8x8Inv_Video_16s_C1I(block);
  179. #ifdef HAVE_IWMMXT
  180. add_pixels_clamped_iwmmxt(block, dest, line_size);
  181. #else
  182. add_pixels_clamped_ARM(block, dest, line_size);
  183. #endif
  184. }
  185. #endif
  186. #ifdef HAVE_ARMV5TE
  /**
   * Issue "pld" preload hints for a run of rows.
   *
   * Executes pld on mem, then advances mem by stride, repeating while the
   * row counter stays above zero.  The counter is decremented before it is
   * tested, so one pld is always issued, even when h <= 0.
   *
   * @param mem    address of the first row to preload
   * @param stride value added to the address after each row
   * @param h      number of rows
   */
  static void prefetch_arm(void *mem, int stride, int h)
  {
      __asm__ volatile(
          "1: \n\t"
          "subs %0, %0, #1 \n\t"
          "pld [%1] \n\t"
          "add %1, %1, %2 \n\t"
          "bgt 1b \n\t"
          : "+r"(h), "+r"(mem)
          : "r"(stride)
          /* subs rewrites the condition flags; without this clobber the
           * compiler may assume flags it set earlier are still valid. */
          : "cc");
  }
  197. #endif
  198. int mm_support(void)
  199. {
  200. return ENABLE_IWMMXT * MM_IWMMXT;
  201. }
  202. void dsputil_init_armv4l(DSPContext* c, AVCodecContext *avctx)
  203. {
  204. int idct_algo= avctx->idct_algo;
  205. ff_put_pixels_clamped = c->put_pixels_clamped;
  206. ff_add_pixels_clamped = c->add_pixels_clamped;
  207. if (avctx->lowres == 0) {
  208. if(idct_algo == FF_IDCT_AUTO){
  209. #if defined(HAVE_IPP)
  210. idct_algo = FF_IDCT_IPP;
  211. #elif defined(HAVE_ARMV6)
  212. idct_algo = FF_IDCT_SIMPLEARMV6;
  213. #elif defined(HAVE_ARMV5TE)
  214. idct_algo = FF_IDCT_SIMPLEARMV5TE;
  215. #else
  216. idct_algo = FF_IDCT_ARM;
  217. #endif
  218. }
  219. if(idct_algo==FF_IDCT_ARM){
  220. c->idct_put= j_rev_dct_ARM_put;
  221. c->idct_add= j_rev_dct_ARM_add;
  222. c->idct = j_rev_dct_ARM;
  223. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;/* FF_NO_IDCT_PERM */
  224. } else if (idct_algo==FF_IDCT_SIMPLEARM){
  225. c->idct_put= simple_idct_ARM_put;
  226. c->idct_add= simple_idct_ARM_add;
  227. c->idct = simple_idct_ARM;
  228. c->idct_permutation_type= FF_NO_IDCT_PERM;
  229. #ifdef HAVE_ARMV6
  230. } else if (idct_algo==FF_IDCT_SIMPLEARMV6){
  231. c->idct_put= ff_simple_idct_put_armv6;
  232. c->idct_add= ff_simple_idct_add_armv6;
  233. c->idct = ff_simple_idct_armv6;
  234. c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
  235. #endif
  236. #ifdef HAVE_ARMV5TE
  237. } else if (idct_algo==FF_IDCT_SIMPLEARMV5TE){
  238. c->idct_put= simple_idct_put_armv5te;
  239. c->idct_add= simple_idct_add_armv5te;
  240. c->idct = simple_idct_armv5te;
  241. c->idct_permutation_type = FF_NO_IDCT_PERM;
  242. #endif
  243. #ifdef HAVE_IPP
  244. } else if (idct_algo==FF_IDCT_IPP){
  245. c->idct_put= simple_idct_ipp_put;
  246. c->idct_add= simple_idct_ipp_add;
  247. c->idct = simple_idct_ipp;
  248. c->idct_permutation_type= FF_NO_IDCT_PERM;
  249. #endif
  250. }
  251. }
  252. c->put_pixels_tab[0][0] = put_pixels16_arm;
  253. c->put_pixels_tab[0][1] = put_pixels16_x2_arm; //OK!
  254. c->put_pixels_tab[0][2] = put_pixels16_y2_arm; //OK!
  255. c->put_pixels_tab[0][3] = put_pixels16_xy2_arm;
  256. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_arm;
  257. c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_arm; // OK
  258. c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_arm; //OK
  259. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_arm;
  260. c->put_pixels_tab[1][0] = put_pixels8_arm; //OK
  261. c->put_pixels_tab[1][1] = put_pixels8_x2_arm; //OK
  262. c->put_pixels_tab[1][2] = put_pixels8_y2_arm;
  263. c->put_pixels_tab[1][3] = put_pixels8_xy2_arm;
  264. c->put_no_rnd_pixels_tab[1][0] = put_pixels8_arm;//OK
  265. c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_arm; //OK
  266. c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_arm; //OK
  267. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_arm;
  268. #ifdef HAVE_ARMV5TE
  269. c->prefetch = prefetch_arm;
  270. #endif
  271. #ifdef HAVE_IWMMXT
  272. dsputil_init_iwmmxt(c, avctx);
  273. #endif
  274. #ifdef HAVE_ARMVFP
  275. ff_float_init_arm_vfp(c, avctx);
  276. #endif
  277. }