You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

240 lines
8.1KB

  1. /*
  2. * Format Conversion Utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This file is part of Libav.
  7. *
  8. * Libav is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * Libav is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with Libav; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. *
  22. * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
  23. */
  24. #include "libavutil/cpu.h"
  25. #include "libavutil/x86_cpu.h"
  26. #include "libavcodec/fmtconvert.h"
  27. static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
  28. {
  29. x86_reg i = -4*len;
  30. __asm__ volatile(
  31. "movss %3, %%xmm4 \n"
  32. "shufps $0, %%xmm4, %%xmm4 \n"
  33. "1: \n"
  34. "cvtpi2ps (%2,%0), %%xmm0 \n"
  35. "cvtpi2ps 8(%2,%0), %%xmm1 \n"
  36. "cvtpi2ps 16(%2,%0), %%xmm2 \n"
  37. "cvtpi2ps 24(%2,%0), %%xmm3 \n"
  38. "movlhps %%xmm1, %%xmm0 \n"
  39. "movlhps %%xmm3, %%xmm2 \n"
  40. "mulps %%xmm4, %%xmm0 \n"
  41. "mulps %%xmm4, %%xmm2 \n"
  42. "movaps %%xmm0, (%1,%0) \n"
  43. "movaps %%xmm2, 16(%1,%0) \n"
  44. "add $32, %0 \n"
  45. "jl 1b \n"
  46. :"+r"(i)
  47. :"r"(dst+len), "r"(src+len), "m"(mul)
  48. );
  49. }
  50. static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
  51. {
  52. x86_reg i = -4*len;
  53. __asm__ volatile(
  54. "movss %3, %%xmm4 \n"
  55. "shufps $0, %%xmm4, %%xmm4 \n"
  56. "1: \n"
  57. "cvtdq2ps (%2,%0), %%xmm0 \n"
  58. "cvtdq2ps 16(%2,%0), %%xmm1 \n"
  59. "mulps %%xmm4, %%xmm0 \n"
  60. "mulps %%xmm4, %%xmm1 \n"
  61. "movaps %%xmm0, (%1,%0) \n"
  62. "movaps %%xmm1, 16(%1,%0) \n"
  63. "add $32, %0 \n"
  64. "jl 1b \n"
  65. :"+r"(i)
  66. :"r"(dst+len), "r"(src+len), "m"(mul)
  67. );
  68. }
  69. #if HAVE_YASM
  70. void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
  71. void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
  72. void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
  73. void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
  74. void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
  75. void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);
  76. #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
  77. #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
  78. /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
  79. static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
  80. DECLARE_ALIGNED(16, int16_t, tmp)[len];\
  81. int i,j,c;\
  82. for(c=0; c<channels; c++){\
  83. ff_float_to_int16_##cpu(tmp, src[c], len);\
  84. for(i=0, j=c; i<len; i++, j+=channels)\
  85. dst[j] = tmp[i];\
  86. }\
  87. }\
  88. \
  89. static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
  90. if(channels==1)\
  91. ff_float_to_int16_##cpu(dst, src[0], len);\
  92. else if(channels==2){\
  93. x86_reg reglen = len; \
  94. const float *src0 = src[0];\
  95. const float *src1 = src[1];\
  96. __asm__ volatile(\
  97. "shl $2, %0 \n"\
  98. "add %0, %1 \n"\
  99. "add %0, %2 \n"\
  100. "add %0, %3 \n"\
  101. "neg %0 \n"\
  102. body\
  103. :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
  104. );\
  105. }else if(channels==6){\
  106. ff_float_to_int16_interleave6_##cpu(dst, src, len);\
  107. }else\
  108. float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
  109. }
  110. FLOAT_TO_INT16_INTERLEAVE(3dnow,
  111. "1: \n"
  112. "pf2id (%2,%0), %%mm0 \n"
  113. "pf2id 8(%2,%0), %%mm1 \n"
  114. "pf2id (%3,%0), %%mm2 \n"
  115. "pf2id 8(%3,%0), %%mm3 \n"
  116. "packssdw %%mm1, %%mm0 \n"
  117. "packssdw %%mm3, %%mm2 \n"
  118. "movq %%mm0, %%mm1 \n"
  119. "punpcklwd %%mm2, %%mm0 \n"
  120. "punpckhwd %%mm2, %%mm1 \n"
  121. "movq %%mm0, (%1,%0)\n"
  122. "movq %%mm1, 8(%1,%0)\n"
  123. "add $16, %0 \n"
  124. "js 1b \n"
  125. "femms \n"
  126. )
  127. FLOAT_TO_INT16_INTERLEAVE(sse,
  128. "1: \n"
  129. "cvtps2pi (%2,%0), %%mm0 \n"
  130. "cvtps2pi 8(%2,%0), %%mm1 \n"
  131. "cvtps2pi (%3,%0), %%mm2 \n"
  132. "cvtps2pi 8(%3,%0), %%mm3 \n"
  133. "packssdw %%mm1, %%mm0 \n"
  134. "packssdw %%mm3, %%mm2 \n"
  135. "movq %%mm0, %%mm1 \n"
  136. "punpcklwd %%mm2, %%mm0 \n"
  137. "punpckhwd %%mm2, %%mm1 \n"
  138. "movq %%mm0, (%1,%0)\n"
  139. "movq %%mm1, 8(%1,%0)\n"
  140. "add $16, %0 \n"
  141. "js 1b \n"
  142. "emms \n"
  143. )
  144. FLOAT_TO_INT16_INTERLEAVE(sse2,
  145. "1: \n"
  146. "cvtps2dq (%2,%0), %%xmm0 \n"
  147. "cvtps2dq (%3,%0), %%xmm1 \n"
  148. "packssdw %%xmm1, %%xmm0 \n"
  149. "movhlps %%xmm0, %%xmm1 \n"
  150. "punpcklwd %%xmm1, %%xmm0 \n"
  151. "movdqa %%xmm0, (%1,%0) \n"
  152. "add $16, %0 \n"
  153. "js 1b \n"
  154. )
  155. static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
  156. if(channels==6)
  157. ff_float_to_int16_interleave6_3dn2(dst, src, len);
  158. else
  159. float_to_int16_interleave_3dnow(dst, src, len, channels);
  160. }
  161. void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
  162. void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
  163. void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
  164. void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
  165. static void float_interleave_mmx(float *dst, const float **src,
  166. unsigned int len, int channels)
  167. {
  168. if (channels == 2) {
  169. ff_float_interleave2_mmx(dst, src, len);
  170. } else if (channels == 6)
  171. ff_float_interleave6_mmx(dst, src, len);
  172. else
  173. ff_float_interleave_c(dst, src, len, channels);
  174. }
  175. static void float_interleave_sse(float *dst, const float **src,
  176. unsigned int len, int channels)
  177. {
  178. if (channels == 2) {
  179. ff_float_interleave2_sse(dst, src, len);
  180. } else if (channels == 6)
  181. ff_float_interleave6_sse(dst, src, len);
  182. else
  183. ff_float_interleave_c(dst, src, len, channels);
  184. }
  185. #endif
  186. void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
  187. {
  188. int mm_flags = av_get_cpu_flags();
  189. if (mm_flags & AV_CPU_FLAG_MMX) {
  190. #if HAVE_YASM
  191. c->float_interleave = float_interleave_mmx;
  192. if(mm_flags & AV_CPU_FLAG_3DNOW){
  193. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  194. c->float_to_int16 = ff_float_to_int16_3dnow;
  195. c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
  196. }
  197. }
  198. if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
  199. if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
  200. c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
  201. }
  202. }
  203. #endif
  204. if(mm_flags & AV_CPU_FLAG_SSE){
  205. c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
  206. #if HAVE_YASM
  207. c->float_to_int16 = ff_float_to_int16_sse;
  208. c->float_to_int16_interleave = float_to_int16_interleave_sse;
  209. c->float_interleave = float_interleave_sse;
  210. #endif
  211. }
  212. if(mm_flags & AV_CPU_FLAG_SSE2){
  213. c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
  214. #if HAVE_YASM
  215. c->float_to_int16 = ff_float_to_int16_sse2;
  216. c->float_to_int16_interleave = float_to_int16_interleave_sse2;
  217. #endif
  218. }
  219. }
  220. }