/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"

/* dst[i] = src[i] * mul; cvtpi2ps converts two int32 values at a time, so
 * four loads plus two movlhps yield eight output floats per iteration. */
static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4              \n"
        "shufps $0, %%xmm4, %%xmm4      \n"
        "1:                             \n"
        "cvtpi2ps   (%2,%0), %%xmm0     \n"
        "cvtpi2ps  8(%2,%0), %%xmm1     \n"
        "cvtpi2ps 16(%2,%0), %%xmm2     \n"
        "cvtpi2ps 24(%2,%0), %%xmm3     \n"
        "movlhps  %%xmm1,    %%xmm0     \n"
        "movlhps  %%xmm3,    %%xmm2     \n"
        "mulps    %%xmm4,    %%xmm0     \n"
        "mulps    %%xmm4,    %%xmm2     \n"
        "movaps   %%xmm0,   (%1,%0)     \n"
        "movaps   %%xmm2, 16(%1,%0)     \n"
        "add $32, %0                    \n"
        "jl 1b                          \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4          \n"
        "shufps $0, %%xmm4, %%xmm4  \n"
        "1:                         \n"
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm1 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm1, 16(%1,%0) \n"
        "add $32, %0                \n"
        "jl 1b                      \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}
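
/* Illustrative only (not in the original file, kept out of the build with
 * #if 0): a plain-C sketch of what the two routines above are assumed to
 * compute, namely converting each int32 sample to float and scaling it by
 * mul. The asm versions additionally assume suitably aligned buffers and a
 * len that is a multiple of 8. */
#if 0
static void int32_to_float_fmul_scalar_ref(float *dst, const int *src,
                                           float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
#endif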

static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea        (%2,%0,2)   , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "pf2id      (%2,%0,2)   , %%mm0     \n\t"
        "pf2id      8(%2,%0,2)  , %%mm1     \n\t"
        "pf2id      16(%2,%0,2) , %%mm2     \n\t"
        "pf2id      24(%2,%0,2) , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       , (%1,%0)   \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "femms                              \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea        (%2,%0,2)   , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2pi   (%2,%0,2)   , %%mm0     \n\t"
        "cvtps2pi   8(%2,%0,2)  , %%mm1     \n\t"
        "cvtps2pi   16(%2,%0,2) , %%mm2     \n\t"
        "cvtps2pi   24(%2,%0,2) , %%mm3     \n\t"
        "packssdw   %%mm1       , %%mm0     \n\t"
        "packssdw   %%mm3       , %%mm2     \n\t"
        "movq       %%mm0       , (%1,%0)   \n\t"
        "movq       %%mm2       , 8(%1,%0)  \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        "emms                               \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

static void float_to_int16_sse2(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add        %0          , %0        \n\t"
        "lea        (%2,%0,2)   , %2        \n\t"
        "add        %0          , %1        \n\t"
        "neg        %0                      \n\t"
        "1:                                 \n\t"
        "cvtps2dq   (%2,%0,2)   , %%xmm0    \n\t"
        "cvtps2dq   16(%2,%0,2) , %%xmm1    \n\t"
        "packssdw   %%xmm1      , %%xmm0    \n\t"
        "movdqa     %%xmm0      , (%1,%0)   \n\t"
        "add        $16         , %0        \n\t"
        " js 1b                             \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}
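
/* Illustrative only (not in the original file, kept out of the build with
 * #if 0): a plain-C sketch of the float_to_int16 contract the three SIMD
 * versions above implement: round each float to an integer and saturate it
 * to the int16_t range (packssdw saturates; pf2id rounds differently, hence
 * the bit-exact note above). Assumes av_clip_int16() from libavutil and
 * lrintf() from <math.h>. */
#if 0
#include <math.h>
#include "libavutil/common.h"

static void float_to_int16_ref(int16_t *dst, const float *src, long len)
{
    long i;
    for (i = 0; i < len; i++)
        dst[i] = av_clip_int16(lrintf(src[i]));
}
#endif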

void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len);

#if !HAVE_YASM
/* Without yasm the dedicated 6-channel routines are unavailable; fall back
 * to the generic misc interleave defined by the macro below. */
#define ff_float_to_int16_interleave6_sse(a,b,c)   float_to_int16_interleave_misc_sse(a,b,c,6)
#define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6)
#define ff_float_to_int16_interleave6_3dn2(a,b,c)  float_to_int16_interleave_misc_3dnow(a,b,c,6)
#endif
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse

#define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
    DECLARE_ALIGNED(16, int16_t, tmp)[len];\
    int i,j,c;\
    for(c=0; c<channels; c++){\
        float_to_int16_##cpu(tmp, src[c], len);\
        for(i=0, j=c; i<len; i++, j+=channels)\
            dst[j] = tmp[i];\
    }\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
    if(channels==1)\
        float_to_int16_##cpu(dst, src[0], len);\
    else if(channels==2){\
        x86_reg reglen = len; \
        const float *src0 = src[0];\
        const float *src1 = src[1];\
        __asm__ volatile(\
            "shl $2, %0 \n"\
            "add %0, %1 \n"\
            "add %0, %2 \n"\
            "add %0, %3 \n"\
            "neg %0 \n"\
            body\
            :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\
        );\
    }else if(channels==6){\
        ff_float_to_int16_interleave6_##cpu(dst, src, len);\
    }else\
        float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}

FLOAT_TO_INT16_INTERLEAVE(3dnow,
    "1:                         \n"
    "pf2id      (%2,%0), %%mm0  \n"
    "pf2id     8(%2,%0), %%mm1  \n"
    "pf2id      (%3,%0), %%mm2  \n"
    "pf2id     8(%3,%0), %%mm3  \n"
    "packssdw  %%mm1,    %%mm0  \n"
    "packssdw  %%mm3,    %%mm2  \n"
    "movq      %%mm0,    %%mm1  \n"
    "punpcklwd %%mm2,    %%mm0  \n"
    "punpckhwd %%mm2,    %%mm1  \n"
    "movq      %%mm0,   (%1,%0) \n"
    "movq      %%mm1,  8(%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "femms                      \n"
)

FLOAT_TO_INT16_INTERLEAVE(sse,
    "1:                         \n"
    "cvtps2pi   (%2,%0), %%mm0  \n"
    "cvtps2pi  8(%2,%0), %%mm1  \n"
    "cvtps2pi   (%3,%0), %%mm2  \n"
    "cvtps2pi  8(%3,%0), %%mm3  \n"
    "packssdw  %%mm1,    %%mm0  \n"
    "packssdw  %%mm3,    %%mm2  \n"
    "movq      %%mm0,    %%mm1  \n"
    "punpcklwd %%mm2,    %%mm0  \n"
    "punpckhwd %%mm2,    %%mm1  \n"
    "movq      %%mm0,   (%1,%0) \n"
    "movq      %%mm1,  8(%1,%0) \n"
    "add $16, %0                \n"
    "js 1b                      \n"
    "emms                       \n"
)

FLOAT_TO_INT16_INTERLEAVE(sse2,
    "1:                         \n"
    "cvtps2dq  (%2,%0), %%xmm0  \n"
    "cvtps2dq  (%3,%0), %%xmm1  \n"
    "packssdw  %%xmm1,  %%xmm0  \n"
    "movhlps   %%xmm0,  %%xmm1  \n"
    "punpcklwd %%xmm1,  %%xmm0  \n"
    "movdqa    %%xmm0, (%1,%0)  \n"
    "add $16, %0                \n"
    "js 1b                      \n"
)

static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){
    if(channels==6)
        ff_float_to_int16_interleave6_3dn2(dst, src, len);
    else
        float_to_int16_interleave_3dnow(dst, src, len, channels);
}
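
/* Illustrative only (not in the original file, kept out of the build with
 * #if 0): the interleave routines generated above take one float buffer per
 * channel and write channel-interleaved int16 output, roughly as sketched
 * here. The 2- and 6-channel cases get dedicated asm; other channel counts
 * go through float_to_int16_interleave_misc_*(). av_clip_int16()/lrintf()
 * stand in for the conversion step. */
#if 0
static void float_to_int16_interleave_ref(int16_t *dst, const float **src,
                                          long len, int channels)
{
    long i;
    int  c;
    for (c = 0; c < channels; c++)
        for (i = 0; i < len; i++)
            dst[i * channels + c] = av_clip_int16(lrintf(src[c][i]));
}
#endif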

#if HAVE_YASM
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);

void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);

static void float_interleave_mmx(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_mmx(dst, src, len);
    } else if (channels == 6)
        ff_float_interleave6_mmx(dst, src, len);
    else
        ff_float_interleave_c(dst, src, len, channels);
}

static void float_interleave_sse(float *dst, const float **src,
                                 unsigned int len, int channels)
{
    if (channels == 2) {
        ff_float_interleave2_sse(dst, src, len);
    } else if (channels == 6)
        ff_float_interleave6_sse(dst, src, len);
    else
        ff_float_interleave_c(dst, src, len, channels);
}
#endif

void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();

    if (mm_flags & AV_CPU_FLAG_MMX) {
#if HAVE_YASM
        c->float_interleave = float_interleave_mmx;
#endif

        /* The checks run in ascending order of capability, so on an
         * SSE2-capable CPU the SSE2 pointers override the SSE ones set
         * just above. */
        if(mm_flags & AV_CPU_FLAG_3DNOW){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16            = float_to_int16_3dnow;
                c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
            }
        }
        if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->float_to_int16_interleave = float_to_int16_interleave_3dn2;
            }
        }
        if(mm_flags & AV_CPU_FLAG_SSE){
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse;
            c->float_to_int16             = float_to_int16_sse;
            c->float_to_int16_interleave  = float_to_int16_interleave_sse;
#if HAVE_YASM
            c->float_interleave           = float_interleave_sse;
#endif
        }
        if(mm_flags & AV_CPU_FLAG_SSE2){
            c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2;
            c->float_to_int16             = float_to_int16_sse2;
            c->float_to_int16_interleave  = float_to_int16_interleave_sse2;
        }
    }
}
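
/* Illustrative only (not in the original file, kept out of the build with
 * #if 0): how a caller is assumed to reach the versions selected above.
 * ff_fmt_convert_init() sets the C defaults and then lets the per-arch init
 * override them, after which the decoder simply calls through the function
 * pointers. The 1.0f/(1<<24) scale factor is a made-up example value. */
#if 0
static void convert_block_example(AVCodecContext *avctx,
                                  float *out, const int *in, int len)
{
    FmtConvertContext fmt;
    ff_fmt_convert_init(&fmt, avctx);   /* picks C, 3DNow!, SSE or SSE2 */
    fmt.int32_to_float_fmul_scalar(out, in, 1.0f / (1 << 24), len);
}
#endif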