You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

368 lines
9.2KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86inc.asm"
  22. %include "libavutil/x86/x86util.asm"
  23. SECTION_TEXT
  ;---------------------------------------------------------------------------------
  ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
  ;
  ; Converts 32-bit integers to float and scales every element by mul:
  ;     dst[i] = (float)src[i] * mul
  ;
  ; Macro parameters:
  ;   %1  function-name suffix; "sse2" selects the cvtdq2ps path, any other
  ;       value selects the cvtpi2ps path
  ;   %2  XMM register count handed to cglobal
  ;
  ; Every loop iteration consumes and produces 32 bytes (8 elements) and the loop
  ; body runs at least once.
  ; NOTE(review): this implies len is a positive multiple of 8 -- confirm with
  ; the callers.
  ;---------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 2
  %if UNIX64
  ; No "mul" name here: on UNIX64 the scalar is consumed straight from m0.
  cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
  %else
  cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
  %endif
  %if WIN64
      SWAP      0, 2                      ; third argument slot (xmm2) becomes m0
  %elif ARCH_X86_32
      movss     m0, mulm                  ; fetch mul from its argument slot
  %endif
      SPLATD    m0                        ; replicate mul across m0
      ; len is a 32-bit int. Scaling the 32-bit view zero-extends the result into
      ; lenq, so undefined upper bits of the argument register can never leak
      ; into the pointer arithmetic below.
      shl       lend, 2                   ; element count -> byte count
      add       srcq, lenq                ; srcq = one past the end of src
      add       dstq, lenq                ; dstq = one past the end of dst
      neg       lenq                      ; negative byte offset, counts up to 0
  .loop:
  %ifidn %1, sse2
      cvtdq2ps  m1, [srcq+lenq   ]
      cvtdq2ps  m2, [srcq+lenq+16]
  %else
      ; cvtpi2ps converts two integers at a time; movlhps glues two such
      ; results together into one vector of four floats.
      cvtpi2ps  m1, [srcq+lenq   ]
      cvtpi2ps  m3, [srcq+lenq+ 8]
      cvtpi2ps  m2, [srcq+lenq+16]
      cvtpi2ps  m4, [srcq+lenq+24]
      movlhps   m1, m3
      movlhps   m2, m4
  %endif
      mulps     m1, m0
      mulps     m2, m0
      mova      [dstq+lenq   ], m1
      mova      [dstq+lenq+16], m2
      add       lenq, 32
      jl        .loop                     ; continue while the offset is negative
      REP_RET
  %endmacro

  INIT_XMM

  ; SSE variant. movdqa is remapped to movaps while this variant is assembled.
  ; NOTE(review): presumably because mova expands to movdqa under INIT_XMM and
  ; movdqa is not available with plain SSE -- confirm against x86inc.asm.
  %define SPLATD SPLATD_SSE
  %define movdqa movaps
  INT32_TO_FLOAT_FMUL_SCALAR sse, 5
  %undef movdqa

  ; SSE2 variant.
  %define SPLATD SPLATD_SSE2
  INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
  %undef SPLATD
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;
  ; Converts floats to 32-bit integers and packs them to int16 with signed
  ; saturation (packssdw).
  ;
  ; Macro parameters:
  ;   %1  function-name suffix; "sse2" selects the XMM path, any other value
  ;       the MMX-register path
  ;   %2  XMM register count handed to cglobal
  ;
  ; One iteration writes 16 bytes of dst (8 samples) and reads 32 bytes of src.
  ; The loop body runs at least once.
  ; NOTE(review): this implies len is a positive multiple of 8 -- confirm with
  ; the callers.
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16 2
  cglobal float_to_int16_%1, 3,3,%2, dst, src, len
      ; Move both pointers to the end of their buffers and walk a negative
      ; byte offset (relative to dst) up towards zero.
      lea       srcq, [srcq+4*lenq]       ; 4 bytes per input float
      lea       dstq, [dstq+2*lenq]       ; 2 bytes per output sample
      add       lenq, lenq
      neg       lenq                      ; lenq = -2*len
  .loop:
  %ifidn %1, sse2
      cvtps2dq  m0, [srcq+2*lenq   ]
      cvtps2dq  m1, [srcq+2*lenq+16]
      packssdw  m0, m1
      mova      [dstq+lenq], m0
  %else
      ; Two floats per conversion, four samples per packed register.
      cvtps2pi  m0, [srcq+2*lenq   ]
      cvtps2pi  m1, [srcq+2*lenq+ 8]
      packssdw  m0, m1
      cvtps2pi  m2, [srcq+2*lenq+16]
      cvtps2pi  m3, [srcq+2*lenq+24]
      packssdw  m2, m3
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m2
  %endif
      add       lenq, 16
      js        .loop                     ; continue while the offset is negative
  %ifnidn %1, sse2
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM
  FLOAT_TO_INT16 sse2, 2

  INIT_MMX
  FLOAT_TO_INT16 sse, 0

  ; The 3DNow! build reuses the MMX path with pf2id substituted for cvtps2pi.
  %define cvtps2pi pf2id
  FLOAT_TO_INT16 3dnow, 0
  %undef cvtps2pi
  ;-------------------------------------------------------------------------------
  ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  ;
  ; Converts two float channels (src[0], src[1]) to int16 with signed
  ; saturation and writes them interleaved: s0[0] s1[0] s0[1] s1[1] ...
  ;
  ; Macro parameter:
  ;   %1  function-name suffix; "sse2" selects the XMM path, any other value
  ;       the MMX-register path
  ;
  ; One iteration handles 16 bytes of each source (4 floats per channel) and
  ; writes 16 bytes of dst. The loop body runs at least once.
  ; NOTE(review): this implies len is a positive multiple of 4 -- confirm with
  ; the callers.
  ;-------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE2 1
  cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
      ; The third argument (len) arrives in r2, which is also named src1:
      ; derive the byte count from it before src1q is loaded.
      lea       lenq, [4*r2q]
      mov       src1q, [src0q+gprsize]    ; src[1]
      mov       src0q, [src0q]            ; src[0]
      ; Advance all three pointers to their ends; lenq becomes a negative offset.
      add       src0q, lenq
      add       src1q, lenq
      add       dstq, lenq
      neg       lenq
  .loop:
  %ifidn %1, sse2
      cvtps2dq  m0, [src0q+lenq]
      cvtps2dq  m1, [src1q+lenq]
      packssdw  m0, m1                    ; low half: channel 0, high half: channel 1
      movhlps   m1, m0                    ; bring channel 1 down to the low half
      punpcklwd m0, m1                    ; interleave the two channels
      mova      [dstq+lenq], m0
  %else
      cvtps2pi  m0, [src0q+lenq  ]
      cvtps2pi  m1, [src0q+lenq+8]
      packssdw  m0, m1                    ; four samples of channel 0
      cvtps2pi  m2, [src1q+lenq  ]
      cvtps2pi  m3, [src1q+lenq+8]
      packssdw  m2, m3                    ; four samples of channel 1
      mova      m1, m0
      punpcklwd m0, m2                    ; samples 0-1 of both channels
      punpckhwd m1, m2                    ; samples 2-3 of both channels
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m1
  %endif
      add       lenq, 16
      js        .loop                     ; continue while the offset is negative
  %ifnidn %1, sse2
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  ; 3DNow! build: pf2id stands in for cvtps2pi.
  %define cvtps2pi pf2id
  FLOAT_TO_INT16_INTERLEAVE2 3dnow
  %undef cvtps2pi

  %define movdqa movaps
  FLOAT_TO_INT16_INTERLEAVE2 sse
  %undef movdqa

  INIT_XMM
  FLOAT_TO_INT16_INTERLEAVE2 sse2
  ; PSWAPD_SSE dst, src
  ; Stand-in for pswapd built on pshufw: selector 0x4e picks source words
  ; 2,3,0,1, i.e. dst receives src with its two 32-bit halves exchanged.
  %macro PSWAPD_SSE 2
      pshufw    %1, %2, 0x4e
  %endmacro
  ; PSWAPD_3DN1 dst, src
  ; Stand-in for pswapd using only MMX instructions: dst receives src with its
  ; two 32-bit halves exchanged. dst is written before src is read for the
  ; second time, so the result is only a swap when dst and src differ.
  %macro PSWAPD_3DN1 2
      movq      %1, %2                    ; dst = src
      psrlq     %1, 32                    ; dst low = src high, dst high = 0
      punpckldq %1, %2                    ; dst high = src low
  %endmacro
  ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
  ;
  ; Converts six float channels (src[0]..src[5]) to int16 with signed saturation
  ; and writes them interleaved, one sample of every channel after another.
  ;
  ; Macro parameter:
  ;   %1  function-name suffix
  ; The macro relies on cvtps2pi and pswapd being usable at expansion time;
  ; the instantiations below %define them per target where required.
  ;
  ; One iteration converts 2 samples per channel (8 source bytes per channel,
  ; 24 destination bytes). The loop body runs at least once.
  ; NOTE(review): this implies len is a positive multiple of 2 -- confirm with
  ; the callers.
  %macro FLOAT_TO_INT16_INTERLEAVE6 1
  cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov       lend, r2d                 ; save len before r2 is reused as src1
  %else
      %define lend dword r2m              ; count down len in its argument slot
  %endif
      ; Fetch the six channel pointers; src[0] goes last because it replaces
      ; the pointer-array base held in srcq.
      mov       src5q, [srcq+5*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src1q, [srcq+1*gprsize]
      mov       srcq, [srcq]
      ; Turn channels 1-5 into offsets from channel 0 so that advancing srcq
      ; alone moves all six read positions.
      sub       src5q, srcq
      sub       src4q, srcq
      sub       src3q, srcq
      sub       src2q, srcq
      sub       src1q, srcq
  .loop:
      ; Channels are labelled a..f below; each register holds two samples.
      cvtps2pi  mm0, [srcq]
      cvtps2pi  mm1, [srcq+src1q]
      cvtps2pi  mm2, [srcq+src2q]
      cvtps2pi  mm3, [srcq+src3q]
      cvtps2pi  mm4, [srcq+src4q]
      cvtps2pi  mm5, [srcq+src5q]
      packssdw  mm0, mm3                  ; a0 a1 d0 d1
      packssdw  mm1, mm4                  ; b0 b1 e0 e1
      packssdw  mm2, mm5                  ; c0 c1 f0 f1
      pswapd    mm3, mm0                  ; d0 d1 a0 a1
      punpcklwd mm0, mm1                  ; a0 b0 a1 b1
      punpckhwd mm1, mm2                  ; e0 f0 e1 f1
      punpcklwd mm2, mm3                  ; c0 d0 c1 d1
      pswapd    mm3, mm0                  ; a1 b1 a0 b0
      punpckldq mm0, mm2                  ; a0 b0 c0 d0
      punpckhdq mm2, mm1                  ; c1 d1 e1 f1
      punpckldq mm1, mm3                  ; e0 f0 a1 b1
      movq      [dstq   ], mm0
      movq      [dstq+ 8], mm1
      movq      [dstq+16], mm2
      add       srcq, 8
      add       dstq, 24
      sub       lend, 2
      jg        .loop
      emms                                ; leave MMX state before returning
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

  ; SSE build: pswapd emulated with pshufw.
  %define pswapd PSWAPD_SSE
  FLOAT_TO_INT16_INTERLEAVE6 sse

  ; 3DNow! build: pf2id for the conversion, pswapd emulated with MMX shifts.
  %define cvtps2pi pf2id
  %define pswapd PSWAPD_3DN1
  FLOAT_TO_INT16_INTERLEAVE6 3dnow

  ; 3dn2 build: the pswapd override is dropped, pf2id stays in effect.
  %undef pswapd
  FLOAT_TO_INT16_INTERLEAVE6 3dn2
  %undef cvtps2pi
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves six float channels (src[0]..src[5]) into dst, one sample of
  ; every channel after another.
  ;
  ; Macro parameters:
  ;   %1  function-name suffix; "sse" selects the movaps path, any other value
  ;       the movq path
  ;   %2  XMM register count handed to cglobal
  ;
  ; One iteration moves mmsize bytes per channel (mmsize/4 samples) and writes
  ; 6*mmsize bytes. The loop body runs at least once.
  ; NOTE(review): this implies len is a positive multiple of mmsize/4 --
  ; confirm with the callers.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE6 2
  cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov       lend, r2d                 ; save len before r2 is reused as src1
  %else
      %define lend dword r2m              ; count down len in its argument slot
  %endif
      ; Fetch the six channel pointers; src[0] goes last because it replaces
      ; the pointer-array base held in srcq.
      mov       src5q, [srcq+5*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src1q, [srcq+1*gprsize]
      mov       srcq, [srcq]
      ; Channels 1-5 become offsets from channel 0, so only srcq has to advance.
      sub       src5q, srcq
      sub       src4q, srcq
      sub       src3q, srcq
      sub       src2q, srcq
      sub       src1q, srcq
  .loop:
  %ifidn %1, sse
      ; Channels are labelled a..f in the layout comments below.
      movaps    m0, [srcq]
      movaps    m1, [srcq+src1q]
      movaps    m2, [srcq+src2q]
      movaps    m3, [srcq+src3q]
      movaps    m4, [srcq+src4q]
      movaps    m5, [srcq+src5q]
      ; NOTE(review): SBUTTERFLYPS comes from x86util.asm; the layouts noted
      ; below assume it leaves the interleaved low halves in its first
      ; register and the interleaved high halves in its second.
      SBUTTERFLYPS 0, 1, 6
      SBUTTERFLYPS 2, 3, 6
      SBUTTERFLYPS 4, 5, 6

      movaps    m6, m4
      shufps    m4, m0, 0xe4              ; e0 f0 a1 b1
      movlhps   m0, m2                    ; a0 b0 c0 d0
      movhlps   m6, m2                    ; c1 d1 e1 f1
      movaps    [dstq   ], m0
      movaps    [dstq+16], m4
      movaps    [dstq+32], m6

      movaps    m6, m5
      shufps    m5, m1, 0xe4              ; e2 f2 a3 b3
      movlhps   m1, m3                    ; a2 b2 c2 d2
      movhlps   m6, m3                    ; c3 d3 e3 f3
      movaps    [dstq+48], m1
      movaps    [dstq+64], m5
      movaps    [dstq+80], m6
  %else ; mmx
      movq      m0, [srcq]
      movq      m1, [srcq+src1q]
      movq      m2, [srcq+src2q]
      movq      m3, [srcq+src3q]
      movq      m4, [srcq+src4q]
      movq      m5, [srcq+src5q]
      SBUTTERFLY dq, 0, 1, 6
      SBUTTERFLY dq, 2, 3, 6
      SBUTTERFLY dq, 4, 5, 6
      ; First sample of all six channels, then the second sample.
      movq      [dstq   ], m0
      movq      [dstq+ 8], m2
      movq      [dstq+16], m4
      movq      [dstq+24], m1
      movq      [dstq+32], m3
      movq      [dstq+40], m5
  %endif
      add       srcq, mmsize
      add       dstq, mmsize*6
      sub       lend, mmsize/4
      jg        .loop
  %ifidn %1, mmx
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  FLOAT_INTERLEAVE6 mmx, 0

  INIT_XMM
  FLOAT_INTERLEAVE6 sse, 7
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves two float channels: dst = s0[0] s1[0] s0[1] s1[1] ...
  ;
  ; Macro parameters:
  ;   %1  function-name suffix; "mmx" additionally emits emms before returning
  ;   %2  XMM register count handed to cglobal
  ; The macro expects MOVPS, PUNPCKLDQ and PUNPCKHDQ to be %defined to the
  ; move / unpack-low / unpack-high instructions of the selected register
  ; width (see the instantiations below).
  ;
  ; One iteration reads 2*mmsize bytes from each channel and writes 4*mmsize
  ; bytes. The loop body runs at least once.
  ; NOTE(review): this implies len is a positive multiple of mmsize/2 --
  ; confirm with the callers.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE2 2
  cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
      mov       src1q, [srcq+gprsize]     ; src[1]
      mov       srcq, [srcq        ]      ; src[0]
      sub       src1q, srcq               ; src1q = offset of channel 1 from channel 0
  .loop:
      MOVPS     m0, [srcq             ]
      MOVPS     m1, [srcq+src1q       ]
      MOVPS     m3, [srcq      +mmsize]
      MOVPS     m4, [srcq+src1q+mmsize]

      MOVPS     m2, m0
      PUNPCKLDQ m0, m1                    ; low halves of both channels, interleaved
      PUNPCKHDQ m2, m1                    ; high halves of both channels, interleaved

      MOVPS     m1, m3
      PUNPCKLDQ m3, m4
      PUNPCKHDQ m1, m4

      MOVPS     [dstq         ], m0
      MOVPS     [dstq+1*mmsize], m2
      MOVPS     [dstq+2*mmsize], m3
      MOVPS     [dstq+3*mmsize], m1

      add       srcq, mmsize*2
      add       dstq, mmsize*4
      sub       lend, mmsize/2            ; floats consumed per channel this iteration
      jg        .loop
  %ifidn %1, mmx
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  %define MOVPS     movq
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 mmx, 0

  INIT_XMM
  %define MOVPS     movaps
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 sse, 5