You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

370 lines
9.2KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86inc.asm"
  22. %include "libavutil/x86/x86util.asm"
  23. SECTION_TEXT
  ;---------------------------------------------------------------------------------
  ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
  ;
  ; Converts 32-bit integers from src to float, multiplies each by mul and
  ; stores the products to dst.
  ;
  ; Macro parameters:
  ;   %1 = instruction-set suffix appended to the symbol name (sse / sse2)
  ;   %2 = XMM register count handed to cglobal
  ;
  ; Where mul comes from:
  ;   UNIX64  - only dst/src/len are named as arguments; m0 is used as-is
  ;   WIN64   - register 2 is swapped into slot 0
  ;   x86-32  - loaded from the stack slot mulm
  ;
  ; Each loop pass consumes 32 bytes of src and writes 32 bytes of dst (8
  ; elements), and the loop only stops once the byte index reaches >= 0.
  ; NOTE(review): this presumably requires len to be a positive multiple of 8
  ; and suitably aligned buffers - confirm against the callers.
  ;---------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 2
  %ifdef UNIX64
  cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
  %else
  cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
  %endif
  %ifdef WIN64
      SWAP        0, 2
  %elifdef ARCH_X86_32
      movss       m0, mulm
  %endif
      SPLATD      m0                          ; broadcast mul to every lane of m0
      ; len is a C int: scale it through the 32-bit register view so the
      ; (unspecified) upper half of the 64-bit register cannot leak into
      ; the pointer arithmetic below. On x86-64 the 32-bit write zero-extends.
      shl         lend, 2                     ; len *= sizeof(int32/float)
      add         srcq, lenq                  ; srcq -> one past the last input
      add         dstq, lenq                  ; dstq -> one past the last output
      neg         lenq                        ; index runs from -bytes up to 0
  .loop:
  %ifidn %1, sse2
      cvtdq2ps    m1, [srcq+lenq]
      cvtdq2ps    m2, [srcq+lenq+16]
  %else
      ; cvtpi2ps converts two ints at a time into the low half of the
      ; destination; movlhps then joins two halves into a full register.
      cvtpi2ps    m1, [srcq+lenq]
      cvtpi2ps    m3, [srcq+lenq+8]
      cvtpi2ps    m2, [srcq+lenq+16]
      cvtpi2ps    m4, [srcq+lenq+24]
      movlhps     m1, m3
      movlhps     m2, m4
  %endif
      mulps       m1, m0
      mulps       m2, m0
      mova        [dstq+lenq], m1
      mova        [dstq+lenq+16], m2
      add         lenq, 32
      jl          .loop
      REP_RET
  %endmacro

  INIT_XMM
  ; SSE1 build: movdqa is remapped to movaps for the duration of the expansion.
  %define SPLATD SPLATD_SSE
  %define movdqa movaps
  INT32_TO_FLOAT_FMUL_SCALAR sse, 5
  %undef movdqa
  %define SPLATD SPLATD_SSE2
  INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
  %undef SPLATD
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;
  ; Converts floats from src to 16-bit integers in dst; the 32->16 bit
  ; narrowing is done with packssdw.
  ;
  ; Macro parameters:
  ;   %1 = instruction-set suffix appended to the symbol name
  ;   %2 = XMM register count handed to cglobal
  ;
  ; Each loop pass writes 16 bytes of dst (8 samples) and reads 32 bytes of src.
  ; NOTE(review): len is presumably a positive multiple of 8 - confirm with callers.
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16 2
  cglobal float_to_int16_%1, 3,3,%2, dst, src, len
      add         lenq, lenq                  ; lenq = size of the output in bytes
      add         dstq, lenq                  ; dstq -> one past the last output
      lea         srcq, [srcq+2*lenq]         ; srcq -> one past the last input
      neg         lenq                        ; negative byte index, counts up to 0
  .loop:
  %ifnidn %1, sse2
      ; 64-bit register path: four conversions of two floats each
      cvtps2pi    m0, [srcq+2*lenq]
      cvtps2pi    m1, [srcq+2*lenq+8]
      cvtps2pi    m2, [srcq+2*lenq+16]
      cvtps2pi    m3, [srcq+2*lenq+24]
      packssdw    m0, m1
      packssdw    m2, m3
      mova        [dstq+lenq], m0
      mova        [dstq+lenq+8], m2
  %else
      ; 128-bit register path: two conversions of four floats each
      cvtps2dq    m0, [srcq+2*lenq]
      cvtps2dq    m1, [srcq+2*lenq+16]
      packssdw    m0, m1
      mova        [dstq+lenq], m0
  %endif
      add         lenq, 16
      js          .loop
  %ifnidn %1, sse2
      emms                                    ; non-sse2 variants used the mm registers
  %endif
      REP_RET
  %endmacro

  INIT_XMM
  FLOAT_TO_INT16 sse2, 2
  INIT_MMX
  FLOAT_TO_INT16 sse, 0
  ; The 3dnow variant reuses the same body with pf2id standing in for cvtps2pi.
  %define cvtps2pi pf2id
  FLOAT_TO_INT16 3dnow, 0
  %undef cvtps2pi
  ;-------------------------------------------------------------------------------
  ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  ;
  ; Converts two float channels (src[0], src[1]) to int16 and writes them to
  ; dst as alternating samples.
  ;
  ; Macro parameter:
  ;   %1 = instruction-set suffix appended to the symbol name
  ;
  ; Each loop pass consumes 16 bytes (4 floats) from each channel and writes
  ; 16 bytes (8 int16) to dst.
  ; NOTE(review): len is presumably a positive multiple of 4 - confirm with callers.
  ;-------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE2 1
  cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
      ; The third argument (len) arrives in r2, the same register that is named
      ; src1 here, so it has to be scaled into lenq before src1q is loaded.
      lea         lenq, [4*r2q]               ; lenq = bytes per input channel
      mov         src1q, [src0q+gprsize]      ; src1q = src[1]
      mov         src0q, [src0q]              ; src0q = src[0] (pointer array no longer needed)
      add         src0q, lenq
      add         src1q, lenq
      add         dstq, lenq
      neg         lenq                        ; negative byte index, counts up to 0
  .loop:
  %ifnidn %1, sse2
      cvtps2pi    m0, [src0q+lenq]
      cvtps2pi    m1, [src0q+lenq+8]
      cvtps2pi    m2, [src1q+lenq]
      cvtps2pi    m3, [src1q+lenq+8]
      packssdw    m0, m1                      ; m0 = 4 samples of channel 0
      packssdw    m2, m3                      ; m2 = 4 samples of channel 1
      mova        m1, m0
      punpcklwd   m0, m2                      ; interleave samples 0-1
      punpckhwd   m1, m2                      ; interleave samples 2-3
      mova        [dstq+lenq], m0
      mova        [dstq+lenq+8], m1
  %else
      cvtps2dq    m0, [src0q+lenq]
      cvtps2dq    m1, [src1q+lenq]
      packssdw    m0, m1                      ; low half: channel 0, high half: channel 1
      movhlps     m1, m0                      ; bring the channel 1 half down
      punpcklwd   m0, m1                      ; interleave the two channels
      mova        [dstq+lenq], m0
  %endif
      add         lenq, 16
      js          .loop
  %ifnidn %1, sse2
      emms                                    ; non-sse2 variants used the mm registers
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  %define cvtps2pi pf2id
  FLOAT_TO_INT16_INTERLEAVE2 3dnow
  %undef cvtps2pi
  %define movdqa movaps
  FLOAT_TO_INT16_INTERLEAVE2 sse
  %undef movdqa
  INIT_XMM
  FLOAT_TO_INT16_INTERLEAVE2 sse2
  ; PSWAPD_SSE dst, src
  ; Stand-in for the pswapd instruction built on pshufw: the 0x4e selector
  ; exchanges the two 32-bit halves of src and writes the result to dst.
  %macro PSWAPD_SSE 2
      pshufw      %1, %2, 0x4e
  %endmacro
  ; PSWAPD_3DN1 dst, src
  ; Stand-in for the pswapd instruction using only movq/psrlq/punpckldq:
  ; dst receives src with its two 32-bit halves exchanged.
  ; dst and src must be different registers, since src is read again after
  ; dst has been written.
  %macro PSWAPD_3DN1 2
      movq        %1, %2                      ; dst = src
      psrlq       %1, 32                      ; dst.lo = src.hi, dst.hi = 0
      punpckldq   %1, %2                      ; dst.hi = src.lo
  %endmacro
  ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
  ;
  ; Converts six float channels (src[0]..src[5]) to int16 and writes them to dst
  ; interleaved, two samples per channel (24 output bytes) per loop pass.
  ;
  ; Macro parameter:
  ;   %1 = instruction-set suffix appended to the symbol name
  ;
  ; The body relies on the pswapd and cvtps2pi names being (re)defined by the
  ; instantiation site where the target lacks the native instruction.
  ; NOTE(review): len is presumably a positive even number - confirm with callers.
  %macro FLOAT_TO_INT16_INTERLEAVE6 1
  cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
  %ifndef ARCH_X86_64
      %define lend dword r2m                  ; 32-bit: count down len in its stack slot
  %else
      ; r2 carries len on entry but is also the src1 slot; park len in r10d.
      %define lend r10d
      mov         lend, r2d
  %endif
      ; Fetch the six channel pointers; srcq itself is replaced last because it
      ; is the base of the pointer array.
      mov         src1q, [srcq+1*gprsize]
      mov         src2q, [srcq+2*gprsize]
      mov         src3q, [srcq+3*gprsize]
      mov         src4q, [srcq+4*gprsize]
      mov         src5q, [srcq+5*gprsize]
      mov         srcq, [srcq]
      ; Turn channels 1-5 into offsets from channel 0 so that advancing srcq
      ; alone steps every channel.
      sub         src5q, srcq
      sub         src4q, srcq
      sub         src3q, srcq
      sub         src2q, srcq
      sub         src1q, srcq
  .loop:
      ; Lane notes: a..f = channels 0..5, digit = sample index within this pass.
      cvtps2pi    mm0, [srcq]
      cvtps2pi    mm1, [srcq+src1q]
      cvtps2pi    mm2, [srcq+src2q]
      cvtps2pi    mm3, [srcq+src3q]
      cvtps2pi    mm4, [srcq+src4q]
      cvtps2pi    mm5, [srcq+src5q]
      packssdw    mm0, mm3                    ; mm0 = a0 a1 d0 d1
      packssdw    mm1, mm4                    ; mm1 = b0 b1 e0 e1
      packssdw    mm2, mm5                    ; mm2 = c0 c1 f0 f1
      pswapd      mm3, mm0                    ; mm3 = d0 d1 a0 a1
      punpcklwd   mm0, mm1                    ; mm0 = a0 b0 a1 b1
      punpckhwd   mm1, mm2                    ; mm1 = e0 f0 e1 f1
      punpcklwd   mm2, mm3                    ; mm2 = c0 d0 c1 d1
      pswapd      mm3, mm0                    ; mm3 = a1 b1 a0 b0
      punpckldq   mm0, mm2                    ; mm0 = a0 b0 c0 d0
      punpckhdq   mm2, mm1                    ; mm2 = c1 d1 e1 f1
      punpckldq   mm1, mm3                    ; mm1 = e0 f0 a1 b1
      movq        [dstq], mm0
      movq        [dstq+8], mm1
      movq        [dstq+16], mm2
      add         srcq, 8                     ; 2 floats per channel
      add         dstq, 24                    ; 2 samples * 6 channels * 2 bytes
      sub         lend, 2
      jg          .loop
      emms
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

  %define pswapd PSWAPD_SSE
  FLOAT_TO_INT16_INTERLEAVE6 sse
  %define cvtps2pi pf2id
  %define pswapd PSWAPD_3DN1
  FLOAT_TO_INT16_INTERLEAVE6 3dnow
  ; The 3dn2 variant is assembled with pswapd left undefined as a macro, so
  ; the mnemonic is passed through to the assembler unchanged.
  %undef pswapd
  FLOAT_TO_INT16_INTERLEAVE6 3dn2
  %undef cvtps2pi
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves six float channels (src[0]..src[5]) into dst.
  ;
  ; Macro parameters:
  ;   %1 = instruction-set suffix appended to the symbol name (mmx / sse)
  ;   %2 = XMM register count handed to cglobal
  ;
  ; Each loop pass reads mmsize bytes from every channel and writes 6*mmsize
  ; bytes to dst, i.e. mmsize/4 samples per channel.
  ; NOTE(review): len is presumably a positive multiple of mmsize/4 - confirm
  ; with callers.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE6 2
  cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
  %ifndef ARCH_X86_64
      %define lend dword r2m                  ; 32-bit: count down len in its stack slot
  %else
      ; r2 carries len on entry but is also the src1 slot; park len in r10d.
      %define lend r10d
      mov         lend, r2d
  %endif
      ; Fetch the six channel pointers; srcq itself is replaced last because it
      ; is the base of the pointer array.
      mov         src1q, [srcq+1*gprsize]
      mov         src2q, [srcq+2*gprsize]
      mov         src3q, [srcq+3*gprsize]
      mov         src4q, [srcq+4*gprsize]
      mov         src5q, [srcq+5*gprsize]
      mov         srcq, [srcq]
      ; Channels 1-5 become offsets relative to channel 0.
      sub         src5q, srcq
      sub         src4q, srcq
      sub         src3q, srcq
      sub         src2q, srcq
      sub         src1q, srcq
  .loop:
  %ifnidn %1, sse
      ; mmx: two floats per channel
      movq        m0, [srcq]
      movq        m1, [srcq+src1q]
      movq        m2, [srcq+src2q]
      movq        m3, [srcq+src3q]
      movq        m4, [srcq+src4q]
      movq        m5, [srcq+src5q]
      ; Pair up channels (0,1), (2,3), (4,5); m6 is the scratch register.
      SBUTTERFLY  dq, 0, 1, 6
      SBUTTERFLY  dq, 2, 3, 6
      SBUTTERFLY  dq, 4, 5, 6
      movq        [dstq], m0
      movq        [dstq+8], m2
      movq        [dstq+16], m4
      movq        [dstq+24], m1
      movq        [dstq+32], m3
      movq        [dstq+40], m5
  %else
      ; sse: four floats per channel
      movaps      m0, [srcq]
      movaps      m1, [srcq+src1q]
      movaps      m2, [srcq+src2q]
      movaps      m3, [srcq+src3q]
      movaps      m4, [srcq+src4q]
      movaps      m5, [srcq+src5q]
      ; Pair up channels (0,1), (2,3), (4,5); m6 is the scratch register.
      SBUTTERFLYPS 0, 1, 6
      SBUTTERFLYPS 2, 3, 6
      SBUTTERFLYPS 4, 5, 6
      ; First three output vectors, assembled from m0 / m2 / m4.
      movaps      m6, m4
      shufps      m4, m0, 0xe4
      movlhps     m0, m2
      movhlps     m6, m2
      movaps      [dstq], m0
      movaps      [dstq+16], m4
      movaps      [dstq+32], m6
      ; Last three output vectors, assembled the same way from m1 / m3 / m5.
      movaps      m6, m5
      shufps      m5, m1, 0xe4
      movlhps     m1, m3
      movhlps     m6, m3
      movaps      [dstq+48], m1
      movaps      [dstq+64], m5
      movaps      [dstq+80], m6
  %endif
      add         srcq, mmsize
      add         dstq, mmsize*6
      sub         lend, mmsize/4
      jg          .loop
  %ifidn %1, mmx
      emms
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  FLOAT_INTERLEAVE6 mmx, 0
  INIT_XMM
  FLOAT_INTERLEAVE6 sse, 7
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves two float channels (src[0], src[1]) into dst.
  ;
  ; Macro parameters:
  ;   %1 = instruction-set suffix appended to the symbol name (mmx / sse)
  ;   %2 = XMM register count handed to cglobal
  ;
  ; The body is written against the MOVPS / PUNPCKLDQ / PUNPCKHDQ names, which
  ; the instantiation site maps onto the integer (mmx) or float (sse) forms.
  ;
  ; Each loop pass reads 2*mmsize bytes from each channel and writes 4*mmsize
  ; bytes to dst, i.e. mmsize/2 samples per channel.
  ; NOTE(review): len is presumably a positive multiple of mmsize/2 - confirm
  ; with callers.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE2 2
  cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
      mov         src1q, [srcq+gprsize]       ; src1q = src[1]
      mov         srcq, [srcq]                ; srcq  = src[0]
      sub         src1q, srcq                 ; src1q = offset of channel 1 from channel 0
  .loop:
      MOVPS       m0, [srcq]
      MOVPS       m1, [srcq+src1q]
      MOVPS       m3, [srcq+mmsize]
      MOVPS       m4, [srcq+src1q+mmsize]
      MOVPS       m2, m0
      PUNPCKLDQ   m0, m1                      ; low halves of the first vectors, interleaved
      PUNPCKHDQ   m2, m1                      ; high halves of the first vectors, interleaved
      MOVPS       m1, m3
      PUNPCKLDQ   m3, m4                      ; low halves of the second vectors, interleaved
      PUNPCKHDQ   m1, m4                      ; high halves of the second vectors, interleaved
      MOVPS       [dstq], m0
      MOVPS       [dstq+1*mmsize], m2
      MOVPS       [dstq+2*mmsize], m3
      MOVPS       [dstq+3*mmsize], m1
      add         srcq, mmsize*2
      add         dstq, mmsize*4
      sub         lend, mmsize/2
      jg          .loop
  %ifidn %1, mmx
      emms
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  %define MOVPS movq
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 mmx, 0
  INIT_XMM
  %define MOVPS movaps
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 sse, 5