You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

366 lines
9.2KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  23. SECTION_TEXT
  ;---------------------------------------------------------------------------------
  ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
  ;
  ; Converts src[] to float and scales every element by mul, writing dst[].
  ; One loop iteration handles 8 elements (32 bytes) and the body always runs at
  ; least once, so len has to be a positive multiple of 8 to stay inside the
  ; buffers.
  ;
  ; Macro arguments:
  ;   %1  function name suffix; "sse2" selects cvtdq2ps, anything else cvtpi2ps
  ;   %2  XMM register count handed to cglobal
  ;
  ; Where "mul" lives on entry:
  ;   x86-32       : on the stack (mulm), loaded with movss
  ;   x86-64 SysV  : already in xmm0; len is the 3rd integer argument
  ;   x86-64 Win64 : argument slots are positional, so mul is in xmm2 and len is
  ;                  the 4th integer argument; SWAP 0, 2 renames xmm2 to m0.
  ;                  NOTE(review): relies on x86inc defining WIN64 with the same
  ;                  %ifdef convention as ARCH_X86_64 - confirm against x86inc.asm.
  ;---------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 2
  %ifdef ARCH_X86_64
    %ifdef WIN64
  cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
      SWAP      0, 2                      ; m0 now names xmm2 (= mul)
    %else
  cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
    %endif
  %else
  cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
      movss     m0, mulm
  %endif
      SPLATD    m0                        ; broadcast mul to all four lanes
      ; len is a 32-bit int: shifting the dword view zero-extends into lenq, so
      ; undefined upper register bits never reach the address arithmetic below.
      shl       lend, 2                   ; len = byte count
      add       dstq, lenq                ; point both buffers at their end ...
      add       srcq, lenq
      neg       lenq                      ; ... and count a negative offset up to 0
  .loop:
  %ifnidn %1, sse2
      ; SSE1: cvtpi2ps converts two ints into the low half of the destination;
      ; movlhps then merges two halves into a full 4-float vector.
      cvtpi2ps  m1, [srcq+lenq   ]
      cvtpi2ps  m3, [srcq+lenq+ 8]
      movlhps   m1, m3
      cvtpi2ps  m2, [srcq+lenq+16]
      cvtpi2ps  m4, [srcq+lenq+24]
      movlhps   m2, m4
  %else
      cvtdq2ps  m1, [srcq+lenq   ]
      cvtdq2ps  m2, [srcq+lenq+16]
  %endif
      mulps     m1, m0
      mulps     m2, m0
      mova      [dstq+lenq   ], m1
      mova      [dstq+lenq+16], m2
      add       lenq, 32
      jl        .loop
      REP_RET
  %endmacro

  INIT_XMM
  ; SSE1 build: use the SSE1 splat helper and route movdqa to movaps
  ; (presumably because mova expands to movdqa under INIT_XMM - see x86inc.asm).
  %define SPLATD SPLATD_SSE
  %define movdqa movaps
  INT32_TO_FLOAT_FMUL_SCALAR sse, 5
  %undef movdqa

  ; Re-initialise so that a register permutation left behind by SWAP in the
  ; previous expansion cannot leak into the next one.
  INIT_XMM
  %define SPLATD SPLATD_SSE2
  INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
  %undef SPLATD
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;
  ; Converts floats to 16-bit integers with signed saturation (packssdw).
  ; Every iteration emits 16 bytes of dst (8 samples) and the loop body runs at
  ; least once.
  ;
  ; Macro arguments:
  ;   %1  function name suffix; "sse2" selects the XMM path, anything else the
  ;       MMX-register path (which finishes with emms)
  ;   %2  XMM register count handed to cglobal
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16 2
  cglobal float_to_int16_%1, 3,3,%2, dst, src, len
      add       lenq, lenq                ; len*2 = dst size in bytes
      add       dstq, lenq                ; dst end
      lea       srcq, [srcq+2*lenq]       ; src end (len*4 bytes)
      neg       lenq                      ; negative dst byte offset, counts up to 0
  .loop:
  %ifnidn %1, sse2
      ; MMX registers hold two dwords each, so four conversions are needed
      ; to gather the eight samples of one iteration.
      cvtps2pi  m0, [srcq+2*lenq   ]
      cvtps2pi  m1, [srcq+2*lenq+ 8]
      packssdw  m0, m1
      cvtps2pi  m2, [srcq+2*lenq+16]
      cvtps2pi  m3, [srcq+2*lenq+24]
      packssdw  m2, m3
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m2
  %else
      cvtps2dq  m0, [srcq+2*lenq   ]
      cvtps2dq  m1, [srcq+2*lenq+16]
      packssdw  m0, m1
      mova      [dstq+lenq], m0
  %endif
      add       lenq, 16
      js        .loop
  %ifnidn %1, sse2
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM
  FLOAT_TO_INT16 sse2, 2

  INIT_MMX
  FLOAT_TO_INT16 sse, 0
  ; The 3DNow! build reuses the MMX path with pf2id as the conversion instruction.
  %define cvtps2pi pf2id
  FLOAT_TO_INT16 3dnow, 0
  %undef cvtps2pi
  ;-------------------------------------------------------------------------------
  ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  ;
  ; Reads two float channels (src[0], src[1]), converts them to int16 with
  ; signed saturation and writes them interleaved: a0 b0 a1 b1 ...
  ; Every iteration consumes 4 floats per channel and emits 16 bytes of dst.
  ;
  ; Macro argument:
  ;   %1  function name suffix; "sse2" selects the XMM path, anything else the
  ;       MMX-register path (which finishes with emms)
  ;
  ; Register naming: the third argument (len) arrives in r2, which is named
  ; src1 here; it is scaled into lenq (r3) before r2 is reused for src[1].
  ;-------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE2 1
  cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
      lea       lenq, [4*r2q]             ; bytes per input channel (= dst bytes)
      add       dstq, lenq                ; dst end
      mov       src1q, [src0q+gprsize]    ; src[1]
      mov       src0q, [src0q]            ; src[0]
      add       src1q, lenq               ; channel ends
      add       src0q, lenq
      neg       lenq                      ; negative byte offset, counts up to 0
  .loop:
  %ifnidn %1, sse2
      cvtps2pi  m0, [src0q+lenq  ]
      cvtps2pi  m1, [src0q+lenq+8]
      packssdw  m0, m1                    ; m0 = a0 a1 a2 a3
      cvtps2pi  m2, [src1q+lenq  ]
      cvtps2pi  m3, [src1q+lenq+8]
      packssdw  m2, m3                    ; m2 = b0 b1 b2 b3
      mova      m1, m0
      punpcklwd m0, m2                    ; a0 b0 a1 b1
      punpckhwd m1, m2                    ; a2 b2 a3 b3
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m1
  %else
      cvtps2dq  m0, [src0q+lenq]
      cvtps2dq  m1, [src1q+lenq]
      packssdw  m0, m1                    ; a0..a3 in the low half, b0..b3 in the high half
      movhlps   m1, m0                    ; bring b0..b3 down to the low half of m1
      punpcklwd m0, m1                    ; a0 b0 a1 b1 a2 b2 a3 b3
      mova      [dstq+lenq], m0
  %endif
      add       lenq, 16
      js        .loop
  %ifnidn %1, sse2
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  ; The 3DNow! build substitutes pf2id for the conversion instruction.
  %define cvtps2pi pf2id
  FLOAT_TO_INT16_INTERLEAVE2 3dnow
  %undef cvtps2pi

  %define movdqa movaps
  FLOAT_TO_INT16_INTERLEAVE2 sse
  %undef movdqa

  INIT_XMM
  FLOAT_TO_INT16_INTERLEAVE2 sse2
  ; PSWAPD_SSE dst, src
  ; Stand-in for pswapd built on pshufw: destination words 0..3 take source
  ; words 2, 3, 0, 1, i.e. the two dwords of the MMX register trade places.
  %macro PSWAPD_SSE 2
      pshufw    %1, %2, (1<<6)|(0<<4)|(3<<2)|2   ; = 0x4e
  %endmacro
  ; PSWAPD_3DN1 dst, src
  ; Stand-in for pswapd using only MMX instructions: dst receives src with its
  ; two dwords exchanged. dst and src must be different registers, because src
  ; is read again after dst has been modified.
  %macro PSWAPD_3DN1 2
      movq      %1, %2                    ; dst = src
      psrlq     %1, 32                    ; dst low dword = src high dword
      punpckldq %1, %2                    ; dst high dword = src low dword
  %endmacro
  ; void float_to_int16_interleave6_<suffix>(int16_t *dst, const float **src, int len)
  ;
  ; Converts six float channels to int16 (signed saturation) and writes them
  ; interleaved. One iteration takes 2 samples from each channel and emits
  ; 24 bytes: c0 c1 c2 c3 c4 c5 for the first sample, then the same for the second.
  ; The loop body runs at least once and len is decremented by 2 per pass.
  ;
  ; Macro argument:
  ;   %1  function name suffix
  ; The cvtps2pi / pswapd names used in the body are redirected with %define at
  ; the instantiation sites below.
  %macro FLOAT_TO_INT16_INTERLEAVE6 1
  cglobal float_to_int16_interleave6_%1, 2,7,0, dst, src, src1, src2, src3, src4, src5
      ; All seven general registers are taken by pointers, so the counter lives
      ; in r10d on x86-64 and stays in its stack slot on x86-32.
  %ifndef ARCH_X86_64
      %define lend dword r2m
  %else
      %define lend r10d
      mov       lend, r2d
  %endif
      mov       src5q, [srcq+5*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src1q, [srcq+1*gprsize]
      mov       srcq,  [srcq]
      ; Keep channels 1..5 as offsets from channel 0, so that advancing srcq
      ; alone advances all six read positions.
      sub       src5q, srcq
      sub       src4q, srcq
      sub       src3q, srcq
      sub       src2q, srcq
      sub       src1q, srcq
  .loop:
      cvtps2pi  mm0, [srcq]
      cvtps2pi  mm1, [srcq+src1q]
      cvtps2pi  mm2, [srcq+src2q]
      cvtps2pi  mm3, [srcq+src3q]
      cvtps2pi  mm4, [srcq+src4q]
      cvtps2pi  mm5, [srcq+src5q]
      packssdw  mm0, mm3                  ; c0s0 c0s1 c3s0 c3s1
      packssdw  mm1, mm4                  ; c1s0 c1s1 c4s0 c4s1
      packssdw  mm2, mm5                  ; c2s0 c2s1 c5s0 c5s1
      pswapd    mm3, mm0                  ; c3s0 c3s1 c0s0 c0s1
      punpcklwd mm0, mm1                  ; c0s0 c1s0 c0s1 c1s1
      punpckhwd mm1, mm2                  ; c4s0 c5s0 c4s1 c5s1
      punpcklwd mm2, mm3                  ; c2s0 c3s0 c2s1 c3s1
      pswapd    mm3, mm0                  ; c0s1 c1s1 c0s0 c1s0
      punpckldq mm0, mm2                  ; c0s0 c1s0 c2s0 c3s0
      punpckhdq mm2, mm1                  ; c2s1 c3s1 c4s1 c5s1
      punpckldq mm1, mm3                  ; c4s0 c5s0 c0s1 c1s1
      movq      [dstq   ], mm0
      movq      [dstq+ 8], mm1
      movq      [dstq+16], mm2
      add       dstq, 24
      add       srcq, 8
      sub       lend, 2
      jg        .loop
      emms                                ; leave MMX state before returning
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

  ; sse: real cvtps2pi, pswapd emulated through pshufw.
  %define pswapd PSWAPD_SSE
  FLOAT_TO_INT16_INTERLEAVE6 sse

  ; 3dnow: pf2id replaces cvtps2pi, pswapd emulated with plain MMX instructions.
  %define pswapd PSWAPD_3DN1
  %define cvtps2pi pf2id
  FLOAT_TO_INT16_INTERLEAVE6 3dnow

  ; 3dn2: pswapd is no longer redirected, so the instruction itself is emitted.
  %undef pswapd
  FLOAT_TO_INT16_INTERLEAVE6 3dn2
  %undef cvtps2pi
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves six float channels into dst. One iteration reads mmsize bytes
  ; from every channel and writes 6*mmsize bytes; len drops by mmsize/4 floats
  ; per pass and the loop body runs at least once.
  ;
  ; Macro arguments:
  ;   %1  function name suffix; "sse" selects the XMM path, anything else the
  ;       MMX-register path ("mmx" additionally issues emms on exit)
  ;   %2  XMM register count handed to cglobal
  ;
  ; SBUTTERFLY / SBUTTERFLYPS are defined outside this file; the instruction
  ; sequences around them are kept exactly as written because those macros may
  ; rename registers.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE6 2
  cglobal float_interleave6_%1, 2,7,%2, dst, src, src1, src2, src3, src4, src5
      ; All seven general registers are taken by pointers, so the counter lives
      ; in r10d on x86-64 and stays in its stack slot on x86-32.
  %ifndef ARCH_X86_64
      %define lend dword r2m
  %else
      %define lend r10d
      mov       lend, r2d
  %endif
      mov       src5q, [srcq+5*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src1q, [srcq+1*gprsize]
      mov       srcq,  [srcq]
      ; Channels 1..5 become offsets relative to channel 0.
      sub       src5q, srcq
      sub       src4q, srcq
      sub       src3q, srcq
      sub       src2q, srcq
      sub       src1q, srcq
  .loop:
  %ifnidn %1, sse
      ; MMX-register path: two floats per channel per iteration.
      movq      m0, [srcq]
      movq      m1, [srcq+src1q]
      movq      m2, [srcq+src2q]
      movq      m3, [srcq+src3q]
      movq      m4, [srcq+src4q]
      movq      m5, [srcq+src5q]
      SBUTTERFLY dq, 0, 1, 6
      SBUTTERFLY dq, 2, 3, 6
      SBUTTERFLY dq, 4, 5, 6
      movq      [dstq   ], m0
      movq      [dstq+ 8], m2
      movq      [dstq+16], m4
      movq      [dstq+24], m1
      movq      [dstq+32], m3
      movq      [dstq+40], m5
  %else
      ; XMM path: four floats per channel per iteration, aligned loads/stores.
      movaps    m0, [srcq]
      movaps    m1, [srcq+src1q]
      movaps    m2, [srcq+src2q]
      movaps    m3, [srcq+src3q]
      movaps    m4, [srcq+src4q]
      movaps    m5, [srcq+src5q]
      SBUTTERFLYPS 0, 1, 6
      SBUTTERFLYPS 2, 3, 6
      SBUTTERFLYPS 4, 5, 6
      movaps    m6, m4
      shufps    m4, m0, 0xe4
      movlhps   m0, m2
      movhlps   m6, m2
      movaps    [dstq   ], m0
      movaps    [dstq+16], m4
      movaps    [dstq+32], m6
      movaps    m6, m5
      shufps    m5, m1, 0xe4
      movlhps   m1, m3
      movhlps   m6, m3
      movaps    [dstq+48], m1
      movaps    [dstq+64], m5
      movaps    [dstq+80], m6
  %endif
      add       dstq, mmsize*6
      add       srcq, mmsize
      sub       lend, mmsize/4
      jg        .loop
  %ifidn %1, mmx
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  FLOAT_INTERLEAVE6 mmx, 0

  INIT_XMM
  FLOAT_INTERLEAVE6 sse, 7
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves two float channels: dst = a0 b0 a1 b1 ...
  ; One iteration reads 2*mmsize bytes from each channel and writes 4*mmsize
  ; bytes; len drops by mmsize/2 floats per pass and the loop body runs at
  ; least once.
  ;
  ; Macro arguments:
  ;   %1  function name suffix ("mmx" additionally issues emms on exit)
  ;   %2  XMM register count handed to cglobal
  ;
  ; MOVPS / PUNPCKLDQ / PUNPCKHDQ are bound with %define at the instantiation
  ; sites below, so the same body serves both register widths.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE2 2
  cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
      mov       src1q, [srcq+gprsize]     ; src[1]
      mov       srcq,  [srcq        ]     ; src[0]
      sub       src1q, srcq               ; keep channel 1 as an offset from channel 0
  .loop:                                  ; the colon was missing in the original label
      MOVPS     m0, [srcq             ]
      MOVPS     m1, [srcq+src1q       ]
      MOVPS     m3, [srcq      +mmsize]
      MOVPS     m4, [srcq+src1q+mmsize]

      MOVPS     m2, m0
      PUNPCKLDQ m0, m1                    ; low halves of a/b, interleaved
      PUNPCKHDQ m2, m1                    ; high halves of a/b, interleaved

      MOVPS     m1, m3
      PUNPCKLDQ m3, m4
      PUNPCKHDQ m1, m4

      MOVPS     [dstq         ], m0
      MOVPS     [dstq+1*mmsize], m2
      MOVPS     [dstq+2*mmsize], m3
      MOVPS     [dstq+3*mmsize], m1

      add       srcq, mmsize*2
      add       dstq, mmsize*4
      sub       lend, mmsize/2
      jg        .loop
  %ifidn %1, mmx
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX
  %define MOVPS     movq
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 mmx, 0

  INIT_XMM
  %define MOVPS     movaps
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 sse, 5