You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

446 lines
11KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "x86inc.asm"
  22. %include "x86util.asm"
  23. SECTION_TEXT
;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
;---------------------------------------------------------------------------------
; Convert int32 samples to float and multiply each by the scalar 'mul'.
; %1 = ISA suffix (sse/sse2), %2 = number of XMM registers used.
%macro INT32_TO_FLOAT_FMUL_SCALAR 2
%if UNIX64
; Unix x86-64: the float arg 'mul' already arrives in xmm0, so only the
; three GPR arguments are declared.
cglobal int32_to_float_fmul_scalar_%1, 3,3,%2, dst, src, len
%else
cglobal int32_to_float_fmul_scalar_%1, 4,4,%2, dst, src, mul, len
%endif
%if WIN64
SWAP 0, 2                       ; Win64 passes 'mul' (3rd arg) in xmm2; move to m0
%elif ARCH_X86_32
movss m0, mulm                  ; x86-32: load 'mul' from its stack slot
%endif
SPLATD m0                       ; broadcast the scalar to all four lanes
shl lenq, 2                     ; len: elements -> bytes
add srcq, lenq                  ; point both pointers at their ends and run a
add dstq, lenq                  ; negative byte offset up towards zero
neg lenq
.loop:
%ifidn %1, sse2
; SSE2: convert four int32 at a time directly in the XMM domain.
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
; SSE1: cvtpi2ps converts only two int32 (64-bit memory source), so each
; 4-float vector is assembled from two halves with movlhps.
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add lenq, 32                    ; 8 samples (32 bytes) per iteration
jl .loop
REP_RET
%endmacro
INIT_XMM
%define SPLATD SPLATD_SSE
%define movdqa movaps           ; SSE1 build: replace SSE2-only movdqa
INT32_TO_FLOAT_FMUL_SCALAR sse, 5
%undef movdqa
%define SPLATD SPLATD_SSE2
INT32_TO_FLOAT_FMUL_SCALAR sse2, 3
%undef SPLATD
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
; Convert float samples to int16 with signed saturation (via packssdw).
; %1 = ISA suffix (sse2/sse/3dnow), %2 = number of XMM registers used.
%macro FLOAT_TO_INT16 2
cglobal float_to_int16_%1, 3,3,%2, dst, src, len
add lenq, lenq                  ; len: samples -> output bytes (2 per int16)
lea srcq, [srcq+2*lenq]         ; src end (floats are twice as wide as int16)
add dstq, lenq                  ; dst end; loop runs a negative offset to zero
neg lenq
.loop:
%ifidn %1, sse2
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1                 ; saturate 8 x int32 -> 8 x int16
mova [dstq+lenq], m0
%else
; MMX path: 2 floats per convert; 'cvtps2pi' is aliased to pf2id for 3dnow.
cvtps2pi m0, [srcq+2*lenq ]
cvtps2pi m1, [srcq+2*lenq+ 8]
cvtps2pi m2, [srcq+2*lenq+16]
cvtps2pi m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m2
%endif
add lenq, 16                    ; 8 samples per iteration
js .loop
%ifnidn %1, sse2
emms                            ; MMX paths must clear MMX/x87 state
%endif
REP_RET
%endmacro
INIT_XMM
FLOAT_TO_INT16 sse2, 2
INIT_MMX
FLOAT_TO_INT16 sse, 0
%define cvtps2pi pf2id          ; 3DNow! equivalent of cvtps2pi
FLOAT_TO_INT16 3dnow, 0
%undef cvtps2pi
;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
; Like float_to_int16, but output samples are scattered 'step' int16 slots
; apart: dst[i*step] = saturate(src[i]).
; %1 = ISA suffix (sse2/sse/3dnow), %2 = number of XMM registers used.
%macro FLOAT_TO_INT16_STEP 2
cglobal float_to_int16_step_%1, 4,7,%2, dst, src, len, step, step3, v1, v2
add lenq, lenq                  ; len: samples -> int16 bytes
lea srcq, [srcq+2*lenq]         ; src end (floats are 4 bytes each)
lea step3q, [stepq*3]           ; precompute 3*step for the 4th store slot
neg lenq
.loop:
%ifidn %1, sse2
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1                 ; saturate 8 x int32 -> 8 x int16
; Scatter the first 4 samples: pull two dwords (4 words) through GPRs.
movd v1d, m0
psrldq m0, 4
movd v2d, m0
psrldq m0, 4
mov [dstq], v1w                 ; sample 0 -> dst[0]
mov [dstq+stepq*4], v2w         ; sample 2 -> dst[2*step]
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w         ; sample 1 -> dst[step]
mov [dstq+step3q*2], v2w        ; sample 3 -> dst[3*step]
lea dstq, [dstq+stepq*8]        ; advance dst by 4*step int16 slots
; ...then the remaining 4 samples the same way.
movd v1d, m0
psrldq m0, 4
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%else
; MMX path: 2 floats per convert ('cvtps2pi' aliased to pf2id for 3dnow).
cvtps2pi m0, [srcq+2*lenq ]
cvtps2pi m1, [srcq+2*lenq+ 8]
cvtps2pi m2, [srcq+2*lenq+16]
cvtps2pi m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
movd v1d, m0
psrlq m0, 32
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m2
psrlq m2, 32
movd v2d, m2
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%endif
add lenq, 16                    ; 8 samples per iteration
js .loop
%ifnidn %1, sse2
emms                            ; MMX paths must clear MMX/x87 state
%endif
REP_RET
%endmacro
INIT_XMM
FLOAT_TO_INT16_STEP sse2, 2
INIT_MMX
FLOAT_TO_INT16_STEP sse, 0
%define cvtps2pi pf2id          ; 3DNow! equivalent of cvtps2pi
FLOAT_TO_INT16_STEP 3dnow, 0
%undef cvtps2pi
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
; Convert two float channels to int16 and interleave:
; dst[2i] = src[0][i], dst[2i+1] = src[1][i] (signed-saturated).
; %1 = ISA suffix (3dnow/sse/sse2).
%macro FLOAT_TO_INT16_INTERLEAVE2 1
cglobal float_to_int16_interleave2_%1, 3,4,2, dst, src0, src1, len
lea lenq, [4*r2q]               ; len: samples -> float bytes per channel
                                ; (also equals total int16 output bytes)
mov src1q, [src0q+gprsize]      ; src1 = src[1]
mov src0q, [src0q]              ; src0 = src[0]
add dstq, lenq                  ; point all three at their ends and run a
add src0q, lenq                 ; shared negative offset up towards zero
add src1q, lenq
neg lenq
.loop:
%ifidn %1, sse2
cvtps2dq m0, [src0q+lenq]
cvtps2dq m1, [src1q+lenq]
packssdw m0, m1                 ; m0 = [L0..L3, R0..R3]
movhlps m1, m0
punpcklwd m0, m1                ; m0 = [L0,R0,L1,R1,L2,R2,L3,R3]
mova [dstq+lenq], m0
%else
cvtps2pi m0, [src0q+lenq ]
cvtps2pi m1, [src0q+lenq+8]
cvtps2pi m2, [src1q+lenq ]
cvtps2pi m3, [src1q+lenq+8]
packssdw m0, m1                 ; 4 left-channel int16
packssdw m2, m3                 ; 4 right-channel int16
mova m1, m0
punpcklwd m0, m2                ; interleave low pairs
punpckhwd m1, m2                ; interleave high pairs
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m1
%endif
add lenq, 16                    ; 4 samples per channel per iteration
js .loop
%ifnidn %1, sse2
emms                            ; MMX paths must clear MMX/x87 state
%endif
REP_RET
%endmacro
INIT_MMX
%define cvtps2pi pf2id          ; 3DNow! equivalent of cvtps2pi
FLOAT_TO_INT16_INTERLEAVE2 3dnow
%undef cvtps2pi
%define movdqa movaps           ; SSE1 build: replace SSE2-only movdqa
FLOAT_TO_INT16_INTERLEAVE2 sse
%undef movdqa
INIT_XMM
FLOAT_TO_INT16_INTERLEAVE2 sse2
; Swap the two 32-bit dwords of an MMX register: %1 = swapped(%2).
; SSE/MMXEXT version: pshufw immediate 0x4e = word order 2,3,0,1.
%macro PSWAPD_SSE 2
pshufw %1, %2, 0x4e
%endmacro
; pswapd emulation for first-generation 3DNow! parts that lack the native
; instruction. %1 must differ from %2.
%macro PSWAPD_3DN1 2
movq %1, %2
psrlq %1, 32                    ; %1 low dword  = src high dword
punpckldq %1, %2                ; %1 high dword = src low dword
%endmacro
%macro FLOAT_TO_INT16_INTERLEAVE6 1
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
; Convert 6 float channels to int16 and interleave them. Each iteration
; handles 2 samples per channel and writes 12 int16 (24 bytes).
; %1 = ISA suffix (sse/3dnow/3dn2); cvtps2pi/pswapd are remapped per ISA.
cglobal float_to_int16_interleave6_%1, 2,8,0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m          ; x86-32: not enough GPRs, keep len in memory
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
; Turn the channel pointers into offsets from src[0] so a single pointer
; increment advances all six channels.
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
cvtps2pi mm0, [srcq]            ; 2 samples from each of the 6 channels
cvtps2pi mm1, [srcq+src1q]
cvtps2pi mm2, [srcq+src2q]
cvtps2pi mm3, [srcq+src3q]
cvtps2pi mm4, [srcq+src4q]
cvtps2pi mm5, [srcq+src5q]
packssdw mm0, mm3               ; saturate and pair channels (0,3)
packssdw mm1, mm4               ; (1,4)
packssdw mm2, mm5               ; (2,5)
; Transpose the 2x6 sample matrix into interleaved channel order.
pswapd mm3, mm0
punpcklwd mm0, mm1
punpckhwd mm1, mm2
punpcklwd mm2, mm3
pswapd mm3, mm0
punpckldq mm0, mm2
punpckhdq mm2, mm1
punpckldq mm1, mm3
movq [dstq ], mm0
movq [dstq+16], mm2
movq [dstq+ 8], mm1
add srcq, 8                     ; 2 floats consumed per channel
add dstq, 24                    ; 12 int16 written
sub lend, 2
jg .loop
emms                            ; all variants use MMX registers
RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
%define pswapd PSWAPD_SSE
FLOAT_TO_INT16_INTERLEAVE6 sse
%define cvtps2pi pf2id          ; 3DNow! equivalent of cvtps2pi
%define pswapd PSWAPD_3DN1      ; gen-1 3DNow! lacks native pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dnow
%undef pswapd                   ; extended 3DNow! (3dn2) has real pswapd
FLOAT_TO_INT16_INTERLEAVE6 3dn2
%undef cvtps2pi
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
; Interleave 6 float channels: dst[6i+c] = src[c][i].
; %1 = ISA suffix (mmx/sse), %2 = number of XMM registers used.
; Processes mmsize/4 samples per channel per iteration (sse: 4, mmx: 2).
%macro FLOAT_INTERLEAVE6 2
cglobal float_interleave6_%1, 2,8,%2, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m          ; x86-32: not enough GPRs, keep len in memory
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
; Turn the channel pointers into offsets from src[0] so a single pointer
; increment advances all six channels.
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
%ifidn %1, sse
movaps m0, [srcq]               ; 4 samples from each channel
movaps m1, [srcq+src1q]
movaps m2, [srcq+src2q]
movaps m3, [srcq+src3q]
movaps m4, [srcq+src4q]
movaps m5, [srcq+src5q]
; Pairwise transposes, then shuffle the 6x4 block into interleaved order.
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
movaps m6, m4
shufps m4, m0, 0xe4
movlhps m0, m2
movhlps m6, m2
movaps [dstq ], m0
movaps [dstq+16], m4
movaps [dstq+32], m6
movaps m6, m5
shufps m5, m1, 0xe4
movlhps m1, m3
movhlps m6, m3
movaps [dstq+48], m1
movaps [dstq+64], m5
movaps [dstq+80], m6
%else ; mmx
movq m0, [srcq]                 ; 2 samples from each channel
movq m1, [srcq+src1q]
movq m2, [srcq+src2q]
movq m3, [srcq+src3q]
movq m4, [srcq+src4q]
movq m5, [srcq+src5q]
SBUTTERFLY dq, 0, 1, 6          ; transpose 2x2 dword blocks
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
add srcq, mmsize                ; mmsize/4 floats consumed per channel
add dstq, mmsize*6              ; 6x as many floats written
sub lend, mmsize/4
jg .loop
%ifidn %1, mmx
emms                            ; only the MMX variant touches MMX state
%endif
REP_RET
%endmacro
INIT_MMX
FLOAT_INTERLEAVE6 mmx, 0
INIT_XMM
FLOAT_INTERLEAVE6 sse, 7
  375. ;-----------------------------------------------------------------------------
  376. ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  377. ;-----------------------------------------------------------------------------
  378. %macro FLOAT_INTERLEAVE2 2
  379. cglobal float_interleave2_%1, 3,4,%2, dst, src, len, src1
  380. mov src1q, [srcq+gprsize]
  381. mov srcq, [srcq ]
  382. sub src1q, srcq
  383. .loop
  384. MOVPS m0, [srcq ]
  385. MOVPS m1, [srcq+src1q ]
  386. MOVPS m3, [srcq +mmsize]
  387. MOVPS m4, [srcq+src1q+mmsize]
  388. MOVPS m2, m0
  389. PUNPCKLDQ m0, m1
  390. PUNPCKHDQ m2, m1
  391. MOVPS m1, m3
  392. PUNPCKLDQ m3, m4
  393. PUNPCKHDQ m1, m4
  394. MOVPS [dstq ], m0
  395. MOVPS [dstq+1*mmsize], m2
  396. MOVPS [dstq+2*mmsize], m3
  397. MOVPS [dstq+3*mmsize], m1
  398. add srcq, mmsize*2
  399. add dstq, mmsize*4
  400. sub lend, mmsize/2
  401. jg .loop
  402. %ifidn %1, mmx
  403. emms
  404. %endif
  405. REP_RET
  406. %endmacro
  407. INIT_MMX
  408. %define MOVPS movq
  409. %define PUNPCKLDQ punpckldq
  410. %define PUNPCKHDQ punpckhdq
  411. FLOAT_INTERLEAVE2 mmx, 0
  412. INIT_XMM
  413. %define MOVPS movaps
  414. %define PUNPCKLDQ unpcklps
  415. %define PUNPCKHDQ unpckhps
  416. FLOAT_INTERLEAVE2 sse, 5