You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

446 lines
11KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_TEXT
  ;------------------------------------------------------------------------------
  ; CVTPS2PI dst, src
  ;
  ; Convert two packed single-precision floats in src to two packed int32 in
  ; the MMX register dst, using whichever instruction the active cpuflags offer:
  ;     sse    -> cvtps2pi
  ;     3dnow  -> pf2id
  ; Expands to nothing when neither flag is set.
  ;
  ; NOTE(review): the SSE and 3DNow! instructions are not specified to round or
  ; saturate identically -- confirm callers tolerate the difference.
  ;------------------------------------------------------------------------------
  %macro CVTPS2PI 2
  %if cpuflag(sse)
      cvtps2pi  %1, %2
  %elif cpuflag(3dnow)
      pf2id     %1, %2
  %endif
  %endmacro
  ;---------------------------------------------------------------------------------
  ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
  ;
  ; dst[i] = (float)src[i] * mul
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ; SPLATD must be %defined before expansion; it is applied to m0 so that the
  ; scalar multiplier can be used with mulps on whole vectors.
  ;
  ; Each iteration converts 8 elements (32 bytes) and the loop body always runs
  ; at least once, so len is expected to be a positive multiple of 8.
  ; NOTE(review): the mova stores and the memory-operand cvtdq2ps presumably
  ; require 16-byte aligned dst/src -- confirm against callers.
  ;---------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 1
  %if UNIX64
  ; mul arrives in xmm0, so only dst/src/len occupy integer argument registers.
  cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
  %else
  cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
  %endif
  %if WIN64
      SWAP      0, 2                    ; mul is in the 3rd vector slot: name it m0
  %elif ARCH_X86_32
      movss     m0, mulm                ; mul is on the stack
  %endif
      SPLATD    m0
      ; len is a 32-bit int: the upper half of its 64-bit register is not
      ; guaranteed by the ABI. Scale through the 32-bit name; the write
      ; zero-extends into lenq.
      shl       lend, 2                 ; len *= sizeof(int32_t) -> byte count
      add       srcq, lenq              ; point both buffers at their end ...
      add       dstq, lenq
      neg       lenq                    ; ... and index with a negative offset
  .loop:
  %if cpuflag(sse2)
      cvtdq2ps  m1, [srcq+lenq   ]
      cvtdq2ps  m2, [srcq+lenq+16]
  %else
      ; SSE1: convert two ints at a time into the low half, then merge halves.
      cvtpi2ps  m1, [srcq+lenq   ]
      cvtpi2ps  m3, [srcq+lenq+ 8]
      cvtpi2ps  m2, [srcq+lenq+16]
      cvtpi2ps  m4, [srcq+lenq+24]
      movlhps   m1, m3
      movlhps   m2, m4
  %endif
      mulps     m1, m0
      mulps     m2, m0
      mova      [dstq+lenq   ], m1
      mova      [dstq+lenq+16], m2
      add       lenq, 32
      jl        .loop                   ; until the negative offset reaches 0
      REP_RET
  %endmacro

  INIT_XMM sse
  %define SPLATD SPLATD_SSE
  INT32_TO_FLOAT_FMUL_SCALAR 5
  INIT_XMM sse2
  %define SPLATD SPLATD_SSE2
  INT32_TO_FLOAT_FMUL_SCALAR 3
  %undef SPLATD
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;
  ; Converts len floats to saturated int16 (packssdw), 8 samples per iteration.
  ; The loop body always runs at least once.
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16 1
  cglobal float_to_int16, 3, 3, %1, dst, src, len
      add       lenq, lenq              ; lenq = byte size of dst (2 per sample)
      add       dstq, lenq              ; dst -> end of output
      lea       srcq, [srcq+2*lenq]     ; src -> end of input (4 bytes per sample)
      neg       lenq                    ; negative byte offset into dst
  .convert:
  %if cpuflag(sse2)
      cvtps2dq  m0, [srcq+2*lenq   ]
      cvtps2dq  m1, [srcq+2*lenq+16]
      packssdw  m0, m1
      mova      [dstq+lenq], m0
  %else
      CVTPS2PI  m0, [srcq+2*lenq   ]
      CVTPS2PI  m1, [srcq+2*lenq+ 8]
      CVTPS2PI  m2, [srcq+2*lenq+16]
      CVTPS2PI  m3, [srcq+2*lenq+24]
      packssdw  m0, m1
      packssdw  m2, m3
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m2
  %endif
      add       lenq, 16                ; 8 int16 written
      js        .convert
  %if mmsize == 8
      emms                              ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM sse2
  FLOAT_TO_INT16 2
  INIT_MMX sse
  FLOAT_TO_INT16 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16 0
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
  ;
  ; Converts len floats to saturated int16 and writes sample i to dst[i*step].
  ; 8 samples are handled per iteration; the loop body always runs at least once.
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;------------------------------------------------------------------------------

  ; Helper: scatter four int16 to dst and advance dst by 4*step samples.
  ;   in:  v1d = samples 0 (low word) and 1 (high word)
  ;        v2d = samples 2 (low word) and 3 (high word)
  ;   out: dstq += stepq*8 bytes; v1d/v2d are shifted right by 16.
  %macro FLOAT_TO_INT16_STEP_STORE4 0
      mov       [dstq], v1w             ; sample 0
      mov       [dstq+stepq*4], v2w     ; sample 2
      shr       v1d, 16
      shr       v2d, 16
      mov       [dstq+stepq*2], v1w     ; sample 1
      mov       [dstq+step3q*2], v2w    ; sample 3
      lea       dstq, [dstq+stepq*8]
  %endmacro

  %macro FLOAT_TO_INT16_STEP 1
  cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
      add       lenq, lenq              ; lenq = 2*len
      lea       step3q, [stepq*3]
      lea       srcq, [srcq+2*lenq]     ; src -> end of input (4 bytes per sample)
      neg       lenq                    ; negative offset, counts up to 0
  .convert:
  %if cpuflag(sse2)
      cvtps2dq  m0, [srcq+2*lenq   ]
      cvtps2dq  m1, [srcq+2*lenq+16]
      packssdw  m0, m1                  ; m0 = 8 x int16
      movd      v1d, m0
      psrldq    m0, 4
      movd      v2d, m0
      psrldq    m0, 4
      FLOAT_TO_INT16_STEP_STORE4
      movd      v1d, m0
      psrldq    m0, 4
      movd      v2d, m0
      FLOAT_TO_INT16_STEP_STORE4
  %else
      CVTPS2PI  m0, [srcq+2*lenq   ]
      CVTPS2PI  m1, [srcq+2*lenq+ 8]
      CVTPS2PI  m2, [srcq+2*lenq+16]
      CVTPS2PI  m3, [srcq+2*lenq+24]
      packssdw  m0, m1                  ; m0 = samples 0..3
      packssdw  m2, m3                  ; m2 = samples 4..7
      movd      v1d, m0
      psrlq     m0, 32
      movd      v2d, m0
      FLOAT_TO_INT16_STEP_STORE4
      movd      v1d, m2
      psrlq     m2, 32
      movd      v2d, m2
      FLOAT_TO_INT16_STEP_STORE4
  %endif
      add       lenq, 16
      js        .convert
  %if mmsize == 8
      emms                              ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM sse2
  FLOAT_TO_INT16_STEP 2
  INIT_MMX sse
  FLOAT_TO_INT16_STEP 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16_STEP 0
  ;-------------------------------------------------------------------------------
  ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  ;
  ; Converts two float channels (src[0], src[1]) of len samples each to saturated
  ; int16 and writes them interleaved: dst = s0[0] s1[0] s0[1] s1[1] ...
  ; 4 samples per channel are handled per iteration; the loop body always runs
  ; at least once.
  ;-------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE2 0
  cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
      ; Argument 2 (len) arrives in the register named src1 here; move it out,
      ; scaled to bytes, before that register is reused for the second pointer.
      lea       lenq, [4*r2q]
      mov       src1q, [src0q+gprsize]  ; src[1]
      mov       src0q, [src0q]          ; src[0]
      add       src0q, lenq             ; all three pointers -> end of buffer
      add       src1q, lenq
      add       dstq, lenq              ; 2 ch * 2 bytes == 4 bytes per sample too
      neg       lenq
  .convert:
  %if cpuflag(sse2)
      cvtps2dq  m0, [src0q+lenq]
      cvtps2dq  m1, [src1q+lenq]
      packssdw  m0, m1                  ; low half: ch0, high half: ch1
      movhlps   m1, m0                  ; m1 low half = ch1
      punpcklwd m0, m1                  ; interleave words of ch0/ch1
      mova      [dstq+lenq], m0
  %else
      CVTPS2PI  m0, [src0q+lenq  ]
      CVTPS2PI  m1, [src0q+lenq+8]
      CVTPS2PI  m2, [src1q+lenq  ]
      CVTPS2PI  m3, [src1q+lenq+8]
      packssdw  m0, m1                  ; m0 = 4 x int16 of ch0
      packssdw  m2, m3                  ; m2 = 4 x int16 of ch1
      mova      m1, m0
      punpcklwd m0, m2
      punpckhwd m1, m2
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m1
  %endif
      add       lenq, 16
      js        .convert
  %if mmsize == 8
      emms                              ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_XMM sse2
  FLOAT_TO_INT16_INTERLEAVE2
  ; PSWAPD_SSE dst, src
  ; dst = src with its two 32-bit halves exchanged (MMX register), done with
  ; pshufw: selector 0x4e picks words 2,3,0,1.
  %macro PSWAPD_SSE 2
      pshufw    %1, %2, 0x4e
  %endmacro
  ; PSWAPD_3DNOW dst, src
  ; dst = src with its two 32-bit halves exchanged (MMX register), built from
  ; plain MMX instructions. dst and src must be different registers, because
  ; src is read again after dst has been modified.
  %macro PSWAPD_3DNOW 2
      movq      %1, %2
      psrlq     %1, 32                  ; dst.lo = src.hi
      punpckldq %1, %2                  ; dst.hi = src.lo
  %endmacro
  ;------------------------------------------------------------------------------
  ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
  ;
  ; Converts six float channels to saturated int16 and writes them interleaved.
  ; Two samples per channel (24 output bytes) are produced per iteration; the
  ; loop body always runs at least once.
  ;
  ; `pswapd` must resolve to something that swaps the two dwords of an MMX
  ; register: either a %define to one of the PSWAPD_* macros, or the native
  ; instruction.
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE6 0
  cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov       lend, r2d               ; save len before r2 is reused as src1
  %else
      %define lend dword r2m            ; x86-32: keep using len from the stack
  %endif
      mov       src1q, [srcq+1*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src5q, [srcq+5*gprsize]
      mov       srcq,  [srcq]
      ; Turn src1..src5 into offsets relative to channel 0, so that advancing
      ; srcq alone moves all six read positions.
      sub       src1q, srcq
      sub       src2q, srcq
      sub       src3q, srcq
      sub       src4q, srcq
      sub       src5q, srcq
  .convert:
      CVTPS2PI  mm0, [srcq]
      CVTPS2PI  mm1, [srcq+src1q]
      CVTPS2PI  mm2, [srcq+src2q]
      CVTPS2PI  mm3, [srcq+src3q]
      CVTPS2PI  mm4, [srcq+src4q]
      CVTPS2PI  mm5, [srcq+src5q]
      ; Channels a..f, samples 0 and 1, as int16 words:
      packssdw  mm0, mm3                ; a0 a1 d0 d1
      packssdw  mm1, mm4                ; b0 b1 e0 e1
      packssdw  mm2, mm5                ; c0 c1 f0 f1
      pswapd    mm3, mm0                ; d0 d1 a0 a1
      punpcklwd mm0, mm1                ; a0 b0 a1 b1
      punpckhwd mm1, mm2                ; e0 f0 e1 f1
      punpcklwd mm2, mm3                ; c0 d0 c1 d1
      pswapd    mm3, mm0                ; a1 b1 a0 b0
      punpckldq mm0, mm2                ; a0 b0 c0 d0
      punpckhdq mm2, mm1                ; c1 d1 e1 f1
      punpckldq mm1, mm3                ; e0 f0 a1 b1
      movq      [dstq   ], mm0
      movq      [dstq+ 8], mm1
      movq      [dstq+16], mm2
      add       srcq, 8                 ; 2 floats consumed per channel
      add       dstq, 24                ; 12 int16 written
      sub       lend, 2
      jg        .convert
      emms
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

  INIT_MMX sse
  %define pswapd PSWAPD_SSE
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnow
  %define pswapd PSWAPD_3DNOW
  FLOAT_TO_INT16_INTERLEAVE6
  %undef pswapd
  ; With the %define removed, pswapd below is the native instruction.
  INIT_MMX 3dnowext
  FLOAT_TO_INT16_INTERLEAVE6
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves six float channels into dst. mmsize/4 samples per channel are
  ; handled per iteration (2 for MMX, 4 for SSE); the loop body always runs at
  ; least once.
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;
  ; NOTE(review): SBUTTERFLY / SBUTTERFLYPS are defined outside this file; they
  ; are assumed to leave the low/high interleave of their first two register
  ; arguments in those registers, using the third as scratch -- confirm in
  ; x86util.asm before touching the shuffle sequence below.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE6 1
  cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov       lend, r2d               ; save len before r2 is reused as src1
  %else
      %define lend dword r2m            ; x86-32: keep using len from the stack
  %endif
      mov       src1q, [srcq+1*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src5q, [srcq+5*gprsize]
      mov       srcq,  [srcq]
      ; Express channels 1..5 as offsets from channel 0 so only srcq advances.
      sub       src1q, srcq
      sub       src2q, srcq
      sub       src3q, srcq
      sub       src4q, srcq
      sub       src5q, srcq
  .interleave:
  %if cpuflag(sse)
      movaps    m0, [srcq]
      movaps    m1, [srcq+src1q]
      movaps    m2, [srcq+src2q]
      movaps    m3, [srcq+src3q]
      movaps    m4, [srcq+src4q]
      movaps    m5, [srcq+src5q]

      SBUTTERFLYPS 0, 1, 6
      SBUTTERFLYPS 2, 3, 6
      SBUTTERFLYPS 4, 5, 6

      ; First half of the output (48 bytes), built from m0, m2 and m4.
      movaps    m6, m4
      shufps    m4, m0, 0xe4            ; low qword of m4 | high qword of m0
      movlhps   m0, m2                  ; low qword of m0 | low qword of m2
      movhlps   m6, m2                  ; high qword of m2 | high qword of m4
      movaps    [dstq   ], m0
      movaps    [dstq+16], m4
      movaps    [dstq+32], m6

      ; Second half of the output, same pattern on m1, m3 and m5.
      movaps    m6, m5
      shufps    m5, m1, 0xe4
      movlhps   m1, m3
      movhlps   m6, m3
      movaps    [dstq+48], m1
      movaps    [dstq+64], m5
      movaps    [dstq+80], m6
  %else ; mmx
      movq      m0, [srcq]
      movq      m1, [srcq+src1q]
      movq      m2, [srcq+src2q]
      movq      m3, [srcq+src3q]
      movq      m4, [srcq+src4q]
      movq      m5, [srcq+src5q]

      SBUTTERFLY dq, 0, 1, 6
      SBUTTERFLY dq, 2, 3, 6
      SBUTTERFLY dq, 4, 5, 6

      movq      [dstq   ], m0
      movq      [dstq+ 8], m2
      movq      [dstq+16], m4
      movq      [dstq+24], m1
      movq      [dstq+32], m3
      movq      [dstq+40], m5
  %endif
      add       srcq, mmsize
      add       dstq, mmsize*6
      sub       lend, mmsize/4
      jg        .interleave
  %if mmsize == 8
      emms                              ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX mmx
  FLOAT_INTERLEAVE6 0
  INIT_XMM sse
  FLOAT_INTERLEAVE6 7
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves two float channels: dst = s0[0] s1[0] s0[1] s1[1] ...
  ; Two vectors per channel (mmsize/2 samples) are handled per iteration; the
  ; loop body always runs at least once.
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ; PUNPCKLDQ / PUNPCKHDQ must be %defined to the low/high dword-interleave
  ; instruction matching the register type before this macro is expanded.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE2 1
  cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
      mov       src1q, [srcq+gprsize]   ; src[1]
      mov       srcq,  [srcq]           ; src[0]
      sub       src1q, srcq             ; src1q = offset of channel 1 from channel 0
  .interleave:
      ; first vector of each channel
      mova      m0, [srcq]
      mova      m1, [srcq+src1q]
      mova      m2, m0
      PUNPCKLDQ m0, m1
      PUNPCKHDQ m2, m1
      ; second vector of each channel
      mova      m3, [srcq+mmsize]
      mova      m4, [srcq+src1q+mmsize]
      mova      m1, m3
      PUNPCKLDQ m3, m4
      PUNPCKHDQ m1, m4
      mova      [dstq         ], m0
      mova      [dstq+1*mmsize], m2
      mova      [dstq+2*mmsize], m3
      mova      [dstq+3*mmsize], m1
      add       srcq, mmsize*2
      add       dstq, mmsize*4
      sub       lend, mmsize/2
      jg        .interleave
  %if mmsize == 8
      emms                              ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX mmx
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 0
  INIT_XMM sse
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 5