You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

433 lines
11KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_TEXT
  ;------------------------------------------------------------------------------
  ; CVTPS2PI dst, src
  ;
  ; Convert the two packed single-precision floats in %2 to two packed int32
  ; values in %1, picking the instruction from the active cpuflags:
  ;   sse   -> cvtps2pi
  ;   3dnow -> pf2id
  ; NOTE(review): the two instructions are not documented to round the same way
  ; (cvtps2pi follows the MXCSR rounding mode, pf2id truncates) - confirm that
  ; callers tolerate the difference.
  ;
  ; Any other instruction set is rejected at assembly time; without the guard the
  ; macro would expand to nothing and the conversion would silently vanish.
  ;------------------------------------------------------------------------------
  %macro CVTPS2PI 2
  %if cpuflag(sse)
      cvtps2pi  %1, %2
  %elif cpuflag(3dnow)
      pf2id     %1, %2
  %else
      %error CVTPS2PI requires sse or 3dnow
  %endif
  %endmacro
  ;---------------------------------------------------------------------------------
  ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
  ;
  ; dst[i] = (float)src[i] * mul
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers declared to cglobal
  ;
  ; The loop consumes 32 bytes (8 elements) per pass and is entered
  ; unconditionally, so len is expected to be a positive multiple of 8.
  ; dst is written with mova (aligned stores).
  ;
  ; Where 'mul' arrives:
  ;   UNIX64 : already in xmm0, so only dst/src/len are GPR arguments
  ;   WIN64  : third argument slot (xmm2); SWAP renames it to m0
  ;   x86-32 : on the stack, loaded through mulm
  ;---------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 1
  %if UNIX64
  cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
  %else
  cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
  %endif
  %if WIN64
      SWAP        0, 2
  %elif ARCH_X86_32
      movss       m0, mulm
  %endif
      SPLATD      m0                      ; broadcast mul to all four lanes

      ; len is a 32-bit int: on x86-64 the upper half of its register is not
      ; guaranteed to be zero. Scaling through the 32-bit name zero-extends the
      ; result into lenq, so the pointer arithmetic below sees a clean value.
      shl         lend, 2                 ; len *= sizeof(int32/float)
      add         srcq, lenq              ; point both buffers at their end ...
      add         dstq, lenq
      neg         lenq                    ; ... and count a negative offset up to 0
  .loop:
  %if cpuflag(sse2)
      cvtdq2ps    m1, [srcq+lenq   ]
      cvtdq2ps    m2, [srcq+lenq+16]
  %else
      ; SSE1 has no packed int32->float for full XMM registers: convert two
      ; elements at a time and merge the halves with movlhps.
      cvtpi2ps    m1, [srcq+lenq   ]
      cvtpi2ps    m3, [srcq+lenq+ 8]
      cvtpi2ps    m2, [srcq+lenq+16]
      cvtpi2ps    m4, [srcq+lenq+24]
      movlhps     m1, m3
      movlhps     m2, m4
  %endif
      mulps       m1, m0
      mulps       m2, m0
      mova        [dstq+lenq   ], m1
      mova        [dstq+lenq+16], m2
      add         lenq, 32
      jl          .loop
      REP_RET
  %endmacro

  INIT_XMM sse
  %define SPLATD SPLATD_SSE
  INT32_TO_FLOAT_FMUL_SCALAR 5
  INIT_XMM sse2
  %define SPLATD SPLATD_SSE2
  INT32_TO_FLOAT_FMUL_SCALAR 3
  %undef SPLATD
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;
  ; Converts len floats to saturated int16 samples (packssdw saturates).
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers declared to cglobal
  ;
  ; Eight samples (16 output bytes) are produced per pass and the loop body runs
  ; at least once, so len is expected to be a positive multiple of 8.
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16 1
  cglobal float_to_int16, 3, 3, %1, dst, src, len
      add         lenq, lenq              ; lenq = output size in bytes
      add         dstq, lenq              ; dst -> end of output
      lea         srcq, [srcq+2*lenq]     ; src -> end of input (4 bytes/sample)
      neg         lenq                    ; negative byte offset, counts up to 0
  .loop:
  %if cpuflag(sse2)
      cvtps2dq    m0, [srcq+2*lenq   ]
      cvtps2dq    m1, [srcq+2*lenq+16]
      packssdw    m0, m1
      mova        [dstq+lenq], m0
  %else
      ; 64-bit registers: two floats per conversion, four conversions per pass
      CVTPS2PI    m0, [srcq+2*lenq   ]
      CVTPS2PI    m1, [srcq+2*lenq+ 8]
      packssdw    m0, m1
      CVTPS2PI    m2, [srcq+2*lenq+16]
      CVTPS2PI    m3, [srcq+2*lenq+24]
      packssdw    m2, m3
      mova        [dstq+lenq  ], m0
      mova        [dstq+lenq+8], m2
  %endif
      add         lenq, 16
      js          .loop
  %if mmsize == 8
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM sse2
  FLOAT_TO_INT16 2
  INIT_MMX sse
  FLOAT_TO_INT16 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16 0
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
  ;
  ; Like float_to_int16, but consecutive output samples are written 'step'
  ; int16 elements apart (byte stride = 2*step).
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers declared to cglobal
  ;
  ; Eight samples are converted per pass and the loop body runs at least once,
  ; so len is expected to be a positive multiple of 8.
  ;------------------------------------------------------------------------------

  ; Helper: write four samples and advance dst by four steps.
  ;   in : v1d = samples 0 (low word) and 1 (high word)
  ;        v2d = samples 2 (low word) and 3 (high word)
  ;   out: dstq += 4 steps; v1d and v2d are shifted right by 16
  %macro FLOAT_TO_INT16_STEP_STORE4 0
      mov         [dstq],          v1w    ; sample 0
      mov         [dstq+stepq*4],  v2w    ; sample 2
      shr         v1d, 16
      shr         v2d, 16
      mov         [dstq+stepq*2],  v1w    ; sample 1
      mov         [dstq+step3q*2], v2w    ; sample 3
      lea         dstq, [dstq+stepq*8]
  %endmacro

  %macro FLOAT_TO_INT16_STEP 1
  cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
      add         lenq, lenq              ; lenq = 2*len
      lea         srcq, [srcq+2*lenq]     ; src -> end of input (4 bytes/sample)
      lea         step3q, [stepq*3]
      neg         lenq                    ; negative offset, counts up to 0
  .loop:
  %if cpuflag(sse2)
      cvtps2dq    m0, [srcq+2*lenq   ]
      cvtps2dq    m1, [srcq+2*lenq+16]
      packssdw    m0, m1                  ; m0 = eight int16 samples

      ; samples 0-3
      movd        v1d, m0
      psrldq      m0, 4
      movd        v2d, m0
      psrldq      m0, 4
      FLOAT_TO_INT16_STEP_STORE4

      ; samples 4-7
      movd        v1d, m0
      psrldq      m0, 4
      movd        v2d, m0
      FLOAT_TO_INT16_STEP_STORE4
  %else
      CVTPS2PI    m0, [srcq+2*lenq   ]
      CVTPS2PI    m1, [srcq+2*lenq+ 8]
      CVTPS2PI    m2, [srcq+2*lenq+16]
      CVTPS2PI    m3, [srcq+2*lenq+24]
      packssdw    m0, m1                  ; m0 = samples 0-3
      packssdw    m2, m3                  ; m2 = samples 4-7

      ; samples 0-3
      movd        v1d, m0
      psrlq       m0, 32
      movd        v2d, m0
      FLOAT_TO_INT16_STEP_STORE4

      ; samples 4-7
      movd        v1d, m2
      psrlq       m2, 32
      movd        v2d, m2
      FLOAT_TO_INT16_STEP_STORE4
  %endif
      add         lenq, 16
      js          .loop
  %if mmsize == 8
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM sse2
  FLOAT_TO_INT16_STEP 2
  INIT_MMX sse
  FLOAT_TO_INT16_STEP 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16_STEP 0
  ;-------------------------------------------------------------------------------
  ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  ;
  ; Converts two float channels (src[0], src[1]) to saturated int16 and writes
  ; them interleaved: dst = a0 b0 a1 b1 ...
  ;
  ; Register naming: the third argument register (r2) arrives holding len but is
  ; named src1, because it is reused for the second channel pointer once len*4
  ; has been moved into lenq.
  ;
  ; Four samples per channel are handled per pass and the loop body runs at
  ; least once, so len is expected to be a positive multiple of 4.
  ;-------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE2 0
  cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
      lea         lenq, [4*r2q]           ; bytes per input channel (= output bytes)
      mov         src1q, [src0q+gprsize]  ; src[1]
      mov         src0q, [src0q]          ; src[0]
      add         dstq, lenq              ; point all three buffers at their end
      add         src0q, lenq
      add         src1q, lenq
      neg         lenq                    ; negative offset, counts up to 0
  .loop:
  %if cpuflag(sse2)
      cvtps2dq    m0, [src0q+lenq]
      cvtps2dq    m1, [src1q+lenq]
      packssdw    m0, m1                  ; m0 = a0 a1 a2 a3 | b0 b1 b2 b3
      movhlps     m1, m0                  ; m1 low half = b0 b1 b2 b3
      punpcklwd   m0, m1                  ; m0 = a0 b0 a1 b1 a2 b2 a3 b3
      mova        [dstq+lenq], m0
  %else
      CVTPS2PI    m0, [src0q+lenq  ]
      CVTPS2PI    m1, [src0q+lenq+8]
      CVTPS2PI    m2, [src1q+lenq  ]
      CVTPS2PI    m3, [src1q+lenq+8]
      packssdw    m0, m1                  ; m0 = a0 a1 a2 a3
      packssdw    m2, m3                  ; m2 = b0 b1 b2 b3
      mova        m1, m0
      punpcklwd   m0, m2                  ; m0 = a0 b0 a1 b1
      punpckhwd   m1, m2                  ; m1 = a2 b2 a3 b3
      mova        [dstq+lenq  ], m0
      mova        [dstq+lenq+8], m1
  %endif
      add         lenq, 16
      js          .loop
  %if mmsize == 8
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_XMM sse2
  FLOAT_TO_INT16_INTERLEAVE2
  ;------------------------------------------------------------------------------
  ; void float_to_int16_interleave6(int16_t *dst, const float **src, int len)
  ;
  ; Converts six float channels to saturated int16 and writes them interleaved:
  ;   dst = a0 b0 c0 d0 e0 f0 a1 b1 c1 d1 e1 f1 ...
  ;
  ; Two samples per channel (24 output bytes) are produced per pass and the loop
  ; body runs at least once.
  ;
  ; len is the third argument: on x86-64 it is copied from r2 into its own
  ; register before r2 is reused as src1; on x86-32 it is counted down in place
  ; in its stack slot (r2m).
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE6 0
  cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov         lend, r2d
  %else
      %define lend dword r2m
  %endif
      ; Fetch the channel pointers, then turn channels 1-5 into offsets relative
      ; to channel 0 so that only srcq has to be advanced inside the loop.
      mov         src1q, [srcq+1*gprsize]
      mov         src2q, [srcq+2*gprsize]
      mov         src3q, [srcq+3*gprsize]
      mov         src4q, [srcq+4*gprsize]
      mov         src5q, [srcq+5*gprsize]
      mov         srcq,  [srcq]
      sub         src1q, srcq
      sub         src2q, srcq
      sub         src3q, srcq
      sub         src4q, srcq
      sub         src5q, srcq
  .loop:
      CVTPS2PI    mm0, [srcq]             ; a0 a1 (int32)
      CVTPS2PI    mm1, [srcq+src1q]       ; b0 b1
      CVTPS2PI    mm2, [srcq+src2q]       ; c0 c1
      CVTPS2PI    mm3, [srcq+src3q]       ; d0 d1
      CVTPS2PI    mm4, [srcq+src4q]       ; e0 e1
      CVTPS2PI    mm5, [srcq+src5q]       ; f0 f1
      packssdw    mm0, mm3                ; a0 a1 d0 d1 (int16)
      packssdw    mm1, mm4                ; b0 b1 e0 e1
      packssdw    mm2, mm5                ; c0 c1 f0 f1
      PSWAPD      mm3, mm0                ; d0 d1 a0 a1
      punpcklwd   mm0, mm1                ; a0 b0 a1 b1
      punpckhwd   mm1, mm2                ; e0 f0 e1 f1
      punpcklwd   mm2, mm3                ; c0 d0 c1 d1
      PSWAPD      mm3, mm0                ; a1 b1 a0 b0
      punpckldq   mm0, mm2                ; a0 b0 c0 d0
      punpckhdq   mm2, mm1                ; c1 d1 e1 f1
      punpckldq   mm1, mm3                ; e0 f0 a1 b1
      movq        [dstq   ], mm0
      movq        [dstq+16], mm2
      movq        [dstq+ 8], mm1
      add         srcq, 8
      add         dstq, 24
      sub         lend, 2
      jg          .loop
      emms                                ; leave MMX state before returning
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnowext
  FLOAT_TO_INT16_INTERLEAVE6
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves six float channels without conversion:
  ;   dst = a0 b0 c0 d0 e0 f0 a1 b1 c1 d1 e1 f1 ...
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers declared to cglobal
  ;
  ; Each pass consumes mmsize/4 samples per channel (2 for MMX, 4 for SSE) and
  ; the loop body runs at least once. The SSE path uses movaps, i.e. aligned
  ; loads and stores.
  ;
  ; len is the third argument: on x86-64 it is copied from r2 into its own
  ; register before r2 is reused as src1; on x86-32 it is counted down in place
  ; in its stack slot (r2m).
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE6 1
  cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov         lend, r2d
  %else
      %define lend dword r2m
  %endif
      ; Fetch the channel pointers, then turn channels 1-5 into offsets relative
      ; to channel 0 so that only srcq has to be advanced inside the loop.
      mov         src1q, [srcq+1*gprsize]
      mov         src2q, [srcq+2*gprsize]
      mov         src3q, [srcq+3*gprsize]
      mov         src4q, [srcq+4*gprsize]
      mov         src5q, [srcq+5*gprsize]
      mov         srcq,  [srcq]
      sub         src1q, srcq
      sub         src2q, srcq
      sub         src3q, srcq
      sub         src4q, srcq
      sub         src5q, srcq
  .loop:
  %if cpuflag(sse)
      movaps      m0, [srcq]              ; a0 a1 a2 a3
      movaps      m1, [srcq+src1q]        ; b0 b1 b2 b3
      movaps      m2, [srcq+src2q]        ; c0 c1 c2 c3
      movaps      m3, [srcq+src3q]        ; d0 d1 d2 d3
      movaps      m4, [srcq+src4q]        ; e0 e1 e2 e3
      movaps      m5, [srcq+src5q]        ; f0 f1 f2 f3

      SBUTTERFLYPS 0, 1, 6                ; m0 = a0 b0 a1 b1   m1 = a2 b2 a3 b3
      SBUTTERFLYPS 2, 3, 6                ; m2 = c0 d0 c1 d1   m3 = c2 d2 c3 d3
      SBUTTERFLYPS 4, 5, 6                ; m4 = e0 f0 e1 f1   m5 = e2 f2 e3 f3

      ; frames 0 and 1
      movaps      m6, m4
      shufps      m4, m0, 0xe4            ; e0 f0 a1 b1
      movlhps     m0, m2                  ; a0 b0 c0 d0
      movhlps     m6, m2                  ; c1 d1 e1 f1
      movaps      [dstq   ], m0
      movaps      [dstq+16], m4
      movaps      [dstq+32], m6

      ; frames 2 and 3
      movaps      m6, m5
      shufps      m5, m1, 0xe4            ; e2 f2 a3 b3
      movlhps     m1, m3                  ; a2 b2 c2 d2
      movhlps     m6, m3                  ; c3 d3 e3 f3
      movaps      [dstq+48], m1
      movaps      [dstq+64], m5
      movaps      [dstq+80], m6
  %else ; mmx
      movq        m0, [srcq]              ; a0 a1
      movq        m1, [srcq+src1q]        ; b0 b1
      movq        m2, [srcq+src2q]        ; c0 c1
      movq        m3, [srcq+src3q]        ; d0 d1
      movq        m4, [srcq+src4q]        ; e0 e1
      movq        m5, [srcq+src5q]        ; f0 f1

      SBUTTERFLY  dq, 0, 1, 6             ; m0 = a0 b0   m1 = a1 b1
      SBUTTERFLY  dq, 2, 3, 6             ; m2 = c0 d0   m3 = c1 d1
      SBUTTERFLY  dq, 4, 5, 6             ; m4 = e0 f0   m5 = e1 f1

      movq        [dstq   ], m0
      movq        [dstq+ 8], m2
      movq        [dstq+16], m4
      movq        [dstq+24], m1
      movq        [dstq+32], m3
      movq        [dstq+40], m5
  %endif
      add         srcq, mmsize
      add         dstq, mmsize*6
      sub         lend, mmsize/4
      jg          .loop
  %if mmsize == 8
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX mmx
  FLOAT_INTERLEAVE6 0
  INIT_XMM sse
  FLOAT_INTERLEAVE6 7
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves two float channels without conversion: dst = a0 b0 a1 b1 ...
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers declared to cglobal
  ;
  ; The caller must define PUNPCKLDQ / PUNPCKHDQ as the low/high dword
  ; interleave instruction for the selected register type before expanding the
  ; macro (punpck?dq for MMX, unpck?ps for SSE).
  ;
  ; Each pass consumes mmsize/2 samples per channel (4 for MMX, 8 for SSE) and
  ; the loop body runs at least once. All loads and stores use mova (aligned).
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE2 1
  cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
      mov         src1q, [srcq+gprsize]   ; src[1]
      mov         srcq,  [srcq]           ; src[0]
      sub         src1q, srcq             ; keep channel 1 as an offset from channel 0
  .loop:
      mova        m0, [srcq             ]
      mova        m1, [srcq+src1q       ]
      mova        m3, [srcq      +mmsize]
      mova        m4, [srcq+src1q+mmsize]

      mova        m2, m0
      PUNPCKLDQ   m0, m1                  ; low halves of the first vector pair
      PUNPCKHDQ   m2, m1                  ; high halves of the first vector pair

      mova        m1, m3
      PUNPCKLDQ   m3, m4                  ; low halves of the second vector pair
      PUNPCKHDQ   m1, m4                  ; high halves of the second vector pair

      mova        [dstq         ], m0
      mova        [dstq+1*mmsize], m2
      mova        [dstq+2*mmsize], m3
      mova        [dstq+3*mmsize], m1

      add         srcq, mmsize*2
      add         dstq, mmsize*4
      sub         lend, mmsize/2
      jg          .loop
  %if mmsize == 8
      emms                                ; leave MMX state before returning
  %endif
      REP_RET
  %endmacro

  INIT_MMX mmx
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 0
  INIT_XMM sse
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 5
  ; Drop the per-instantiation aliases so they cannot leak into later code
  ; (same hygiene as the %undef of SPLATD earlier in this file).
  %undef PUNPCKLDQ
  %undef PUNPCKHDQ