You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

430 lines
11KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of FFmpeg.
  6. ;*
  7. ;* FFmpeg is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* FFmpeg is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with FFmpeg; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_TEXT
  ;------------------------------------------------------------------------------
  ; CVTPS2PI dst, src
  ;
  ; Emits the packed float -> int32 conversion matching the instruction set
  ; selected by the current INIT_* invocation.
  ;   %1 = destination register
  ;   %2 = source operand (register or memory)
  ; With the sse flag set this is cvtps2pi; otherwise, with the 3dnow flag set,
  ; it is pf2id.  If neither flag is set the macro expands to nothing.
  ; NOTE(review): per the ISA manuals pf2id truncates while cvtps2pi rounds
  ; according to MXCSR, so the two flavours can differ by one LSB -- confirm
  ; callers accept that.
  ;------------------------------------------------------------------------------
  %macro CVTPS2PI 2
  %if cpuflag(sse)
      cvtps2pi    %1, %2
  %else
    %if cpuflag(3dnow)
      pf2id       %1, %2
    %endif
  %endif
  %endmacro
  ;---------------------------------------------------------------------------------
  ; void int32_to_float_fmul_scalar(float *dst, const int *src, float mul, int len);
  ;
  ; dst[i] = (float)src[i] * mul
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;
  ; Each loop iteration converts 8 elements (32 bytes) and the loop body always
  ; executes at least once, so len is effectively treated as a non-zero
  ; multiple of 8.  dst is written with mova.
  ; NOTE(review): the memory operands presumably require 16-byte aligned
  ; src/dst in the XMM builds -- confirm against the callers.
  ;---------------------------------------------------------------------------------
  %macro INT32_TO_FLOAT_FMUL_SCALAR 1
  %if UNIX64
  ; the float argument is not counted among the GPR arguments on UNIX64
  cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
  %else
  cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
  %endif
  %if WIN64
      SWAP          0, 2                  ; make the register holding mul act as m0
  %elif ARCH_X86_32
      movss        m0, mulm               ; mul is read from its stack slot
  %endif
      SPLATD       m0                     ; broadcast mul to every lane of m0
      ; len is a 32-bit int.  Shift the 32-bit register: the write zero-extends
      ; into lenq on x86-64, so stale upper bits left by the caller can no
      ; longer corrupt the pointer bias or the loop counter below.
      shl        lend, 2                  ; len *= sizeof(int32_t) == sizeof(float)
      add        srcq, lenq               ; point src and dst at the end of the
      add        dstq, lenq               ;   buffers and ...
      neg        lenq                     ; ... index upwards from -len*4 to 0
  .loop:
  %if cpuflag(sse2)
      cvtdq2ps     m1, [srcq+lenq   ]     ; 4 ints -> 4 floats
      cvtdq2ps     m2, [srcq+lenq+16]
  %else
      ; SSE1 only converts two ints at a time (into the low half of the
      ; destination), so build each 4-float vector from two halves.
      cvtpi2ps     m1, [srcq+lenq   ]
      cvtpi2ps     m3, [srcq+lenq+ 8]
      cvtpi2ps     m2, [srcq+lenq+16]
      cvtpi2ps     m4, [srcq+lenq+24]
      movlhps      m1, m3
      movlhps      m2, m4
  %endif
      mulps        m1, m0
      mulps        m2, m0
      mova  [dstq+lenq   ], m1
      mova  [dstq+lenq+16], m2
      add        lenq, 32                 ; 8 elements done
      jl .loop
      REP_RET
  %endmacro

  INIT_XMM sse
  INT32_TO_FLOAT_FMUL_SCALAR 5
  INIT_XMM sse2
  INT32_TO_FLOAT_FMUL_SCALAR 3
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  ;
  ; Converts floats to int32 and packs them to int16 with signed saturation
  ; (packssdw).
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;
  ; Every iteration handles 8 samples (32 source bytes, 16 destination bytes)
  ; and the loop body runs at least once, so len is effectively treated as a
  ; non-zero multiple of 8.
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16 1
  cglobal float_to_int16, 3, 3, %1, dst, src, len
      add        lenq, lenq               ; lenq = len * sizeof(int16_t)
      add        dstq, lenq               ; dst -> end of output
      lea        srcq, [srcq+2*lenq]      ; src -> end of input (len * 4 bytes)
      neg        lenq                     ; negative byte index, counts up to 0
  .loop:
  %if cpuflag(sse2)
      cvtps2dq     m0, [srcq+2*lenq   ]
      cvtps2dq     m1, [srcq+2*lenq+16]
      packssdw     m0, m1
      mova  [dstq+lenq], m0
  %else
      ; MMX registers hold two int32 each: four conversions, two packs.
      CVTPS2PI     m0, [srcq+2*lenq   ]
      CVTPS2PI     m1, [srcq+2*lenq+ 8]
      CVTPS2PI     m2, [srcq+2*lenq+16]
      CVTPS2PI     m3, [srcq+2*lenq+24]
      packssdw     m0, m1
      packssdw     m2, m3
      mova  [dstq+lenq  ], m0
      mova  [dstq+lenq+8], m2
  %endif
      add        lenq, 16                 ; 8 int16 written
      js .loop
  %if mmsize == 8
      emms                                ; MMX build: emitted before returning
  %endif
      REP_RET
  %endmacro

  INIT_XMM sse2
  FLOAT_TO_INT16 2
  INIT_MMX sse
  FLOAT_TO_INT16 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16 0
  ;------------------------------------------------------------------------------
  ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
  ;
  ; Same conversion as float_to_int16, but consecutive output samples are
  ; written `step` int16 elements apart: dst[i*step] = saturate16(src[i]).
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;
  ; Every iteration handles 8 samples and the loop body runs at least once,
  ; so len is effectively treated as a non-zero multiple of 8.
  ;------------------------------------------------------------------------------

  ; Helper: v1d holds samples 0 (low word) and 1 (high word), v2d holds samples
  ; 2 and 3.  Store them at element offsets 0, step, 2*step and 3*step from
  ; dst, then advance dst by 4*step elements.  Shifts v1d/v2d in the process.
  %macro FLOAT_TO_INT16_STEP_STORE4 0
      mov  [dstq],           v1w          ; sample 0
      mov  [dstq+stepq*4],   v2w          ; sample 2 (2*step elements = 4*step bytes)
      shr         v1d, 16
      shr         v2d, 16
      mov  [dstq+stepq*2],   v1w          ; sample 1
      mov  [dstq+step3q*2],  v2w          ; sample 3
      lea        dstq, [dstq+stepq*8]     ; dst += 4*step elements
  %endmacro

  %macro FLOAT_TO_INT16_STEP 1
  cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
      lea      step3q, [stepq*3]
      add        lenq, lenq               ; lenq = len * 2
      lea        srcq, [srcq+2*lenq]      ; src -> end of input (len * 4 bytes)
      neg        lenq                     ; src is indexed with 2*lenq below
  .loop:
  %if cpuflag(sse2)
      cvtps2dq     m0, [srcq+2*lenq   ]
      cvtps2dq     m1, [srcq+2*lenq+16]
      packssdw     m0, m1                 ; m0 = 8 saturated int16
      ; samples 0..3
      movd        v1d, m0
      psrldq       m0, 4
      movd        v2d, m0
      psrldq       m0, 4
      FLOAT_TO_INT16_STEP_STORE4
      ; samples 4..7
      movd        v1d, m0
      psrldq       m0, 4
      movd        v2d, m0
      FLOAT_TO_INT16_STEP_STORE4
  %else
      CVTPS2PI     m0, [srcq+2*lenq   ]
      CVTPS2PI     m1, [srcq+2*lenq+ 8]
      CVTPS2PI     m2, [srcq+2*lenq+16]
      CVTPS2PI     m3, [srcq+2*lenq+24]
      packssdw     m0, m1                 ; m0 = samples 0..3
      packssdw     m2, m3                 ; m2 = samples 4..7
      ; samples 0..3
      movd        v1d, m0
      psrlq        m0, 32
      movd        v2d, m0
      FLOAT_TO_INT16_STEP_STORE4
      ; samples 4..7
      movd        v1d, m2
      psrlq        m2, 32
      movd        v2d, m2
      FLOAT_TO_INT16_STEP_STORE4
  %endif
      add        lenq, 16                 ; 8 samples consumed
      js .loop
  %if mmsize == 8
      emms
  %endif
      REP_RET
  %endmacro

  INIT_XMM sse2
  FLOAT_TO_INT16_STEP 2
  INIT_MMX sse
  FLOAT_TO_INT16_STEP 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16_STEP 0
  ;-------------------------------------------------------------------------------
  ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  ;
  ; Converts two float channels (src[0], src[1]) to saturated int16 and writes
  ; them interleaved: dst[2*i] = src[0][i], dst[2*i+1] = src[1][i].
  ;
  ; Every iteration handles 4 samples per channel (16 bytes in, 16 bytes out)
  ; and the loop body runs at least once, so len is effectively treated as a
  ; non-zero multiple of 4.
  ;-------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE2 0
  cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
      ; The third argument (len) arrives in r2, which is also named src1:
      ; scale it into lenq before src1q is overwritten.
      lea        lenq, [4*r2q]            ; bytes per channel == bytes of output
      mov       src1q, [src0q+gprsize]    ; src[1]
      mov       src0q, [src0q]            ; src[0]
      add       src0q, lenq               ; bias all three pointers to the end
      add       src1q, lenq
      add        dstq, lenq
      neg        lenq                     ; negative byte index, counts up to 0
  .loop:
  %if cpuflag(sse2)
      cvtps2dq     m0, [src0q+lenq]
      cvtps2dq     m1, [src1q+lenq]
      packssdw     m0, m1                 ; m0 = a0 a1 a2 a3 b0 b1 b2 b3
      movhlps      m1, m0                 ; m1 low half = b0 b1 b2 b3
      punpcklwd    m0, m1                 ; m0 = a0 b0 a1 b1 a2 b2 a3 b3
      mova  [dstq+lenq], m0
  %else
      CVTPS2PI     m0, [src0q+lenq  ]
      CVTPS2PI     m1, [src0q+lenq+8]
      CVTPS2PI     m2, [src1q+lenq  ]
      CVTPS2PI     m3, [src1q+lenq+8]
      packssdw     m0, m1                 ; m0 = a0 a1 a2 a3
      packssdw     m2, m3                 ; m2 = b0 b1 b2 b3
      mova         m1, m0
      punpcklwd    m0, m2                 ; m0 = a0 b0 a1 b1
      punpckhwd    m1, m2                 ; m1 = a2 b2 a3 b3
      mova  [dstq+lenq  ], m0
      mova  [dstq+lenq+8], m1
  %endif
      add        lenq, 16
      js .loop
  %if mmsize == 8
      emms
  %endif
      REP_RET
  %endmacro

  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_XMM sse2
  FLOAT_TO_INT16_INTERLEAVE2
  ;------------------------------------------------------------------------------
  ; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
  ;
  ; Converts six float channels (src[0..5]) to saturated int16 and writes them
  ; interleaved, two samples per channel per iteration (12 int16 = 24 bytes).
  ; The loop body runs at least once and len is decremented by 2 each pass.
  ;
  ; Lane notation in the comments: a..f = channels 0..5, digit = sample index,
  ; lanes listed from least to most significant.
  ;------------------------------------------------------------------------------
  %macro FLOAT_TO_INT16_INTERLEAVE6 0
  cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov        lend, r2d                ; save len before r2 is reused as src1
  %else
      %define lend dword r2m              ; 32-bit build: len is used from memory
  %endif
      ; Fetch the channel pointers; src[0] is loaded last because srcq is
      ; still needed as the base of the pointer array.
      mov       src1q, [srcq+1*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src5q, [srcq+5*gprsize]
      mov        srcq, [srcq]
      ; Turn src1..src5 into offsets from src[0] so that one pointer
      ; increment advances all six channels.
      sub       src1q, srcq
      sub       src2q, srcq
      sub       src3q, srcq
      sub       src4q, srcq
      sub       src5q, srcq
  .loop:
      CVTPS2PI    mm0, [srcq]
      CVTPS2PI    mm1, [srcq+src1q]
      CVTPS2PI    mm2, [srcq+src2q]
      CVTPS2PI    mm3, [srcq+src3q]
      CVTPS2PI    mm4, [srcq+src4q]
      CVTPS2PI    mm5, [srcq+src5q]
      packssdw    mm0, mm3                ; mm0 = a0 a1 d0 d1
      packssdw    mm1, mm4                ; mm1 = b0 b1 e0 e1
      packssdw    mm2, mm5                ; mm2 = c0 c1 f0 f1
      PSWAPD      mm3, mm0                ; mm3 = d0 d1 a0 a1
      punpcklwd   mm0, mm1                ; mm0 = a0 b0 a1 b1
      punpckhwd   mm1, mm2                ; mm1 = e0 f0 e1 f1
      punpcklwd   mm2, mm3                ; mm2 = c0 d0 c1 d1
      PSWAPD      mm3, mm0                ; mm3 = a1 b1 a0 b0
      punpckldq   mm0, mm2                ; mm0 = a0 b0 c0 d0
      punpckhdq   mm2, mm1                ; mm2 = c1 d1 e1 f1
      punpckldq   mm1, mm3                ; mm1 = e0 f0 a1 b1
      movq  [dstq   ], mm0
      movq  [dstq+16], mm2
      movq  [dstq+ 8], mm1
      add        srcq, 8                  ; 2 floats per channel
      add        dstq, 24                 ; 12 int16
      sub        lend, 2
      jg .loop
      emms
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6

  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnowext
  FLOAT_TO_INT16_INTERLEAVE6
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves six float channels (src[0..5]) into dst without converting
  ; them.  Each iteration consumes mmsize bytes per channel (mmsize/4 floats)
  ; and writes 6*mmsize bytes; the loop body runs at least once.
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;
  ; Lane notation in the comments: a..f = channels 0..5, digit = sample index.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE6 1
  cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov        lend, r2d                ; save len before r2 is reused as src1
  %else
      %define lend dword r2m              ; 32-bit build: len is used from memory
  %endif
      ; Load the channel pointers (src[0] last, srcq is the array base until then)
      mov       src1q, [srcq+1*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src5q, [srcq+5*gprsize]
      mov        srcq, [srcq]
      ; ... and express src1..src5 as offsets relative to src[0].
      sub       src1q, srcq
      sub       src2q, srcq
      sub       src3q, srcq
      sub       src4q, srcq
      sub       src5q, srcq
  .loop:
  %if cpuflag(sse)
      movaps       m0, [srcq]
      movaps       m1, [srcq+src1q]
      movaps       m2, [srcq+src2q]
      movaps       m3, [srcq+src3q]
      movaps       m4, [srcq+src4q]
      movaps       m5, [srcq+src5q]

      ; Pairwise interleave of channels (a,b), (c,d), (e,f); m6 is scratch.
      ; The lane comments below assume SBUTTERFLYPS leaves the low interleave
      ; in its first register and the high one in its second -- TODO confirm
      ; against x86util.asm.
      SBUTTERFLYPS 0, 1, 6                ; m0 = a0 b0 a1 b1, m1 = a2 b2 a3 b3
      SBUTTERFLYPS 2, 3, 6                ; m2 = c0 d0 c1 d1, m3 = c2 d2 c3 d3
      SBUTTERFLYPS 4, 5, 6                ; m4 = e0 f0 e1 f1, m5 = e2 f2 e3 f3

      ; samples 0 and 1
      movaps       m6, m4
      shufps       m4, m0, 0xe4           ; m4 = e0 f0 a1 b1
      movlhps      m0, m2                 ; m0 = a0 b0 c0 d0
      movhlps      m6, m2                 ; m6 = c1 d1 e1 f1
      movaps [dstq   ], m0
      movaps [dstq+16], m4
      movaps [dstq+32], m6

      ; samples 2 and 3, same shuffle on the high halves
      movaps       m6, m5
      shufps       m5, m1, 0xe4
      movlhps      m1, m3
      movhlps      m6, m3
      movaps [dstq+48], m1
      movaps [dstq+64], m5
      movaps [dstq+80], m6
  %else ; mmx
      movq         m0, [srcq]
      movq         m1, [srcq+src1q]
      movq         m2, [srcq+src2q]
      movq         m3, [srcq+src3q]
      movq         m4, [srcq+src4q]
      movq         m5, [srcq+src5q]

      SBUTTERFLY dq, 0, 1, 6
      SBUTTERFLY dq, 2, 3, 6
      SBUTTERFLY dq, 4, 5, 6
      movq  [dstq   ], m0
      movq  [dstq+ 8], m2
      movq  [dstq+16], m4
      movq  [dstq+24], m1
      movq  [dstq+32], m3
      movq  [dstq+40], m5
  %endif
      add        srcq, mmsize
      add        dstq, mmsize*6
      sub        lend, mmsize/4           ; floats consumed per channel
      jg .loop
  %if mmsize == 8
      emms
  %endif
      REP_RET
  %endmacro

  INIT_MMX mmx
  FLOAT_INTERLEAVE6 0
  INIT_XMM sse
  FLOAT_INTERLEAVE6 7
  ;-----------------------------------------------------------------------------
  ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  ;
  ; Interleaves two float channels: dst[2*i] = src[0][i], dst[2*i+1] = src[1][i].
  ; Each iteration consumes 2*mmsize bytes per channel (mmsize/2 floats) and
  ; writes 4*mmsize bytes; the loop body runs at least once.
  ;
  ; Macro argument:
  ;   %1 = number of XMM registers used (forwarded to cglobal)
  ;
  ; PUNPCKLDQ / PUNPCKHDQ must be %defined by the instantiation site to the
  ; low/high dword-interleave instruction appropriate for the register type.
  ;-----------------------------------------------------------------------------
  %macro FLOAT_INTERLEAVE2 1
  cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
      mov       src1q, [srcq+gprsize]     ; src[1]
      mov        srcq, [srcq        ]     ; src[0]
      sub       src1q, srcq               ; src1q = offset of channel 1 from channel 0
  .loop:
      mova         m0, [srcq             ]    ; channel 0, first vector
      mova         m1, [srcq+src1q       ]    ; channel 1, first vector
      mova         m3, [srcq      +mmsize]    ; channel 0, second vector
      mova         m4, [srcq+src1q+mmsize]    ; channel 1, second vector

      mova         m2, m0
      PUNPCKLDQ    m0, m1                 ; low halves interleaved
      PUNPCKHDQ    m2, m1                 ; high halves interleaved

      mova         m1, m3
      PUNPCKLDQ    m3, m4
      PUNPCKHDQ    m1, m4

      mova  [dstq         ], m0
      mova  [dstq+1*mmsize], m2
      mova  [dstq+2*mmsize], m3
      mova  [dstq+3*mmsize], m1

      add        srcq, mmsize*2
      add        dstq, mmsize*4
      sub        lend, mmsize/2           ; floats consumed per channel
      jg .loop
  %if mmsize == 8
      emms
  %endif
      REP_RET
  %endmacro

  INIT_MMX mmx
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 0
  INIT_XMM sse
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 5