You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

433 lines
11KB

  1. ;******************************************************************************
  2. ;* x86 optimized Format Conversion Utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_TEXT
  ; CVTPS2PI dst, src
  ; Picks the packed float -> int32 conversion instruction for the instruction
  ; set selected by the enclosing INIT_* line:
  ;   cpuflag(sse)   -> cvtps2pi
  ;   cpuflag(3dnow) -> pf2id
  ; If neither flag is set the macro expands to nothing.
  %macro CVTPS2PI 2
  %if cpuflag(sse)
      cvtps2pi  %1, %2
  %elif cpuflag(3dnow)
      pf2id     %1, %2
  %endif
  %endmacro
  30. ;------------------------------------------------------------------------------
  31. ; void ff_int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul,
  32. ; int len);
  33. ;------------------------------------------------------------------------------
  ; INT32_TO_FLOAT_FMUL_SCALAR xmm_regs
  ;
  ; Emits int32_to_float_fmul_scalar for the current INIT_XMM instruction set:
  ; every int32 read from src is converted to float, multiplied by mul and
  ; written to dst.
  ;
  ; %1 = number of XMM registers handed to cglobal.
  ;
  ; Where mul comes from:
  ;   UNIX64 : it is not a named GPR argument; m0 is used as it arrives.
  ;   WIN64  : SWAP 0, 2 renames the third vector register to m0.
  ;   x86_32 : it is loaded from its stack slot (mulm).
  ;
  ; The loop handles 32 bytes (8 samples) per pass and always runs at least
  ; once, so len is expected to be a positive multiple of 8.
  %macro INT32_TO_FLOAT_FMUL_SCALAR 1
  %if UNIX64
  cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
  %else
  cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
  %endif
  %if WIN64
      SWAP      0, 2
  %elif ARCH_X86_32
      movss     m0, mulm
  %endif
      SPLATD    m0                      ; broadcast mul to every lane of m0
      ; len is a 32-bit int in the C prototype.  On x86-64 the upper half of
      ; the register carrying it is not guaranteed to be zero, so do the
      ; scaling on the 32-bit view: the write zero-extends into lenq.
      shl       lend, 2                 ; lenq = len * 4 = byte count
      add       srcq, lenq              ; point both buffers at their end ...
      add       dstq, lenq
      neg       lenq                    ; ... and index with a negative offset
  .loop:
  %if cpuflag(sse2)
      cvtdq2ps  m1, [srcq+lenq   ]
      cvtdq2ps  m2, [srcq+lenq+16]
  %else
      ; cvtpi2ps converts two ints into the low half of the destination, so
      ; four conversions plus two movlhps build the same two vectors.
      cvtpi2ps  m1, [srcq+lenq   ]
      cvtpi2ps  m3, [srcq+lenq+ 8]
      cvtpi2ps  m2, [srcq+lenq+16]
      cvtpi2ps  m4, [srcq+lenq+24]
      movlhps   m1, m3
      movlhps   m2, m4
  %endif
      mulps     m1, m0
      mulps     m2, m0
      mova      [dstq+lenq   ], m1
      mova      [dstq+lenq+16], m2
      add       lenq, 32
      jl        .loop
  %if cpuflag(sse2)
      REP_RET
  %else
      ; cvtpi2ps is documented as switching the FPU to MMX state even with a
      ; memory source, so leave MMX state before returning to C code.
      emms
      RET
  %endif
  %endmacro
  INIT_XMM sse
  INT32_TO_FLOAT_FMUL_SCALAR 5
  INIT_XMM sse2
  INT32_TO_FLOAT_FMUL_SCALAR 3
  74. ;------------------------------------------------------------------------------
  75. ; void ff_float_to_int16(int16_t *dst, const float *src, long len);
  76. ;------------------------------------------------------------------------------
  ; FLOAT_TO_INT16 xmm_regs
  ;
  ; Emits float_to_int16 for the current instruction set: floats read from
  ; src are converted to int32 and packed to int16 with signed saturation
  ; (packssdw) into dst.
  ;
  ; %1 = number of XMM registers handed to cglobal.
  ;
  ; lenq is turned into a negative byte offset into dst that counts up to
  ; zero; src is addressed with 2*lenq because each input sample is 4 bytes
  ; and each output sample is 2.  One pass produces 16 output bytes
  ; (8 samples) and the body runs at least once.
  %macro FLOAT_TO_INT16 1
  cglobal float_to_int16, 3, 3, %1, dst, src, len
      add       lenq, lenq              ; lenq = output size in bytes
      lea       srcq, [srcq+2*lenq]     ; end of src
      add       dstq, lenq              ; end of dst
      neg       lenq
  .loop:
  %if cpuflag(sse2)
      cvtps2dq  m0, [srcq+2*lenq   ]
      cvtps2dq  m1, [srcq+2*lenq+16]
      packssdw  m0, m1
      mova      [dstq+lenq], m0
  %else
      ; 64-bit registers: four 2-sample conversions, two packs, two stores
      CVTPS2PI  m0, [srcq+2*lenq   ]
      CVTPS2PI  m1, [srcq+2*lenq+ 8]
      CVTPS2PI  m2, [srcq+2*lenq+16]
      CVTPS2PI  m3, [srcq+2*lenq+24]
      packssdw  m0, m1
      packssdw  m2, m3
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m2
  %endif
      add       lenq, 16
      js        .loop                   ; continue while the offset is negative
  %if mmsize == 8
      emms                              ; MMX registers were used
  %endif
      REP_RET
  %endmacro
  INIT_XMM sse2
  FLOAT_TO_INT16 2
  INIT_MMX sse
  FLOAT_TO_INT16 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16 0
  112. ;------------------------------------------------------------------------------
  113. ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
  114. ;------------------------------------------------------------------------------
  ; FLOAT_TO_INT16_STEP xmm_regs
  ;
  ; Emits float_to_int16_step for the current instruction set: like
  ; float_to_int16, but consecutive output samples are written step int16
  ; elements apart (sample i lands at dst + i*step*2 bytes).
  ;
  ; %1 = number of XMM registers handed to cglobal.
  ;
  ; Registers beyond the four arguments:
  ;   step3q = 3*step, so the fourth sample of a group is one address away
  ;   v1, v2 = scratch GPRs, each receiving two packed int16 at a time
  ;
  ; lenq is a negative offset (2 bytes per sample) counting up to zero and
  ; only indexes src; dstq walks forward by 4 samples (step*8 bytes) after
  ; each group.  One pass handles 8 samples.
  %macro FLOAT_TO_INT16_STEP 1
  cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
      add       lenq, lenq
      lea       srcq, [srcq+2*lenq]
      lea       step3q, [stepq*3]
      neg       lenq
  .loop:
  %if cpuflag(sse2)
      cvtps2dq  m0, [srcq+2*lenq   ]
      cvtps2dq  m1, [srcq+2*lenq+16]
      packssdw  m0, m1                  ; m0 = 8 x int16
      ; samples 0..3
      movd      v1d, m0                 ; v1 = samples 0,1
      psrldq    m0, 4
      movd      v2d, m0                 ; v2 = samples 2,3
      psrldq    m0, 4
      mov       [dstq], v1w
      mov       [dstq+stepq*4], v2w
      shr       v1d, 16
      shr       v2d, 16
      mov       [dstq+stepq*2], v1w
      mov       [dstq+step3q*2], v2w
      lea       dstq, [dstq+stepq*8]
      ; samples 4..7
      movd      v1d, m0
      psrldq    m0, 4
      movd      v2d, m0
      mov       [dstq], v1w
      mov       [dstq+stepq*4], v2w
      shr       v1d, 16
      shr       v2d, 16
      mov       [dstq+stepq*2], v1w
      mov       [dstq+step3q*2], v2w
      lea       dstq, [dstq+stepq*8]
  %else
      CVTPS2PI  m0, [srcq+2*lenq   ]
      CVTPS2PI  m1, [srcq+2*lenq+ 8]
      CVTPS2PI  m2, [srcq+2*lenq+16]
      CVTPS2PI  m3, [srcq+2*lenq+24]
      packssdw  m0, m1                  ; m0 = samples 0..3
      packssdw  m2, m3                  ; m2 = samples 4..7
      ; samples 0..3
      movd      v1d, m0
      psrlq     m0, 32
      movd      v2d, m0
      mov       [dstq], v1w
      mov       [dstq+stepq*4], v2w
      shr       v1d, 16
      shr       v2d, 16
      mov       [dstq+stepq*2], v1w
      mov       [dstq+step3q*2], v2w
      lea       dstq, [dstq+stepq*8]
      ; samples 4..7
      movd      v1d, m2
      psrlq     m2, 32
      movd      v2d, m2
      mov       [dstq], v1w
      mov       [dstq+stepq*4], v2w
      shr       v1d, 16
      shr       v2d, 16
      mov       [dstq+stepq*2], v1w
      mov       [dstq+step3q*2], v2w
      lea       dstq, [dstq+stepq*8]
  %endif
      add       lenq, 16
      js        .loop
  %if mmsize == 8
      emms                              ; MMX registers were used
  %endif
      REP_RET
  %endmacro
  INIT_XMM sse2
  FLOAT_TO_INT16_STEP 2
  INIT_MMX sse
  FLOAT_TO_INT16_STEP 0
  INIT_MMX 3dnow
  FLOAT_TO_INT16_STEP 0
  188. ;-------------------------------------------------------------------------------
  189. ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
  190. ;-------------------------------------------------------------------------------
  ; FLOAT_TO_INT16_INTERLEAVE2
  ;
  ; Emits float_to_int16_interleave2 for the current instruction set: two
  ; float channels (src[0], src[1]) are converted to int16 with signed
  ; saturation and written to dst as alternating samples.
  ;
  ; Register naming: the third argument register (r2, which carries len on
  ; entry) is named src1, and the extra fourth register is named len.  The
  ; lea below reads the incoming len through r2q before r2 is reused as the
  ; second channel pointer.
  ;
  ; lenq becomes a negative byte offset counting up to zero.  It is shared by
  ; all three pointers because one sample per channel is 4 bytes of input in
  ; each source and 2+2 bytes of output.  One pass handles 4 samples per
  ; channel (16 bytes).
  %macro FLOAT_TO_INT16_INTERLEAVE2 0
  cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
      lea       lenq, [4*r2q]           ; bytes = 4 * len (r2 still holds len)
      mov       src1q, [src0q+gprsize]  ; second channel pointer
      mov       src0q, [src0q]          ; first channel pointer
      add       dstq, lenq
      add       src0q, lenq
      add       src1q, lenq
      neg       lenq
  .loop:
  %if cpuflag(sse2)
      cvtps2dq  m0, [src0q+lenq]
      cvtps2dq  m1, [src1q+lenq]
      packssdw  m0, m1                  ; low half: channel 0, high: channel 1
      movhlps   m1, m0                  ; bring channel 1 down to the low half
      punpcklwd m0, m1                  ; alternate the two channels
      mova      [dstq+lenq], m0
  %else
      CVTPS2PI  m0, [src0q+lenq  ]
      CVTPS2PI  m1, [src0q+lenq+8]
      CVTPS2PI  m2, [src1q+lenq  ]
      CVTPS2PI  m3, [src1q+lenq+8]
      packssdw  m0, m1                  ; m0 = 4 samples of channel 0
      packssdw  m2, m3                  ; m2 = 4 samples of channel 1
      mova      m1, m0
      punpcklwd m0, m2
      punpckhwd m1, m2
      mova      [dstq+lenq  ], m0
      mova      [dstq+lenq+8], m1
  %endif
      add       lenq, 16
      js        .loop
  %if mmsize == 8
      emms                              ; MMX registers were used
  %endif
      REP_RET
  %endmacro
  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE2
  INIT_XMM sse2
  FLOAT_TO_INT16_INTERLEAVE2
  234. ;-----------------------------------------------------------------------------
  235. ; void ff_float_to_int16_interleave6(int16_t *dst, const float **src, int len)
  236. ;-----------------------------------------------------------------------------
  ; FLOAT_TO_INT16_INTERLEAVE6
  ;
  ; Emits float_to_int16_interleave6 for the current instruction set: six
  ; float channels (src[0..5]) are converted to int16 with signed saturation
  ; and written to dst interleaved.
  ;
  ; Only dst and src are loaded by cglobal.  The third argument (len) is
  ; fetched by hand: on x86-64 it is copied out of r2 before r2 is reused as
  ; src1; on x86-32 lend is redefined to refer to the argument's stack slot.
  ;
  ; src1q..src5q are turned into offsets relative to srcq, so advancing srcq
  ; alone moves all six channel cursors.  One pass consumes 2 samples from
  ; each channel (8 bytes each) and writes 24 bytes.
  %macro FLOAT_TO_INT16_INTERLEAVE6 0
  cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov       lend, r2d
  %else
      %define lend dword r2m
  %endif
      mov       src1q, [srcq+1*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src5q, [srcq+5*gprsize]
      mov       srcq,  [srcq]
      sub       src1q, srcq             ; channel pointers -> offsets from srcq
      sub       src2q, srcq
      sub       src3q, srcq
      sub       src4q, srcq
      sub       src5q, srcq
  .loop:
      CVTPS2PI  mm0, [srcq]
      CVTPS2PI  mm1, [srcq+src1q]
      CVTPS2PI  mm2, [srcq+src2q]
      CVTPS2PI  mm3, [srcq+src3q]
      CVTPS2PI  mm4, [srcq+src4q]
      CVTPS2PI  mm5, [srcq+src5q]
      packssdw  mm0, mm3                ; channels 0 and 3
      packssdw  mm1, mm4                ; channels 1 and 4
      packssdw  mm2, mm5                ; channels 2 and 5
      PSWAPD    mm3, mm0
      punpcklwd mm0, mm1
      punpckhwd mm1, mm2
      punpcklwd mm2, mm3
      PSWAPD    mm3, mm0
      punpckldq mm0, mm2
      punpckhdq mm2, mm1
      punpckldq mm1, mm3
      movq      [dstq   ], mm0
      movq      [dstq+16], mm2
      movq      [dstq+ 8], mm1
      add       srcq, 8
      add       dstq, 24
      sub       lend, 2
      jg        .loop
      emms
      RET
  %endmacro ; FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX sse
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnow
  FLOAT_TO_INT16_INTERLEAVE6
  INIT_MMX 3dnowext
  FLOAT_TO_INT16_INTERLEAVE6
  289. ;-----------------------------------------------------------------------------
  290. ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
  291. ;-----------------------------------------------------------------------------
  ; FLOAT_INTERLEAVE6 xmm_regs
  ;
  ; Emits float_interleave6 for the current instruction set: six float
  ; channels (src[0..5]) are written to dst interleaved, with no conversion.
  ;
  ; %1 = number of XMM registers handed to cglobal.
  ;
  ; Only dst and src are loaded by cglobal.  The third argument (len) is
  ; fetched by hand: on x86-64 it is copied out of r2 before r2 is reused as
  ; src1; on x86-32 lend is redefined to refer to the argument's stack slot.
  ;
  ; src1q..src5q are turned into offsets relative to srcq.  One pass reads
  ; mmsize bytes from each channel, writes 6*mmsize bytes and decrements len
  ; by mmsize/4 (the number of floats in one register).
  %macro FLOAT_INTERLEAVE6 1
  cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
  %if ARCH_X86_64
      mov       lend, r2d
  %else
      %define lend dword r2m
  %endif
      mov       src1q, [srcq+1*gprsize]
      mov       src2q, [srcq+2*gprsize]
      mov       src3q, [srcq+3*gprsize]
      mov       src4q, [srcq+4*gprsize]
      mov       src5q, [srcq+5*gprsize]
      mov       srcq,  [srcq]
      sub       src1q, srcq             ; channel pointers -> offsets from srcq
      sub       src2q, srcq
      sub       src3q, srcq
      sub       src4q, srcq
      sub       src5q, srcq
  .loop:
  %if cpuflag(sse)
      movaps    m0, [srcq]
      movaps    m1, [srcq+src1q]
      movaps    m2, [srcq+src2q]
      movaps    m3, [srcq+src3q]
      movaps    m4, [srcq+src4q]
      movaps    m5, [srcq+src5q]
      SBUTTERFLYPS 0, 1, 6              ; m6 is the scratch register
      SBUTTERFLYPS 2, 3, 6
      SBUTTERFLYPS 4, 5, 6
      ; first three output vectors
      movaps    m6, m4
      shufps    m4, m0, 0xe4
      movlhps   m0, m2
      movhlps   m6, m2
      movaps    [dstq   ], m0
      movaps    [dstq+16], m4
      movaps    [dstq+32], m6
      ; last three output vectors
      movaps    m6, m5
      shufps    m5, m1, 0xe4
      movlhps   m1, m3
      movhlps   m6, m3
      movaps    [dstq+48], m1
      movaps    [dstq+64], m5
      movaps    [dstq+80], m6
  %else ; mmx
      movq      m0, [srcq]
      movq      m1, [srcq+src1q]
      movq      m2, [srcq+src2q]
      movq      m3, [srcq+src3q]
      movq      m4, [srcq+src4q]
      movq      m5, [srcq+src5q]
      SBUTTERFLY dq, 0, 1, 6
      SBUTTERFLY dq, 2, 3, 6
      SBUTTERFLY dq, 4, 5, 6
      movq      [dstq   ], m0
      movq      [dstq+ 8], m2
      movq      [dstq+16], m4
      movq      [dstq+24], m1
      movq      [dstq+32], m3
      movq      [dstq+40], m5
  %endif
      add       srcq, mmsize
      add       dstq, mmsize*6
      sub       lend, mmsize/4
      jg        .loop
  %if mmsize == 8
      emms                              ; MMX registers were used
  %endif
      REP_RET
  %endmacro
  INIT_MMX mmx
  FLOAT_INTERLEAVE6 0
  INIT_XMM sse
  FLOAT_INTERLEAVE6 7
  365. ;-----------------------------------------------------------------------------
  366. ; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
  367. ;-----------------------------------------------------------------------------
  ; FLOAT_INTERLEAVE2 xmm_regs
  ;
  ; Emits float_interleave2 for the current instruction set: two float
  ; channels (src[0], src[1]) are written to dst as alternating samples, with
  ; no conversion.
  ;
  ; %1 = number of XMM registers handed to cglobal.
  ;
  ; PUNPCKLDQ / PUNPCKHDQ must be %define'd by the caller to the 32-bit
  ; interleave instructions of the selected register type (see the
  ; instantiations after the macro).
  ;
  ; src1q is turned into an offset relative to srcq.  One pass reads 2*mmsize
  ; bytes from each channel, writes 4*mmsize bytes and decrements len by
  ; mmsize/2 (the number of floats in two registers).
  %macro FLOAT_INTERLEAVE2 1
  cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
      mov       src1q, [srcq+gprsize]
      mov       srcq,  [srcq        ]
      sub       src1q, srcq             ; second channel as offset from first
  .loop:
      mova      m0, [srcq             ]
      mova      m1, [srcq+src1q       ]
      mova      m3, [srcq      +mmsize]
      mova      m4, [srcq+src1q+mmsize]
      ; first register pair
      mova      m2, m0
      PUNPCKLDQ m0, m1
      PUNPCKHDQ m2, m1
      ; second register pair
      mova      m1, m3
      PUNPCKLDQ m3, m4
      PUNPCKHDQ m1, m4
      mova      [dstq         ], m0
      mova      [dstq+1*mmsize], m2
      mova      [dstq+2*mmsize], m3
      mova      [dstq+3*mmsize], m1
      add       srcq, mmsize*2
      add       dstq, mmsize*4
      sub       lend, mmsize/2
      jg        .loop
  %if mmsize == 8
      emms                              ; MMX registers were used
  %endif
      REP_RET
  %endmacro
  INIT_MMX mmx
  %define PUNPCKLDQ punpckldq
  %define PUNPCKHDQ punpckhdq
  FLOAT_INTERLEAVE2 0
  INIT_XMM sse
  %define PUNPCKLDQ unpcklps
  %define PUNPCKHDQ unpckhps
  FLOAT_INTERLEAVE2 5