You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

462 lines
13KB

  1. ;******************************************************************************
  2. ;* Copyright (c) 2012 Michael Niedermayer
  3. ;*
  4. ;* This file is part of FFmpeg.
  5. ;*
  6. ;* FFmpeg is free software; you can redistribute it and/or
  7. ;* modify it under the terms of the GNU Lesser General Public
  8. ;* License as published by the Free Software Foundation; either
  9. ;* version 2.1 of the License, or (at your option) any later version.
  10. ;*
  11. ;* FFmpeg is distributed in the hope that it will be useful,
  12. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;* Lesser General Public License for more details.
  15. ;*
  16. ;* You should have received a copy of the GNU Lesser General Public
  17. ;* License along with FFmpeg; if not, write to the Free Software
  18. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;******************************************************************************
  20. %include "libavutil/x86/x86util.asm"
  21. SECTION_RODATA
  22. align 32
  23. flt2pm31: times 8 dd 4.6566129e-10
  24. flt2p31 : times 8 dd 2147483648.0
  25. flt2p15 : times 8 dd 32768.0
  26. word_unpack_shuf : db 0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15
  27. SECTION .text
; macro arguments: to, from, a/u (aligned/unaligned), log2_outsize, log2_insize, per-iteration convert macro, one-time init macro
  29. %macro PACK_2CH 5-7
  30. cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
  31. mov src2q , [srcq+gprsize]
  32. mov srcq , [srcq]
  33. mov dstq , [dstq]
  34. %ifidn %3, a
  35. test dstq, mmsize-1
  36. jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
  37. test srcq, mmsize-1
  38. jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
  39. test src2q, mmsize-1
  40. jne pack_2ch_%2_to_%1_u_int %+ SUFFIX
  41. %else
  42. pack_2ch_%2_to_%1_u_int %+ SUFFIX
  43. %endif
  44. lea srcq , [srcq + (1<<%5)*lenq]
  45. lea src2q, [src2q + (1<<%5)*lenq]
  46. lea dstq , [dstq + (2<<%4)*lenq]
  47. neg lenq
  48. %7 m0,m1,m2,m3,m4,m5
  49. .next:
  50. %if %4 >= %5
  51. mov%3 m0, [ srcq +(1<<%5)*lenq]
  52. mova m1, m0
  53. mov%3 m2, [ src2q+(1<<%5)*lenq]
  54. %if %5 == 1
  55. punpcklwd m0, m2
  56. punpckhwd m1, m2
  57. %else
  58. punpckldq m0, m2
  59. punpckhdq m1, m2
  60. %endif
  61. %6 m0,m1,m2,m3,m4,m5
  62. %else
  63. mov%3 m0, [ srcq +(1<<%5)*lenq]
  64. mov%3 m1, [mmsize + srcq +(1<<%5)*lenq]
  65. mov%3 m2, [ src2q+(1<<%5)*lenq]
  66. mov%3 m3, [mmsize + src2q+(1<<%5)*lenq]
  67. %6 m0,m1,m2,m3,m4,m5
  68. mova m2, m0
  69. punpcklwd m0, m1
  70. punpckhwd m2, m1
  71. SWAP 1,2
  72. %endif
  73. mov%3 [ dstq+(2<<%4)*lenq], m0
  74. mov%3 [ mmsize + dstq+(2<<%4)*lenq], m1
  75. %if %4 > %5
  76. mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2
  77. mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3
  78. add lenq, 4*mmsize/(2<<%4)
  79. %else
  80. add lenq, 2*mmsize/(2<<%4)
  81. %endif
  82. jl .next
  83. REP_RET
  84. %endmacro
  85. %macro UNPACK_2CH 5-7
  86. cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
  87. mov dst2q , [dstq+gprsize]
  88. mov srcq , [srcq]
  89. mov dstq , [dstq]
  90. %ifidn %3, a
  91. test dstq, mmsize-1
  92. jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
  93. test srcq, mmsize-1
  94. jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
  95. test dst2q, mmsize-1
  96. jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX
  97. %else
  98. unpack_2ch_%2_to_%1_u_int %+ SUFFIX
  99. %endif
  100. lea srcq , [srcq + (2<<%5)*lenq]
  101. lea dstq , [dstq + (1<<%4)*lenq]
  102. lea dst2q, [dst2q + (1<<%4)*lenq]
  103. neg lenq
  104. %7 m0,m1,m2,m3,m4,m5
  105. mova m6, [word_unpack_shuf]
  106. .next:
  107. mov%3 m0, [ srcq +(2<<%5)*lenq]
  108. mov%3 m2, [ mmsize + srcq +(2<<%5)*lenq]
  109. %if %5 == 1
  110. %ifidn SUFFIX, _ssse3
  111. pshufb m0, m6
  112. mova m1, m0
  113. pshufb m2, m6
  114. punpcklqdq m0,m2
  115. punpckhqdq m1,m2
  116. %else
  117. mova m1, m0
  118. punpcklwd m0,m2
  119. punpckhwd m1,m2
  120. mova m2, m0
  121. punpcklwd m0,m1
  122. punpckhwd m2,m1
  123. mova m1, m0
  124. punpcklwd m0,m2
  125. punpckhwd m1,m2
  126. %endif
  127. %else
  128. mova m1, m0
  129. shufps m0, m2, 10001000b
  130. shufps m1, m2, 11011101b
  131. %endif
  132. %if %4 < %5
  133. mov%3 m2, [2*mmsize + srcq +(2<<%5)*lenq]
  134. mova m3, m2
  135. mov%3 m4, [3*mmsize + srcq +(2<<%5)*lenq]
  136. shufps m2, m4, 10001000b
  137. shufps m3, m4, 11011101b
  138. SWAP 1,2
  139. %endif
  140. %6 m0,m1,m2,m3,m4,m5
  141. mov%3 [ dstq+(1<<%4)*lenq], m0
  142. %if %4 > %5
  143. mov%3 [ dst2q+(1<<%4)*lenq], m2
  144. mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1
  145. mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3
  146. add lenq, 2*mmsize/(1<<%4)
  147. %else
  148. mov%3 [ dst2q+(1<<%4)*lenq], m1
  149. add lenq, mmsize/(1<<%4)
  150. %endif
  151. jl .next
  152. REP_RET
  153. %endmacro
  154. %macro CONV 5-7
  155. cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
  156. mov srcq , [srcq]
  157. mov dstq , [dstq]
  158. %ifidn %3, a
  159. test dstq, mmsize-1
  160. jne %2_to_%1_u_int %+ SUFFIX
  161. test srcq, mmsize-1
  162. jne %2_to_%1_u_int %+ SUFFIX
  163. %else
  164. %2_to_%1_u_int %+ SUFFIX
  165. %endif
  166. lea srcq , [srcq + (1<<%5)*lenq]
  167. lea dstq , [dstq + (1<<%4)*lenq]
  168. neg lenq
  169. %7 m0,m1,m2,m3,m4,m5
  170. .next:
  171. mov%3 m0, [ srcq +(1<<%5)*lenq]
  172. mov%3 m1, [ mmsize + srcq +(1<<%5)*lenq]
  173. %if %4 < %5
  174. mov%3 m2, [2*mmsize + srcq +(1<<%5)*lenq]
  175. mov%3 m3, [3*mmsize + srcq +(1<<%5)*lenq]
  176. %endif
  177. %6 m0,m1,m2,m3,m4,m5
  178. mov%3 [ dstq+(1<<%4)*lenq], m0
  179. mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1
  180. %if %4 > %5
  181. mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2
  182. mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3
  183. add lenq, 4*mmsize/(1<<%4)
  184. %else
  185. add lenq, 2*mmsize/(1<<%4)
  186. %endif
  187. jl .next
  188. REP_RET
  189. %endmacro
  190. %macro PACK_6CH 5-7
  191. cglobal pack_6ch_%2_to_%1_%3, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
  192. %if ARCH_X86_64
  193. mov lend, r2d
  194. %else
  195. %define lend dword r2m
  196. %endif
  197. mov src1q, [srcq+1*gprsize]
  198. mov src2q, [srcq+2*gprsize]
  199. mov src3q, [srcq+3*gprsize]
  200. mov src4q, [srcq+4*gprsize]
  201. mov src5q, [srcq+5*gprsize]
  202. mov srcq, [srcq]
  203. mov dstq, [dstq]
  204. %ifidn %3, a
  205. test dstq, mmsize-1
  206. jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
  207. test srcq, mmsize-1
  208. jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
  209. test src2q, mmsize-1
  210. jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
  211. test src3q, mmsize-1
  212. jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
  213. test src4q, mmsize-1
  214. jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
  215. test src5q, mmsize-1
  216. jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
  217. %else
  218. pack_6ch_%2_to_%1_u_int %+ SUFFIX
  219. %endif
  220. sub src1q, srcq
  221. sub src2q, srcq
  222. sub src3q, srcq
  223. sub src4q, srcq
  224. sub src5q, srcq
  225. .loop:
  226. mov%3 m0, [srcq ]
  227. mov%3 m1, [srcq+src1q]
  228. mov%3 m2, [srcq+src2q]
  229. mov%3 m3, [srcq+src3q]
  230. mov%3 m4, [srcq+src4q]
  231. mov%3 m5, [srcq+src5q]
  232. %7 x,x,x,x,m7,x
  233. %if cpuflag(sse4)
  234. SBUTTERFLYPS 0, 1, 6
  235. SBUTTERFLYPS 2, 3, 6
  236. SBUTTERFLYPS 4, 5, 6
  237. blendps m6, m4, m0, 1100b
  238. movlhps m0, m2
  239. movhlps m4, m2
  240. blendps m2, m5, m1, 1100b
  241. movlhps m1, m3
  242. movhlps m5, m3
  243. %6 m0,m6,x,x,m7,m3
  244. %6 m4,m1,x,x,m7,m3
  245. %6 m2,m5,x,x,m7,m3
  246. mov %+ %3 %+ ps [dstq ], m0
  247. mov %+ %3 %+ ps [dstq+16], m6
  248. mov %+ %3 %+ ps [dstq+32], m4
  249. mov %+ %3 %+ ps [dstq+48], m1
  250. mov %+ %3 %+ ps [dstq+64], m2
  251. mov %+ %3 %+ ps [dstq+80], m5
  252. %else ; mmx
  253. SBUTTERFLY dq, 0, 1, 6
  254. SBUTTERFLY dq, 2, 3, 6
  255. SBUTTERFLY dq, 4, 5, 6
  256. movq [dstq ], m0
  257. movq [dstq+ 8], m2
  258. movq [dstq+16], m4
  259. movq [dstq+24], m1
  260. movq [dstq+32], m3
  261. movq [dstq+40], m5
  262. %endif
  263. add srcq, mmsize
  264. add dstq, mmsize*6
  265. sub lend, mmsize/4
  266. jg .loop
  267. %if mmsize == 8
  268. emms
  269. RET
  270. %else
  271. REP_RET
  272. %endif
  273. %endmacro
  274. %macro INT16_TO_INT32_N 6
  275. pxor m2, m2
  276. pxor m3, m3
  277. punpcklwd m2, m1
  278. punpckhwd m3, m1
  279. SWAP 4,0
  280. pxor m0, m0
  281. pxor m1, m1
  282. punpcklwd m0, m4
  283. punpckhwd m1, m4
  284. %endmacro
  285. %macro INT32_TO_INT16_N 6
  286. psrad m0, 16
  287. psrad m1, 16
  288. psrad m2, 16
  289. psrad m3, 16
  290. packssdw m0, m1
  291. packssdw m2, m3
  292. SWAP 1,2
  293. %endmacro
  294. %macro INT32_TO_FLOAT_INIT 6
  295. mova %5, [flt2pm31]
  296. %endmacro
  297. %macro INT32_TO_FLOAT_N 6
  298. cvtdq2ps %1, %1
  299. cvtdq2ps %2, %2
  300. mulps %1, %1, %5
  301. mulps %2, %2, %5
  302. %endmacro
  303. %macro FLOAT_TO_INT32_INIT 6
  304. mova %5, [flt2p31]
  305. %endmacro
  306. %macro FLOAT_TO_INT32_N 6
  307. mulps %1, %5
  308. mulps %2, %5
  309. cvtps2dq %6, %1
  310. cmpnltps %1, %5
  311. paddd %1, %6
  312. cvtps2dq %6, %2
  313. cmpnltps %2, %5
  314. paddd %2, %6
  315. %endmacro
  316. %macro INT16_TO_FLOAT_INIT 6
  317. mova m5, [flt2pm31]
  318. %endmacro
  319. %macro INT16_TO_FLOAT_N 6
  320. INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
  321. cvtdq2ps m0, m0
  322. cvtdq2ps m1, m1
  323. cvtdq2ps m2, m2
  324. cvtdq2ps m3, m3
  325. mulps m0, m0, m5
  326. mulps m1, m1, m5
  327. mulps m2, m2, m5
  328. mulps m3, m3, m5
  329. %endmacro
  330. %macro FLOAT_TO_INT16_INIT 6
  331. mova m5, [flt2p15]
  332. %endmacro
  333. %macro FLOAT_TO_INT16_N 6
  334. mulps m0, m5
  335. mulps m1, m5
  336. mulps m2, m5
  337. mulps m3, m5
  338. cvtps2dq m0, m0
  339. cvtps2dq m1, m1
  340. packssdw m0, m1
  341. cvtps2dq m1, m2
  342. cvtps2dq m3, m3
  343. packssdw m1, m3
  344. %endmacro
  345. %macro NOP_N 0-6
  346. %endmacro
  347. INIT_MMX mmx
  348. CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  349. CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  350. CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  351. CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
  352. PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
  353. PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
  354. INIT_XMM sse2
  355. CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  356. CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  357. CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  358. CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
  359. PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
  360. PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
  361. PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
  362. PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
  363. PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  364. PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  365. PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  366. PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
  367. UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
  368. UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
  369. UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
  370. UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
  371. UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  372. UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  373. UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
  374. UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
  375. CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  376. CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  377. CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  378. CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  379. CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  380. CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  381. CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  382. CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  383. PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  384. PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  385. PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  386. PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  387. PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  388. PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  389. PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  390. PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  391. UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  392. UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  393. UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  394. UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  395. UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  396. UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  397. UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  398. UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
  399. INIT_XMM ssse3
  400. UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
  401. UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
  402. UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
  403. UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
  404. UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  405. UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
  406. INIT_XMM sse4
  407. PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
  408. PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
  409. PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  410. PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  411. PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  412. PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  413. %if HAVE_AVX_EXTERNAL
  414. INIT_XMM avx
  415. PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
  416. PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
  417. PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  418. PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  419. PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  420. PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
  421. INIT_YMM avx
  422. CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  423. CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
  424. %endif