;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
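; Scalar reference (for illustration only):
;     int32_t res = 0;
;     for (i = 0; i < order; i++)
;         res += v1[i] * v2[i];
;     return res;
; order counts int16 elements and is assumed to be a positive multiple of
; mmsize, since each loop iteration consumes mmsize*2 bytes from v1 and v2.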
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
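; horizontal add: fold the per-lane dword partial sums in m2 into a single
; int32 result in eax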
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
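; Scalar reference (for illustration only): returns the dot product of v1 and
; v2 and, as a side effect, updates v1 in place:
;     int32_t res = 0;
;     for (i = 0; i < order; i++) {
;         res   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];   ; 16-bit wraparound (pmullw/paddw)
;     }
;     return res;
; v1 is accessed with mova and so must be mmsize-aligned; v2 and v3 may be
; unaligned (movu).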
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
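
; The SSSE3 version below rounds v2 and v3 down to a 16-byte boundary and uses
; palignr (by the byte offset %1) to rebuild the possibly misaligned data from
; pairs of aligned loads. Only v2's misalignment is measured, so v2 and v3 are
; assumed to share the same offset within their 16-byte blocks, and v1 must be
; 16-byte aligned since it is loaded and stored with mova.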
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
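; Scalar reference (for illustration only):
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip(src[i], min, max);
; src and dst are accessed with mova and so must be mmsize-aligned.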
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd     m4, minm
    movd     m5, maxm
%endif
    SPLATD   m4
    SPLATD   m5
.loop:
%assign %%i 1
%rep %2
    mova     m0, [srcq+mmsize*0*%%i]
    mova     m1, [srcq+mmsize*1*%%i]
    mova     m2, [srcq+mmsize*2*%%i]
    mova     m3, [srcq+mmsize*3*%%i]
%if %3
    mova     m7, [srcq+mmsize*4*%%i]
    mova     m8, [srcq+mmsize*5*%%i]
    mova     m9, [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD    m0, m4, m5, m6
    CLIPD    m1, m4, m5, m6
    CLIPD    m2, m4, m5, m6
    CLIPD    m3, m4, m5, m6
%if %3
    CLIPD    m7, m4, m5, m6
    CLIPD    m8, m4, m5, m6
    CLIPD    m9, m4, m5, m6
    CLIPD   m10, m4, m5, m6
%endif
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
%if %3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro
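
; CLIPD expands to one of the CLIPD_MMX/CLIPD_SSE2/CLIPD_SSE41 helpers from
; x86util.asm; the SSE2 helper clips via float min/max, hence %4=1 for that
; instantiation below.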
INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
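
; Byte-swapping 32-bit elements: with SSSE3 a single pshufb against pb_bswap32
; reverses the bytes of each dword; without it, pshuflw/pshufhw (10110001b)
; swap the two 16-bit words of each dword and the psllw/psrlw/por sequence then
; swaps the bytes within each word, which together reverses all four bytes.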
; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov      r3, r2
    sar      r2, 3
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2
    jnz      .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
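; Scalar reference (for illustration only):
;     for (i = 0; i < w; i++)
;         dst[i] = av_bswap32(src[i]);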
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    and      r3, 15
    jz       .start_align
    BSWAP_LOOPS u
    jmp      .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov      r3, r2
    and      r2, 2
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    and      r3, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2
    jnz      .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF

INIT_XMM ssse3
BSWAP32_BUF