;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
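; pshufb shuffle mask that reverses the byte order within each 32-bit lane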
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
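; Roughly equivalent scalar code (illustrative sketch only; the vector loop
; below handles no tail, so the byte length 2*order is assumed to be a
; multiple of 2*mmsize):
;     int sum = 0;
;     for (i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum;
; Both pointers are advanced to the end of the vectors and walked with a
; negative byte offset; pmaddwd accumulates pairs of int16 products into
; packed dword sums, which are reduced horizontally after the loop.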
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
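; In effect (scalar sketch; dst/src are assumed suitably aligned and len a
; multiple of the element count handled per loop iteration):
;     for (i = 0; i < len; i++)
;         dst[i] = src[i] < min ? min : (src[i] > max ? max : src[i]);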
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss  m4, minm
    cvtsi2ss  m5, maxm
%else
    movd      m4, minm
    movd      m5, maxm
%endif
    SPLATD    m4
    SPLATD    m5
.loop:
; each unrolled pass handles 4 (or 8, if %3) consecutive mmsize blocks,
; so the block offset must advance by that amount between passes
%assign %%i 0
%rep %2
    mova      m0,  [srcq+mmsize*(0+%%i)]
    mova      m1,  [srcq+mmsize*(1+%%i)]
    mova      m2,  [srcq+mmsize*(2+%%i)]
    mova      m3,  [srcq+mmsize*(3+%%i)]
%if %3
    mova      m7,  [srcq+mmsize*(4+%%i)]
    mova      m8,  [srcq+mmsize*(5+%%i)]
    mova      m9,  [srcq+mmsize*(6+%%i)]
    mova      m10, [srcq+mmsize*(7+%%i)]
%endif
    CLIPD  m0,  m4, m5, m6
    CLIPD  m1,  m4, m5, m6
    CLIPD  m2,  m4, m5, m6
    CLIPD  m3,  m4, m5, m6
%if %3
    CLIPD  m7,  m4, m5, m6
    CLIPD  m8,  m4, m5, m6
    CLIPD  m9,  m4, m5, m6
    CLIPD  m10, m4, m5, m6
%endif
    mova  [dstq+mmsize*(0+%%i)], m0
    mova  [dstq+mmsize*(1+%%i)], m1
    mova  [dstq+mmsize*(2+%%i)], m2
    mova  [dstq+mmsize*(3+%%i)], m3
%if %3
    mova  [dstq+mmsize*(4+%%i)], m7
    mova  [dstq+mmsize*(5+%%i)], m8
    mova  [dstq+mmsize*(6+%%i)], m9
    mova  [dstq+mmsize*(7+%%i)], m10
%endif
%assign %%i %%i+4*(1+%3)
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro
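
; The CLIPD_MMX/CLIPD_SSE2/CLIPD_SSE41 helpers come from x86util.asm. The SSE2
; build gets two variants: a plain integer one (suffix _int) and one that clips
; through float min/max (the %4=1 path, hence the cvtsi2ss loads above); SSE4.1
; provides packed signed dword min/max, so that variant can clip directly.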
INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
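; m8 and up are only defined when more than 8 XMM registers are available
; (x86-64 builds), so the 8*mmsize-per-loop variant is limited to that case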
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; %1 = aligned/unaligned
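; Byte-swaps dwords from r1 to r0 using %1 (aligned or unaligned) loads and
; stores: a main loop of 8 dwords (32 bytes) per iteration plus one optional
; 4-dword step; whatever remains (< 4 dwords) is finished by the caller at
; .left. r2 holds the dword count and r3 is used as scratch.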
%macro BSWAP_LOOPS 1
    mov      r3, r2
    sar      r2, 3
    jz       .left4_%1
.loop8_%1:
    mov%1    m0, [r1 +  0]
    mov%1    m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb   m0, m2
    pshufb   m1, m2
    mov%1    [r0 +  0], m0
    mov%1    [r0 + 16], m1
%else
    pshuflw  m0, m0, 10110001b
    pshuflw  m1, m1, 10110001b
    pshufhw  m0, m0, 10110001b
    pshufhw  m1, m1, 10110001b
    mova     m2, m0
    mova     m3, m1
    psllw    m0, 8
    psllw    m1, 8
    psrlw    m2, 8
    psrlw    m3, 8
    por      m2, m0
    por      m3, m1
    mov%1    [r0 +  0], m2
    mov%1    [r0 + 16], m3
%endif
    add      r0, 32
    add      r1, 32
    dec      r2
    jnz      .loop8_%1
.left4_%1:
    mov      r2, r3
    and      r3, 4
    jz       .left
    mov%1    m0, [r1]
%if cpuflag(ssse3)
    pshufb   m0, m2
    mov%1    [r0], m0
%else
    pshuflw  m0, m0, 10110001b
    pshufhw  m0, m0, 10110001b
    mova     m2, m0
    psllw    m0, 8
    psrlw    m2, 8
    por      m2, m0
    mov%1    [r0], m2
%endif
    add      r1, 16
    add      r0, 16
%endmacro

; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
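; Scalar equivalent (sketch): for (i = 0; i < w; i++) dst[i] = bswap32(src[i]),
; where bswap32 reverses the byte order of a 32-bit word.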
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov      r3, r1
    mova     m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov      r3, r1
%endif
    or       r3, r0          ; the aligned path needs both src and dst aligned
    and      r3, 15
    jz       .start_align
    BSWAP_LOOPS u
    jmp      .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov      r3, r2
    and      r2, 2
    jz       .left1
    movq     m0, [r1]
    pshufb   m0, m2
    movq     [r0], m0
    add      r1, 8
    add      r0, 8
.left1:
    and      r3, 1
    jz       .end
    mov      r2d, [r1]
    bswap    r2d
    mov      [r0], r2d
%else
    and      r2, 3
    jz       .end
.loop2:
    mov      r3d, [r1]
    bswap    r3d
    mov      [r0], r3d
    add      r1, 4
    add      r0, 4
    dec      r2
    jnz      .loop2
%endif
.end:
    RET
%endmacro
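
; SSE2 and SSSE3 instantiations: the SSSE3 version swaps bytes with a single
; pshufb using pb_bswap32, while the SSE2 fallback emulates the swap with
; word shuffles plus byte shifts.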
INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF