You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

138 lines
3.7KB

  1. ;******************************************************************************
  2. ;* optimized audio functions
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION .text
  23. %macro SCALARPRODUCT 0
  24. ; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
  25. cglobal scalarproduct_int16, 3,3,3, v1, v2, order
  26. shl orderq, 1
  27. add v1q, orderq
  28. add v2q, orderq
  29. neg orderq
  30. pxor m2, m2
  31. .loop:
  32. movu m0, [v1q + orderq]
  33. movu m1, [v1q + orderq + mmsize]
  34. pmaddwd m0, [v2q + orderq]
  35. pmaddwd m1, [v2q + orderq + mmsize]
  36. paddd m2, m0
  37. paddd m2, m1
  38. add orderq, mmsize*2
  39. jl .loop
  40. %if mmsize == 16
  41. movhlps m0, m2
  42. paddd m2, m0
  43. pshuflw m0, m2, 0x4e
  44. %else
  45. pshufw m0, m2, 0x4e
  46. %endif
  47. paddd m2, m0
  48. movd eax, m2
  49. RET
  50. %endmacro
  51. INIT_MMX mmxext
  52. SCALARPRODUCT
  53. INIT_XMM sse2
  54. SCALARPRODUCT
  55. ;-----------------------------------------------------------------------------
  56. ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
  57. ; int32_t max, unsigned int len)
  58. ;-----------------------------------------------------------------------------
  59. ; %1 = number of xmm registers used
  60. ; %2 = number of inline load/process/store loops per asm loop
  61. ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
  62. ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
  63. ; %5 = suffix
  64. %macro VECTOR_CLIP_INT32 4-5
  65. cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
  66. %if %4
  67. cvtsi2ss m4, minm
  68. cvtsi2ss m5, maxm
  69. %else
  70. movd m4, minm
  71. movd m5, maxm
  72. %endif
  73. SPLATD m4
  74. SPLATD m5
  75. .loop:
  76. %assign %%i 1
  77. %rep %2
  78. mova m0, [srcq+mmsize*0*%%i]
  79. mova m1, [srcq+mmsize*1*%%i]
  80. mova m2, [srcq+mmsize*2*%%i]
  81. mova m3, [srcq+mmsize*3*%%i]
  82. %if %3
  83. mova m7, [srcq+mmsize*4*%%i]
  84. mova m8, [srcq+mmsize*5*%%i]
  85. mova m9, [srcq+mmsize*6*%%i]
  86. mova m10, [srcq+mmsize*7*%%i]
  87. %endif
  88. CLIPD m0, m4, m5, m6
  89. CLIPD m1, m4, m5, m6
  90. CLIPD m2, m4, m5, m6
  91. CLIPD m3, m4, m5, m6
  92. %if %3
  93. CLIPD m7, m4, m5, m6
  94. CLIPD m8, m4, m5, m6
  95. CLIPD m9, m4, m5, m6
  96. CLIPD m10, m4, m5, m6
  97. %endif
  98. mova [dstq+mmsize*0*%%i], m0
  99. mova [dstq+mmsize*1*%%i], m1
  100. mova [dstq+mmsize*2*%%i], m2
  101. mova [dstq+mmsize*3*%%i], m3
  102. %if %3
  103. mova [dstq+mmsize*4*%%i], m7
  104. mova [dstq+mmsize*5*%%i], m8
  105. mova [dstq+mmsize*6*%%i], m9
  106. mova [dstq+mmsize*7*%%i], m10
  107. %endif
  108. %assign %%i %%i+1
  109. %endrep
  110. add srcq, mmsize*4*(%2+%3)
  111. add dstq, mmsize*4*(%2+%3)
  112. sub lend, mmsize*(%2+%3)
  113. jg .loop
  114. REP_RET
  115. %endmacro
  116. INIT_MMX mmx
  117. %define CLIPD CLIPD_MMX
  118. VECTOR_CLIP_INT32 0, 1, 0, 0
  119. INIT_XMM sse2
  120. VECTOR_CLIP_INT32 6, 1, 0, 0, _int
  121. %define CLIPD CLIPD_SSE2
  122. VECTOR_CLIP_INT32 6, 2, 0, 1
  123. INIT_XMM sse4
  124. %define CLIPD CLIPD_SSE41
  125. %ifdef m8
  126. VECTOR_CLIP_INT32 11, 1, 1, 0
  127. %else
  128. VECTOR_CLIP_INT32 6, 1, 0, 0
  129. %endif