;******************************************************************************
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION .text

%macro SCALARPRODUCT 0
; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
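; Returns the dot product of v1 and v2 in eax while, in the same pass,
; updating v1[i] += mul * v3[i] (16-bit wrapping arithmetic).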
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
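; order counts int16 elements; double it to get a byte offset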
    shl     orderq, 1
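; broadcast the 16-bit mul factor into every word lane of m7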
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
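; m6 accumulates the dword partial sums of the dot product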
    pxor    m6, m6
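; point the three pointers past the end of the arrays and walk a negative
; byte offset up to zero, so the loop add also sets the flags for jl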
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
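; each iteration handles 2*mmsize bytes: v1.v2 products are accumulated
; into m6 while v1 += mul * v3 is computed and stored back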
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl      .loop
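; horizontal sum: fold the dword partial sums in m6 down to a single dword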
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT

INIT_XMM sse2
SCALARPRODUCT
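
; The SSSE3 version rounds v2 and v3 down to a 16-byte boundary so every
; load inside the loop can be aligned, then uses palignr to shift
; neighbouring aligned blocks back into the original misaligned data.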
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
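; combine the previous aligned block with the current one and shift by %1
; bytes to recover the unaligned v2 (and below, v3) data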
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
%define t0  [v1q + orderq]
%define t1  [v1q + orderq + mmsize]
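; x86_64 has xmm8 and up available, so load v1 once into registers instead
; of reading the same memory operands twice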
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0  m8
    %define t1  m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg      .loop%1
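; all alignment variants share the epilogue at .end; the %1 == 0 loop is
; emitted last and simply falls through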
%if %1
    jmp     .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
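; r4d = misalignment of v2 within its 16-byte line; round v2 and v3 down
; so the loop loads can all be aligned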
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
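; v2 and v3 are assumed to share the same misalignment; preload the trailing
; aligned blocks for the palignr chain (the loop walks backwards)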
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je      .loop0
    cmp     r4d, 2
    je      .loop2
    cmp     r4d, 4
    je      .loop4
    cmp     r4d, 6
    je      .loop6
    cmp     r4d, 8
    je      .loop8
    cmp     r4d, 10
    je      .loop10
    cmp     r4d, 12
    je      .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
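; fold the four dword partial sums in m6 into a single result in eax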
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET