You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

117 lines
4.0KB

  1. ;******************************************************************************
  2. ;* TAK DSP SIMD optimizations
  3. ;*
  4. ;* Copyright (C) 2015 Paul B Mahol
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; Four packed 32-bit values of 128. tak_decorrelate_sf loads this with an
  ; aligned move and adds it to each lane right before its ">> 8" step.
  ; NOTE(review): the aligned load relies on SECTION_RODATA providing at least
  ; 16-byte alignment - confirm in the x86inc macro definition.
  pd_128: dd 128, 128, 128, 128

  SECTION .text

  ; Functions below are assembled as XMM/SSE2 code until the next INIT_* line.
  INIT_XMM sse2
  ;------------------------------------------------------------------------------
  ; tak_decorrelate_ls(p1, p2, length)
  ;
  ; For each 32-bit element i:  p2[i] = p2[i] + p1[i]     (p1 is only read)
  ;
  ; length is an element count. It is scaled to a byte count and negated, so
  ; one index register runs from -bytes up to 0 relative to the buffer ends.
  ; Each pass handles mmsize*2 bytes and the exit test is at the bottom, so at
  ; least one pass is always executed.
  ; NOTE(review): presumably callers guarantee aligned buffers that are padded
  ; to a multiple of mmsize*2 bytes - confirm against the call sites.
  ;------------------------------------------------------------------------------
  cglobal tak_decorrelate_ls, 3, 3, 2, p1, p2, length
      shl     lengthd, 2                      ; elements -> bytes (4 bytes each)
      add     p2q, lengthq                    ; p2q = end of the p2 range
      add     p1q, lengthq                    ; p1q = end of the p1 range
      neg     lengthq                         ; negative byte offset, counts up to 0
  .loop:
      ; Addition commutes: fetch the p2 vectors, then add the p1 vectors to them.
      mova    m1, [p2q+lengthq]
      mova    m0, [p2q+lengthq+mmsize]
      paddd   m1, [p1q+lengthq]
      paddd   m0, [p1q+lengthq+mmsize]
      mova    [p2q+lengthq], m1
      mova    [p2q+lengthq+mmsize], m0
      add     lengthq, mmsize*2
      jl      .loop                           ; offset still negative -> more data
      REP_RET
  ;------------------------------------------------------------------------------
  ; tak_decorrelate_sr(p1, p2, length)
  ;
  ; For each 32-bit element i:  p1[i] = p2[i] - p1[i]     (p2 is only read)
  ;
  ; length is an element count. It is converted to bytes and negated so that a
  ; single negative offset walks both buffers up to their ends.
  ; Two vectors (mmsize*2 bytes) are processed per pass; the loop test is at the
  ; bottom, so the body runs at least once.
  ; NOTE(review): presumably callers guarantee aligned buffers that are padded
  ; to a multiple of mmsize*2 bytes - confirm against the call sites.
  ;------------------------------------------------------------------------------
  cglobal tak_decorrelate_sr, 3, 3, 2, p1, p2, length
      shl     lengthd, 2                      ; elements -> bytes (4 bytes each)
      add     p1q, lengthq                    ; p1q = end of the p1 range
      add     p2q, lengthq                    ; p2q = end of the p2 range
      neg     lengthq                         ; negative byte offset, counts up to 0
  .loop:
      mova    m1, [p2q+lengthq]               ; minuend: p2, first vector
      mova    m0, [p2q+lengthq+mmsize]        ; minuend: p2, second vector
      psubd   m1, [p1q+lengthq]               ; p2 - p1
      psubd   m0, [p1q+lengthq+mmsize]
      mova    [p1q+lengthq], m1               ; result overwrites p1
      mova    [p1q+lengthq+mmsize], m0
      add     lengthq, mmsize*2
      jl      .loop                           ; offset still negative -> more data
      REP_RET
  ;------------------------------------------------------------------------------
  ; tak_decorrelate_sm(p1, p2, length)
  ;
  ; For each 32-bit element i, with a = p1[i] and b = p2[i]:
  ;     a     = a - (b >> 1)        ; signed (arithmetic) shift
  ;     p1[i] = a
  ;     p2[i] = a + b
  ;
  ; length is an element count. It is converted to bytes and negated so that a
  ; single negative offset walks both buffers up to their ends.
  ; Two vectors (mmsize*2 bytes) are processed per pass; the loop test is at the
  ; bottom, so the body runs at least once.
  ; NOTE(review): presumably callers guarantee aligned buffers that are padded
  ; to a multiple of mmsize*2 bytes - confirm against the call sites.
  ;------------------------------------------------------------------------------
  cglobal tak_decorrelate_sm, 3, 3, 6, p1, p2, length
      shl     lengthd, 2                      ; elements -> bytes (4 bytes each)
      add     p1q, lengthq                    ; p1q = end of the p1 range
      add     p2q, lengthq                    ; p2q = end of the p2 range
      neg     lengthq                         ; negative byte offset, counts up to 0
  .loop:
      mova    m0, [p1q+lengthq]               ; a, first vector
      mova    m1, [p2q+lengthq]               ; b, first vector
      mova    m3, [p1q+lengthq+mmsize]        ; a, second vector
      mova    m4, [p2q+lengthq+mmsize]        ; b, second vector
      mova    m2, m1                          ; keep b intact for the final a + b
      mova    m5, m4
      ; psrad, not psrld: the lanes hold signed values, so the halving has to
      ; replicate the sign bit. A logical shift would turn every negative b
      ; into a large positive number and corrupt both outputs.
      psrad   m2, 1                           ; b >> 1
      psrad   m5, 1
      psubd   m0, m2                          ; a -= b >> 1
      psubd   m3, m5
      paddd   m1, m0                          ; b += a
      paddd   m4, m3
      mova    [p1q+lengthq], m0
      mova    [p2q+lengthq], m1
      mova    [p1q+lengthq+mmsize], m3
      mova    [p2q+lengthq+mmsize], m4
      add     lengthq, mmsize*2
      jl      .loop                           ; offset still negative -> more data
      REP_RET
  ; pmulld (packed 32-bit multiply, low result) needs SSE4.1, hence the switch.
  INIT_XMM sse4
  ;------------------------------------------------------------------------------
  ; tak_decorrelate_sf(p1, p2, length, dshift, dfactor)
  ;
  ; For each 32-bit element i, with a = p1[i] and b = p2[i]:
  ;     b     = (((b >> dshift) * dfactor + 128) >> 8) << dshift
  ;     p1[i] = b - a
  ; Both right shifts are signed (arithmetic). p2 is only read.
  ;
  ; Only the first three arguments are requested in registers; dshift and
  ; dfactor are fetched once through their dshiftm / dfactorm argument names.
  ; One vector (mmsize bytes) is processed per pass; the loop test is at the
  ; bottom, so the body runs at least once.
  ; NOTE(review): presumably callers guarantee aligned buffers that are padded
  ; to a multiple of mmsize bytes - confirm against the call sites.
  ;------------------------------------------------------------------------------
  cglobal tak_decorrelate_sf, 3, 3, 5, p1, p2, length, dshift, dfactor
      shl     lengthd, 2                      ; elements -> bytes (4 bytes each)
      add     p1q, lengthq                    ; p1q = end of the p1 range
      add     p2q, lengthq                    ; p2q = end of the p2 range
      neg     lengthq                         ; negative byte offset, counts up to 0

      movd    m2, dshiftm                     ; shift count in the low dword of m2
      movd    m3, dfactorm
      pshufd  m3, m3, 0                       ; broadcast dfactor to all four lanes
      mova    m4, [pd_128]                    ; rounding bias for the >> 8 below

  .loop:
      mova    m0, [p1q+lengthq]               ; a
      mova    m1, [p2q+lengthq]               ; b
      ; psrad, not psrld, for both right shifts: the lanes hold signed values
      ; and the product can be negative, so the sign bit must be replicated.
      psrad   m1, m2                          ; b >> dshift
      pmulld  m1, m3                          ; * dfactor
      paddd   m1, m4                          ; + 128
      psrad   m1, 8                           ; >> 8
      pslld   m1, m2                          ; << dshift
      psubd   m1, m0                          ; b - a
      mova    [p1q+lengthq], m1
      add     lengthq, mmsize
      jl      .loop                           ; offset still negative -> more data
      REP_RET