You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

165 lines
3.9KB

  1. ;******************************************************************************
  2. ;* SIMD-optimized JPEG2000 DSP functions
  3. ;* Copyright (c) 2014 Nicolas Bertrand
  4. ;* Copyright (c) 2015 James Almer
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "libavutil/x86/x86util.asm"
  ; Multipliers consumed by ICT_FLOAT. Each value is repeated 8 times
  ; (8 * 4 bytes = 32 bytes) so that a single movaps load fills a whole
  ; xmm or ymm register with the same coefficient.
  ; NOTE(review): the 32 passed to SECTION_RODATA is presumably the section
  ; alignment needed by the aligned ymm loads -- confirm against x86inc.
  SECTION_RODATA 32

  pf_ict0:    times 8 dd 1.402      ; weight of src2 added into the new src0
  pf_ict1:    times 8 dd 0.34413    ; weight of src1 subtracted for the new src1
  pf_ict2:    times 8 dd 0.71414    ; weight of src2 subtracted for the new src1
  pf_ict3:    times 8 dd 1.772      ; weight of src1 added into the new src2

  SECTION .text
  ;***********************************************************************
  ; ff_ict_float_<opt>(float *src0, float *src1, float *src2, int csize)
  ;
  ; In-place transform of three float planes. With s0/s1/s2 being the
  ; values loaded from src0/src1/src2, every element becomes:
  ;     src0 = s0 + pf_ict0 * s2
  ;     src1 = s0 - pf_ict1 * s1 - pf_ict2 * s2
  ;     src2 = s0 + pf_ict3 * s1
  ;
  ; %1 = vector register count forwarded to cglobal.
  ;
  ; All three planes are accessed with movaps and therefore have to be
  ; aligned to mmsize. The loop always handles whole vectors: if csize is
  ; not a multiple of mmsize/4, the elements up to the next vector boundary
  ; are read and written as well.
  ; csize <= 0 returns immediately without touching memory.
  ;***********************************************************************
  %macro ICT_FLOAT 1
  cglobal ict_float, 4, 4, %1, src0, src1, src2, csize
      ; Nothing to do for an empty (or negative) element count. The loop
      ; below tests its condition only after the first pass, so without
      ; this check one full vector would still be transformed.
      test      csized, csized
      jle       .done

      ; Convert the element count to a byte count (the 32-bit shift also
      ; clears the upper half of csizeq on x86-64), point every plane at
      ; its end and index with a negative offset that counts up to zero.
      shl       csized, 2
      add       src0q, csizeq
      add       src1q, csizeq
      add       src2q, csizeq
      neg       csizeq

      ; Coefficients are kept in registers wherever a spare one exists.
      movaps    m6, [pf_ict0]
      movaps    m7, [pf_ict1]
      %define ICT0 m6
      %define ICT1 m7
  %if ARCH_X86_64
      movaps    m8, [pf_ict2]
      %define ICT2 m8
    %if cpuflag(avx)
      ; m3 is not needed as scratch by the AVX/FMA code paths
      movaps    m3, [pf_ict3]
      %define ICT3 m3
    %else
      ; the plain SSE path uses m3 as scratch, so pf_ict3 goes to m9
      movaps    m9, [pf_ict3]
      %define ICT3 m9
    %endif
  %else ; ARCH_X86_32
      ; only m0-m7 exist here: pf_ict2 is used straight from memory
      %define ICT2 [pf_ict2]
    %if cpuflag(avx)
      movaps    m3, [pf_ict3]
      %define ICT3 m3
    %else
      %define ICT3 [pf_ict3]
    %endif
  %endif ; ARCH

  align 16
  .loop:
      movaps    m0, [src0q+csizeq]          ; m0 = s0
      movaps    m1, [src1q+csizeq]          ; m1 = s1
      movaps    m2, [src2q+csizeq]          ; m2 = s2

  %if cpuflag(fma4) || cpuflag(fma3)
    %if cpuflag(fma4)
      fnmaddps  m5, m1, ICT1, m0            ; m5 = s0 - ICT1*s1
      fmaddps   m4, m2, ICT0, m0            ; m4 = s0 + ICT0*s2
    %else ; fma3
      ; copy first so that the destination is also one of the sources
      movaps    m5, m1
      movaps    m4, m2
      fnmaddps  m5, m5, ICT1, m0            ; m5 = s0 - ICT1*s1
      fmaddps   m4, m4, ICT0, m0            ; m4 = s0 + ICT0*s2
    %endif
      fmaddps   m0, m1, ICT3, m0            ; m0 = s0 + ICT3*s1
      fnmaddps  m5, m2, ICT2, m5            ; m5 = s0 - ICT1*s1 - ICT2*s2
  %else ; non FMA
    %if cpuflag(avx)
      mulps     m5, m1, ICT1                ; m5 = ICT1*s1
      mulps     m4, m2, ICT0                ; m4 = ICT0*s2
      mulps     m1, m1, ICT3                ; m1 = ICT3*s1
      mulps     m2, m2, ICT2                ; m2 = ICT2*s2
      subps     m5, m0, m5                  ; m5 = s0 - ICT1*s1
    %else ; sse
      movaps    m3, m1
      movaps    m4, m2
      movaps    m5, m0
      mulps     m3, ICT1                    ; m3 = ICT1*s1
      mulps     m4, ICT0                    ; m4 = ICT0*s2
      mulps     m1, ICT3                    ; m1 = ICT3*s1
      mulps     m2, ICT2                    ; m2 = ICT2*s2
      subps     m5, m3                      ; m5 = s0 - ICT1*s1
    %endif
      addps     m4, m4, m0                  ; m4 = s0 + ICT0*s2
      addps     m0, m0, m1                  ; m0 = s0 + ICT3*s1
      subps     m5, m5, m2                  ; m5 = s0 - ICT1*s1 - ICT2*s2
  %endif

      movaps    [src0q+csizeq], m4
      movaps    [src2q+csizeq], m0
      movaps    [src1q+csizeq], m5
      add       csizeq, mmsize              ; next vector; offset < 0 => more data
      jl        .loop
  .done:
      REP_RET
  %endmacro
  ; One ict_float variant per instruction set. The macro argument is the
  ; vector register count handed to cglobal: the SSE build references up to
  ; m9 on x86-64 (10 registers), the AVX and FMA builds up to m8 (9).
  INIT_XMM sse
  ICT_FLOAT 10

  INIT_YMM avx
  ICT_FLOAT 9

  ; The FMA4 variant is assembled only when HAVE_FMA4_EXTERNAL is non-zero.
  %if HAVE_FMA4_EXTERNAL
    INIT_XMM fma4
    ICT_FLOAT 9
  %endif

  INIT_YMM fma3
  ICT_FLOAT 9
  ;***************************************************************************
  ; ff_rct_int_<opt>(int32_t *src0, int32_t *src1, int32_t *src2, int csize)
  ;
  ; In-place transform of three int32 planes. With s0/s1/s2 being the values
  ; loaded from src0/src1/src2, every element becomes:
  ;     t    = s0 - ((s1 + s2) >> 2)        ; arithmetic shift
  ;     src1 = t
  ;     src2 = s1 + t
  ;     src0 = s2 + t
  ;
  ; All three planes are accessed with mova and therefore have to be aligned
  ; to mmsize. The loop always handles whole vectors: if csize is not a
  ; multiple of mmsize/4, the elements up to the next vector boundary are
  ; read and written as well.
  ; csize <= 0 returns immediately without touching memory.
  ;***************************************************************************
  %macro RCT_INT 0
  cglobal rct_int, 4, 4, 4, src0, src1, src2, csize
      ; Nothing to do for an empty (or negative) element count. The loop
      ; below tests its condition only after the first pass, so without
      ; this check one full vector would still be transformed.
      test      csized, csized
      jle       .done

      ; Convert the element count to a byte count (the 32-bit shift also
      ; clears the upper half of csizeq on x86-64), point every plane at
      ; its end and index with a negative offset that counts up to zero.
      shl       csized, 2
      add       src0q, csizeq
      add       src1q, csizeq
      add       src2q, csizeq
      neg       csizeq

  align 16
  .loop:
      mova      m1, [src1q+csizeq]          ; m1 = s1
      mova      m2, [src2q+csizeq]          ; m2 = s2
      mova      m0, [src0q+csizeq]          ; m0 = s0
      paddd     m3, m1, m2                  ; m3 = s1 + s2
      psrad     m3, 2                       ; m3 = (s1 + s2) >> 2
      psubd     m0, m3                      ; m0 = t
      paddd     m1, m0                      ; m1 = s1 + t
      paddd     m2, m0                      ; m2 = s2 + t
      mova      [src1q+csizeq], m0
      mova      [src2q+csizeq], m1
      mova      [src0q+csizeq], m2
      add       csizeq, mmsize              ; next vector; offset < 0 => more data
      jl        .loop
  .done:
      REP_RET
  %endmacro
  ; One rct_int variant per instruction set: a 128-bit SSE2 build, plus a
  ; 256-bit AVX2 build that is assembled only when HAVE_AVX2_EXTERNAL is
  ; non-zero.
  INIT_XMM sse2
  RCT_INT

  %if HAVE_AVX2_EXTERNAL
    INIT_YMM avx2
    RCT_INT
  %endif