You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

133 lines
3.0KB

  1. ;******************************************************************************
  2. ;* optimized bswap buffer functions
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; Byte-shuffle control vector used with pshufb: within every 32-bit lane the
  ; four source bytes are selected in reverse order (3,2,1,0 for lane 0, and so
  ; on), which byte-swaps each dword of a 16-byte register.
  pb_bswap32: db  3,  2,  1,  0           ; lane 0
              db  7,  6,  5,  4           ; lane 1
              db 11, 10,  9,  8           ; lane 2
              db 15, 14, 13, 12           ; lane 3

  SECTION .text
  ;------------------------------------------------------------------------------
  ; BSWAP_LOOPS <a|u>
  ;
  ; Byte-swaps 32-bit words from [r1] into [r0]: first in chunks of eight dwords
  ; (32 bytes), then - if bit 2 of the count is set - one chunk of four dwords
  ; (16 bytes).
  ;
  ;   %1   = a or u; selects mova or movu for every vector load and store, and
  ;          is appended to the labels defined here so that both expansions can
  ;          live inside the same function
  ;   r0   = destination pointer, advanced past the data written
  ;   r1   = source pointer, advanced past the data read
  ;   r2d  = dword count on entry; holds the same count again on exit
  ;   r3d  = scratch copy of the count
  ;   m0/m1 = data registers
  ;   m2   = ssse3 build: pshufb mask, must already be loaded by the user of
  ;          this macro; sse2 build: temporary
  ;   m3   = sse2 build only: temporary
  ;
  ; When the four-dword step is not needed the macro jumps to .left, a label the
  ; enclosing function has to provide; otherwise execution falls off the end.
  ;------------------------------------------------------------------------------
  %macro BSWAP_LOOPS 1
      mov       r3d, r2d                  ; remember the full count
      sar       r2d, 3                    ; r2d = number of 8-dword chunks
      jz        .chunk4_%1                ; fewer than eight dwords: skip the loop

  .chunk8_%1:
      mov%1     m0, [r1 +  0]
      mov%1     m1, [r1 + 16]
  %if cpuflag(ssse3)
      ; one shuffle per register reverses the bytes of all four dwords
      pshufb    m0, m2
      pshufb    m1, m2
      mov%1     [r0 +  0], m0
      mov%1     [r0 + 16], m1
  %else
      ; first register: swap the 16-bit halves of each dword, then swap the two
      ; bytes of each 16-bit word (shift left, shift right, combine) -> m2
      pshuflw   m0, m0, 10110001b
      pshufhw   m0, m0, 10110001b
      mova      m2, m0
      psllw     m0, 8
      psrlw     m2, 8
      por       m2, m0
      ; second register: same sequence, result -> m3
      pshuflw   m1, m1, 10110001b
      pshufhw   m1, m1, 10110001b
      mova      m3, m1
      psllw     m1, 8
      psrlw     m3, 8
      por       m3, m1
      mov%1     [r0 +  0], m2
      mov%1     [r0 + 16], m3
  %endif
      add       r1, 32
      add       r0, 32
      dec       r2d
      jnz       .chunk8_%1

  .chunk4_%1:
      mov       r2d, r3d                  ; hand the original count back in r2d
      test      r2d, 4                    ; bit 2 set -> one more group of four
      jz        .left
      mov%1     m0, [r1]
  %if cpuflag(ssse3)
      pshufb    m0, m2
      mov%1     [r0], m0
  %else
      pshuflw   m0, m0, 10110001b
      pshufhw   m0, m0, 10110001b
      mova      m2, m0
      psllw     m0, 8
      psrlw     m2, 8
      por       m2, m0
      mov%1     [r0], m2
  %endif
      add       r0, 16
      add       r1, 16
  %endmacro
  ;------------------------------------------------------------------------------
  ; void bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
  ;
  ; (Name as passed to cglobal; the exported symbol presumably carries the
  ; project's prefix - confirm against the C prototype.)
  ;
  ; Writes the byte-swapped form of w 32-bit words from src to dst.
  ;
  ;   r0  = dst
  ;   r1  = src
  ;   r2d = w (count of 32-bit words)
  ;   r3d = scratch
  ;   m2  = ssse3 build: pb_bswap32 shuffle mask for the whole function
  ;
  ; The bulk of the work is done by BSWAP_LOOPS (groups of eight and four
  ; dwords); the code after .left handles the remaining 0..3 dwords.
  ;
  ; The aligned (mova) variant of the loops is used only when BOTH pointers are
  ; 16-byte aligned: BSWAP_LOOPS applies the same move flavour to its loads from
  ; r1 and its stores to r0, so checking the source alone would let an aligned
  ; store hit a misaligned destination.
  ;
  ; NOTE(review): w <= 0 is not rejected here; a negative w would enter the
  ; chunk loop of BSWAP_LOOPS with a negative counter. Presumably callers
  ; guarantee w >= 0 - confirm.
  ;------------------------------------------------------------------------------
  %macro BSWAP32_BUF 0
  %if cpuflag(ssse3)
  cglobal bswap32_buf, 3,4,3
      mova      m2, [pb_bswap32]          ; shuffle mask stays in m2 throughout
  %else
  cglobal bswap32_buf, 3,4,5
  %endif
      ; r3d is free until BSWAP_LOOPS overwrites it, so use it to combine the
      ; low bits of both pointers; only bits 0..3 matter for the alignment test.
      mov       r3d, r0d
      or        r3d, r1d
      test      r3d, 15
      jz        .start_align
      BSWAP_LOOPS u
      jmp       .left
  .start_align:
      BSWAP_LOOPS a
  .left:
      ; r2d holds the original count again here; only its low two bits are
      ; still outstanding.
  %if cpuflag(ssse3)
      test      r2d, 2
      jz        .left1
      movq      m0, [r1]                  ; two dwords at once via the low half
      pshufb    m0, m2
      movq      [r0], m0
      add       r1, 8
      add       r0, 8
  .left1:
      test      r2d, 1
      jz        .end
      mov       r2d, [r1]                 ; last dword; the count is no longer needed
      bswap     r2d
      mov       [r0], r2d
  %else
      and       r2d, 3                    ; r2d = number of leftover dwords
      jz        .end
  .loop2:
      mov       r3d, [r1]
      bswap     r3d
      mov       [r0], r3d
      add       r1, 4
      add       r0, 4
      dec       r2d
      jnz       .loop2
  %endif
  .end:
      RET
  %endmacro

  ; Emit one copy of the function per instruction set.
  INIT_XMM sse2
  BSWAP32_BUF
  INIT_XMM ssse3
  BSWAP32_BUF