You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

136 lines
3.1KB

  1. ;******************************************************************************
  2. ;* optimized bswap buffer functions
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; Source-byte indices used as the pshufb control operand by the ssse3 build
  ; of bswap32_buf (loaded into m2 with mova). Within every group of four the
  ; indices run backwards, so each 32-bit lane gets its bytes reversed.
  pb_bswap32: db  3,  2,  1,  0           ; lane 0
              db  7,  6,  5,  4           ; lane 1
              db 11, 10,  9,  8           ; lane 2
              db 15, 14, 13, 12           ; lane 3

  SECTION_TEXT
  ;------------------------------------------------------------------------------
  ; BSWAP_LOOPS %1 -- vector part of the 32-bit byte-swapping copy
  ;
  ;   %1 = a (aligned) or u (unaligned). It is pasted into mov%1, so it selects
  ;        mova or movu for EVERY vector load and store in this macro, and it
  ;        is appended to the local labels so both expansions can coexist in
  ;        one function.
  ;
  ; In:    r0  = dst pointer
  ;        r1  = src pointer
  ;        r2d = w, number of 32-bit words to process
  ;        m2  = pshufb control operand (ssse3 build only, set up by the caller)
  ; Out:   r0, r1 advanced past every byte handled here
  ;        r2  = w again (zero-extended); bits 0-1 are left for the caller
  ; Clobb: r3, m0, m1, flags; the non-ssse3 build also clobbers m2 and m3
  ;
  ; Control flow: jumps to the caller-provided label .left when bit 2 of w is
  ; clear, otherwise handles four more words and falls off the end of the macro.
  ;
  ; The word count is a C int. Only its low 32 bits are read (r2d/r3d), because
  ; on x86-64 the upper half of an int argument register is not defined by the
  ; calling convention; shifting the full 64-bit register could yield a bogus,
  ; enormous iteration count.
  ;------------------------------------------------------------------------------
  %macro BSWAP_LOOPS 1
      mov         r3d, r2d                ; keep w; r2d becomes the loop counter
      sar         r2d, 3                  ; r2d = number of 8-word (32-byte) chunks
      jz          .left4_%1

  .loop8_%1:                              ; 8 words per iteration
      mov%1       m0, [r1 +  0]
      mov%1       m1, [r1 + 16]
  %if cpuflag(ssse3)
      pshufb      m0, m2                  ; byte reversal in one shuffle
      pshufb      m1, m2
      mov%1       [r0 +  0], m0
      mov%1       [r0 + 16], m1
  %else
      ; Step 1: swap the two 16-bit halves of every dword.
      pshuflw     m0, m0, 10110001b
      pshuflw     m1, m1, 10110001b
      pshufhw     m0, m0, 10110001b
      pshufhw     m1, m1, 10110001b
      ; Step 2: swap the two bytes of every 16-bit word: (x << 8) | (x >> 8).
      mova        m2, m0
      mova        m3, m1
      psllw       m0, 8
      psllw       m1, 8
      psrlw       m2, 8
      psrlw       m3, 8
      por         m2, m0
      por         m3, m1
      mov%1       [r0 +  0], m2
      mov%1       [r0 + 16], m3
  %endif
      add         r0, 32
      add         r1, 32
      dec         r2d
      jnz         .loop8_%1

  .left4_%1:
      mov         r2d, r3d                ; give w back to the caller's tail code
      test        r3d, 4                  ; one more 4-word (16-byte) chunk?
      jz          .left
      mov%1       m0, [r1]
  %if cpuflag(ssse3)
      pshufb      m0, m2
      mov%1       [r0], m0
  %else
      pshuflw     m0, m0, 10110001b       ; same two-step swap as in the main loop
      pshufhw     m0, m0, 10110001b
      mova        m2, m0
      psllw       m0, 8
      psrlw       m2, 8
      por         m2, m0
      mov%1       [r0], m2
  %endif
      add         r1, 16
      add         r0, 16
  %endmacro
  ;------------------------------------------------------------------------------
  ; void ff_bswap32_buf(uint32_t *dst, const uint32_t *src, int w);
  ;
  ; Copies w 32-bit words from src to dst, reversing the byte order of each.
  ; Declared through cglobal as bswap32_buf; the exported symbol presumably
  ; carries the project prefix and a _sse2 / _ssse3 suffix added by x86inc
  ; (confirm against the x86inc configuration).
  ;
  ; In:    r0 = dst, r1 = src, r2d = w
  ; Uses:  r3 as scratch; m0-m2 (ssse3) or m0-m3 (sse2)
  ;
  ; Structure:
  ;   1. BSWAP_LOOPS handles 8 words per iteration, then 4 more if bit 2 of w
  ;      is set, using aligned or unaligned vector moves.
  ;   2. The tail at .left handles the remaining w & 3 words.
  ;
  ; The aligned variant uses mova for the stores as well as the loads, so it is
  ; selected only when BOTH pointers are 16-byte aligned. Testing src alone
  ; would fault on an unaligned dst.
  ;------------------------------------------------------------------------------
  %macro BSWAP32_BUF 0
  %if cpuflag(ssse3)
  cglobal bswap32_buf, 3,4,3
      mova        m2, [pb_bswap32]        ; pshufb control, kept live for the whole function
  %else
  cglobal bswap32_buf, 3,4,5
  %endif
      mov         r3, r1
      or          r3, r0                  ; low bits set if either pointer is misaligned
      test        r3, 15
      jz          .start_align
      BSWAP_LOOPS u
      jmp         .left
  .start_align:
      BSWAP_LOOPS a

  .left:                                  ; r2d = w here; only bits 0-1 remain to be handled
  %if cpuflag(ssse3)
      test        r2d, 2                  ; two words left?
      jz          .left1
      movq        m0, [r1]
      pshufb      m0, m2
      movq        [r0], m0
      add         r1, 8
      add         r0, 8
  .left1:
      test        r2d, 1                  ; one last word?
      jz          .end
      mov         r2d, [r1]
      bswap       r2d
      mov         [r0], r2d
  %else
      and         r2d, 3                  ; 0-3 words left, done one at a time
      jz          .end
  .loop2:
      mov         r3d, [r1]
      bswap       r3d
      mov         [r0], r3d
      add         r1, 4
      add         r0, 4
      dec         r2d
      jnz         .loop2
  %endif
  .end:
      RET
  %endmacro

  ; Emit one copy of the function per instruction set.
  INIT_XMM sse2
  BSWAP32_BUF
  INIT_XMM ssse3
  BSWAP32_BUF