You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

261 lines
6.4KB

  1. ;******************************************************************************
  2. ;* Copyright (c) 2010 David Conrad
  3. ;*
  4. ;* This file is part of FFmpeg.
  5. ;*
  6. ;* FFmpeg is free software; you can redistribute it and/or
  7. ;* modify it under the terms of the GNU Lesser General Public
  8. ;* License as published by the Free Software Foundation; either
  9. ;* version 2.1 of the License, or (at your option) any later version.
  10. ;*
  11. ;* FFmpeg is distributed in the hope that it will be useful,
  12. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. ;* Lesser General Public License for more details.
  15. ;*
  16. ;* You should have received a copy of the GNU Lesser General Public
  17. ;* License along with FFmpeg; if not, write to the Free Software
  18. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. ;******************************************************************************
  20. %include "x86inc.asm"
  SECTION_RODATA

  ; Broadcast constants shared by the routines in this file.
  ; pw_* hold eight 16-bit words, pb_* holds sixteen bytes.
  pw_3:       times 8  dw 3       ; pmullw factor in HPEL_FILTER
  pw_7:       times 8  dw 7       ; pmullw factor in HPEL_FILTER
  pw_16:      times 8  dw 16      ; added before the ">> 5" in HPEL_FILTER
  pw_32:      times 8  dw 32      ; added before the ">> 6" in ADD_RECT
  pb_128:     times 16 db 128     ; byte bias added with paddb in PUT_RECT

  section .text
  ;-----------------------------------------------------------------------------
  ; UNPACK_ADD lo, hi, memA, memB, movA, movB
  ;   %1, %2 : destination vector registers
  ;   %3, %4 : source operands, each one vector of bytes
  ;   %5, %6 : suffix appended to "mov" for loading %3 / %4 (the callers in this
  ;            file pass "a" or "u")
  ;
  ; Interleaves the bytes of both sources with m7 to widen them to words, then
  ;   %1 = low-half words of %3  + low-half words of %4
  ;   %2 = high-half words of %3 + high-half words of %4
  ; Every call site in this file clears m7 (pxor) beforehand, which makes the
  ; widening a zero extension.
  ; Scratch: m4, m5 (so %1/%2 must not be m4, m5 or m7).
  ;-----------------------------------------------------------------------------
  %macro UNPACK_ADD 6
      ; fetch both inputs and keep a second copy of each for the high halves
      mov%5       %1, %3
      mov%6       m5, %4
      mova        m4, %1
      mova        %2, m5

      ; bytes -> words
      punpcklbw   %1, m7                  ; low  half of source A
      punpckhbw   m4, m7                  ; high half of source A
      punpcklbw   m5, m7                  ; low  half of source B
      punpckhbw   %2, m7                  ; high half of source B

      ; pairwise sums
      paddw       %1, m5
      paddw       %2, m4
  %endmacro
  ;-----------------------------------------------------------------------------
  ; HPEL_FILTER suffix
  ;   Emits dirac_hpel_filter_v_SUFFIX and dirac_hpel_filter_h_SUFFIX.
  ;
  ;   Both produce, for every output byte,
  ;     (21*(s[0]+s[1]) + 3*(s[-2]+s[3]) - 7*(s[-1]+s[2]) - (s[-3]+s[4]) + 16) >> 5
  ;   packed back to bytes with unsigned saturation (packuswb).  The 21 and 3
  ;   weights come from the nesting 3*(7*(s[0]+s[1]) + (s[-2]+s[3])).
  ;   s[k] is the sample k rows away (vertical) or k bytes away (horizontal).
  ;
  ;   m7 is kept at zero for UNPACK_ADD; m4/m5 are clobbered by it.
  ;-----------------------------------------------------------------------------
  %macro HPEL_FILTER 1
  ; void dirac_hpel_filter_v_SUFFIX(uint8_t *dst, uint8_t *src, int stride, int width);
  ;   Handles one row, mmsize bytes per iteration, so the number of bytes
  ;   written is width rounded up to a multiple of mmsize.  All loads and the
  ;   store use aligned moves (mova).
  cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
  %ifdef ARCH_X86_64
      ; stride is a C int: the upper half of its 64-bit register is not
      ; guaranteed to be a sign extension, but it is used in 64-bit addressing.
      movsxd      strideq, strided
  %endif
      lea         stridex3q, [3*strideq]
      mov         src0q, srcq
      sub         src0q, stridex3q        ; src0 = src - 3*stride
      pxor        m7, m7
  .loop:
      ; m0/m1 = 7*(src[0] + src[1])
      UNPACK_ADD  m0, m1, [srcq], [srcq + strideq], a,a
      pmullw      m0, [pw_7]
      pmullw      m1, [pw_7]

      ; m0/m1 = 3*(m0/m1 + src[-2] + src[3])
      UNPACK_ADD  m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
      paddw       m0, m2
      paddw       m1, m3
      pmullw      m0, [pw_3]
      pmullw      m1, [pw_3]

      ; ... - 7*(src[-1] + src[2])
      UNPACK_ADD  m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
      pmullw      m2, [pw_7]
      pmullw      m3, [pw_7]
      psubw       m0, m2
      psubw       m1, m3

      ; ... - (src[-3] + src[4])
      UNPACK_ADD  m2, m3, [src0q], [srcq + strideq*4], a,a
      psubw       m0, m2
      psubw       m1, m3

      ; round, scale and pack to bytes
      paddw       m0, [pw_16]
      paddw       m1, [pw_16]
      psraw       m0, 5
      psraw       m1, 5
      packuswb    m0, m1
      mova        [dstq], m0

      add         dstq, mmsize
      add         srcq, mmsize
      add         src0q, mmsize
      sub         widthd, mmsize
      jg          .loop
      RET

  ; void dirac_hpel_filter_h_SUFFIX(uint8_t *dst, uint8_t *src, int width);
  ;   Walks the row backwards: the offset starts at (width-1) rounded down to a
  ;   multiple of mmsize and drops by mmsize until it goes negative.  Source
  ;   loads are unaligned (movu) and reach from offset-3 to offset+4+mmsize-1;
  ;   the store to dst is aligned (mova).
  cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
      dec         widthd
      pxor        m7, m7
      and         widthd, ~(mmsize-1)     ; 32-bit write also clears the top of widthq
  .loop:
      ; m0/m1 = 7*(src[0] + src[1])
      UNPACK_ADD  m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
      pmullw      m0, [pw_7]
      pmullw      m1, [pw_7]

      ; m0/m1 = 3*(m0/m1 + src[-2] + src[3])
      UNPACK_ADD  m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
      paddw       m0, m2
      paddw       m1, m3
      pmullw      m0, [pw_3]
      pmullw      m1, [pw_3]

      ; ... - 7*(src[-1] + src[2])
      UNPACK_ADD  m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
      pmullw      m2, [pw_7]
      pmullw      m3, [pw_7]
      psubw       m0, m2
      psubw       m1, m3

      ; ... - (src[-3] + src[4])
      UNPACK_ADD  m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
      psubw       m0, m2
      psubw       m1, m3

      ; round, scale and pack to bytes
      paddw       m0, [pw_16]
      paddw       m1, [pw_16]
      psraw       m0, 5
      psraw       m1, 5
      packuswb    m0, m1
      mova        [dstq + widthq], m0

      sub         widthd, mmsize
      jge         .loop                   ; offset 0 is still processed
      RET
  %endmacro
  ;-----------------------------------------------------------------------------
  ; PUT_RECT suffix
  ;   Emits put_signed_rect_clamped_SUFFIX.
  ;
  ; void put_signed_rect_clamped_SUFFIX(uint8_t *dst, int dst_stride,
  ;                                     int16_t *src, int src_stride,
  ;                                     int width, int height)
  ;
  ;   For each sample: dst = saturate_int8(src) + 128 (packsswb, then paddb
  ;   with pb_128).  Two rows are handled per outer iteration and the width is
  ;   rounded up to a multiple of mmsize, so up to mmsize-1 extra columns are
  ;   read and written.  dst_stride is applied in bytes; src_stride is scaled
  ;   by 2 (src rows are src_stride int16 elements apart).
  ;   All loads and stores are aligned moves (mova).
  ;
  ;   height is the sixth argument and is not loaded by cglobal: on x86-64 it
  ;   is copied to r10d and the rounded width is kept in r11d; on x86-32 both
  ;   stay in their stack argument slots.
  ;-----------------------------------------------------------------------------
  %macro PUT_RECT 1
  cglobal put_signed_rect_clamped_%1, 5,7,3, dst, dst_stride, src, src_stride, w, dst2, src2
      mova        m0, [pb_128]
      add         wd, (mmsize-1)
      and         wd, ~(mmsize-1)         ; round width up to whole vectors

  %ifdef ARCH_X86_64
      ; Both strides are C ints used in 64-bit address arithmetic below; the
      ; upper halves of their registers are not guaranteed to be sign-extended.
      movsxd      dst_strideq, dst_strided
      movsxd      src_strideq, src_strided
      mov         r10d, r5m
      mov         r11d, wd
      %define wspill r11d
      %define hd r10d
  %else
      mov         r4m, wd
      %define wspill r4m
      %define hd r5mp
  %endif

  .loopy:
      lea         src2q, [srcq+src_strideq*2]     ; second source row
      lea         dst2q, [dstq+dst_strideq]       ; second destination row
  .loopx:
      sub         wd, mmsize                      ; flags from here feed the jg below
      mova        m1, [srcq +2*wq]
      mova        m2, [src2q+2*wq]
      packsswb    m1, [srcq +2*wq+mmsize]
      packsswb    m2, [src2q+2*wq+mmsize]
      paddb       m1, m0
      paddb       m2, m0
      mova        [dstq +wq], m1
      mova        [dst2q+wq], m2
      jg          .loopx

      lea         srcq, [srcq+src_strideq*4]      ; advance two rows
      lea         dstq, [dstq+dst_strideq*2]
      sub         hd, 2
      mov         wd, wspill                      ; reload width; mov leaves flags intact
      jg          .loopy
      RET
  %endmacro
  ;-----------------------------------------------------------------------------
  ; ADD_RECT suffix
  ;   Emits add_rect_clamped_SUFFIX.
  ;
  ; void add_rect_clamped_SUFFIX(uint8_t *dst, uint16_t *src, int stride,
  ;                              int16_t *idwt, int idwt_stride,
  ;                              int width, int height)
  ;
  ;   For each sample: dst = saturate_uint8(((src + 32) >> 6) + idwt), where
  ;   the shift is arithmetic (psraw) and the saturation comes from packuswb.
  ;   One row per outer iteration; width is rounded up to a multiple of mmsize.
  ;   Row steps: dst += stride bytes, src += 2*stride bytes,
  ;   idwt += 2*idwt_stride bytes.
  ;   src is read with movu; idwt is used as a direct paddw memory operand and
  ;   dst is stored with mova.
  ;
  ;   The inner (column) and outer (row) loops share the single label .loop:
  ;   the width counter is reloaded from wspill before the row branch.
  ;-----------------------------------------------------------------------------
  %macro ADD_RECT 1
  cglobal add_rect_clamped_%1, 7,7,3, dst, src, stride, idwt, idwt_stride, w, h
      mova        m0, [pw_32]
      add         wd, (mmsize-1)
      and         wd, ~(mmsize-1)         ; round width up to whole vectors

  %ifdef ARCH_X86_64
      ; Both strides are C ints used in 64-bit address arithmetic below; the
      ; upper halves of their registers are not guaranteed to be sign-extended.
      movsxd      strideq, strided
      movsxd      idwt_strideq, idwt_strided
      mov         r11d, wd
      %define wspill r11d
  %else
      mov         r5m, wd
      %define wspill r5m
  %endif

  .loop:
      sub         wd, mmsize              ; flags from here feed the first jg below
      movu        m1, [srcq +2*wq]        ; FIXME: ensure alignment
      paddw       m1, m0
      psraw       m1, 6
      movu        m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
      paddw       m2, m0
      psraw       m2, 6
      paddw       m1, [idwtq+2*wq]
      paddw       m2, [idwtq+2*wq+mmsize]
      packuswb    m1, m2
      mova        [dstq +wq], m1
      jg          .loop                   ; next column group of this row

      lea         srcq, [srcq + 2*strideq]
      add         dstq, strideq
      lea         idwtq, [idwtq+ 2*idwt_strideq]
      sub         hd, 1
      mov         wd, wspill              ; reload width; mov leaves flags intact
      jg          .loop                   ; next row
      RET
  %endmacro
  ;-----------------------------------------------------------------------------
  ; ADD_OBMC xblen, suffix
  ;   Emits add_dirac_obmc{xblen}_SUFFIX.
  ;
  ; void add_dirac_obmc{xblen}_SUFFIX(uint16_t *dst, uint8_t *src, int stride,
  ;                                   uint8_t *obmc_weight, int yblen)
  ;
  ;   For yblen rows and xblen columns: dst[x] += src[x] * obmc_weight[x],
  ;   using the low 16 bits of the product (pmullw) and wrapping adds (paddw).
  ;   The row body is unrolled xblen/mmsize times.
  ;   Row steps: src += stride bytes, dst += 2*stride bytes,
  ;   obmc_weight += 32 bytes (fixed).
  ;   src and obmc_weight are read with mova; dst is accessed with movu.
  ;
  ;   NOTE(review): cglobal asks for 6 argument loads although only 5
  ;   parameters are named; kept unchanged - confirm against the C prototype.
  ;-----------------------------------------------------------------------------
  %macro ADD_OBMC 2
  cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
  %ifdef ARCH_X86_64
      ; stride is a C int used in 64-bit address arithmetic below; the upper
      ; half of its register is not guaranteed to be sign-extended.
      movsxd      strideq, strided
  %endif
      pxor        m4, m4                  ; zero, for byte -> word unpacking
  .loop:
  %assign i 0
  %rep %1 / mmsize
      ; widen mmsize source pixels
      mova        m0, [srcq+i]
      mova        m1, m0
      punpcklbw   m0, m4
      punpckhbw   m1, m4
      ; widen the matching weights
      mova        m2, [obmcq+i]
      mova        m3, m2
      punpcklbw   m2, m4
      punpckhbw   m3, m4
      ; pixel * weight
      pmullw      m0, m2
      pmullw      m1, m3
      ; accumulate into dst
      movu        m2, [dstq+2*i]
      movu        m3, [dstq+2*i+mmsize]
      paddw       m0, m2
      paddw       m1, m3
      movu        [dstq+2*i], m0
      movu        [dstq+2*i+mmsize], m1
  %assign i i+mmsize
  %endrep
      lea         srcq, [srcq+strideq]
      lea         dstq, [dstq+2*strideq]
      add         obmcq, 32
      sub         yblend, 1
      jg          .loop
      RET
  %endmacro
  ;-----------------------------------------------------------------------------
  ; Instantiations
  ;-----------------------------------------------------------------------------

  ; MMX versions: everything except the 8-wide OBMC is emitted only when
  ; ARCH_X86_64 is not defined.
  INIT_MMX
  %ifndef ARCH_X86_64
      PUT_RECT    mmx
      ADD_RECT    mmx

      HPEL_FILTER mmx

      ADD_OBMC    32, mmx
      ADD_OBMC    16, mmx
  %endif
  ADD_OBMC    8, mmx                      ; emitted regardless of ARCH_X86_64

  ; SSE2 versions: always emitted.
  INIT_XMM
  PUT_RECT    sse2
  ADD_RECT    sse2

  HPEL_FILTER sse2

  ADD_OBMC    32, sse2
  ADD_OBMC    16, sse2