;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pw_3:   times 8 dw 3
pw_7:   times 8 dw 7
pw_16:  times 8 dw 16
pw_32:  times 8 dw 32
pb_128: times 16 db 128

section .text

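; UNPACK_ADD: load two byte vectors (%3, %4) with the mov variants named by
; %5/%6 (a = aligned, u = unaligned), zero-extend them to words against m7
; (assumed to hold zero), and leave the low-half sums in %1 and the high-half
; sums in %2. Clobbers m4 and m5.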
%macro UNPACK_ADD 6
    mov%5     %1, %3
    mov%6     m5, %4
    mova      m4, %1
    mova      %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw     %1, m5
    paddw     %2, m4
%endmacro

%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
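; Vertical halfpel filter: each output byte is
;   (21*(src[0]+src[1]) + 3*(src[-2]+src[3]) - 7*(src[-1]+src[2])
;     - (src[-3]+src[4]) + 16) >> 5
; with offsets in units of stride, clamped to 0..255 by packuswb.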
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov     src0q, srcq
    lea     stridex3q, [3*strideq]
    sub     src0q, stridex3q
    pxor    m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]
    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]
    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3
    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq], m0
    add     dstq, mmsize
    add     srcq, mmsize
    add     src0q, mmsize
    sub     widthd, mmsize
    jg      .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
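; Horizontal variant of the same 8-tap filter on byte neighbours src[x-3..x+4];
; the loop runs from the end of the row down to 0 and uses unaligned loads,
; since the tap offsets straddle vector boundaries.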
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec     widthd
    pxor    m7, m7
    and     widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw  m0, [pw_7]
    pmullw  m1, [pw_7]
    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw   m0, m2
    paddw   m1, m3
    pmullw  m0, [pw_3]
    pmullw  m1, [pw_3]
    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw  m2, [pw_7]
    pmullw  m3, [pw_7]
    psubw   m0, m2
    psubw   m1, m3
    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw   m0, m2
    psubw   m1, m3

    paddw   m0, [pw_16]
    paddw   m1, [pw_16]
    psraw   m0, 5
    psraw   m1, 5
    packuswb m0, m1
    mova    [dstq + widthq], m0
    sub     widthd, mmsize
    jge     .loop
    RET
%endmacro

%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
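; Packs pairs of int16_t source rows to bytes with signed saturation, then adds
; a 128 bias (pb_128) to map signed samples to unsigned output; two destination
; rows are written per pass of the outer loop.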
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova    m0, [pb_128]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd  dst_strideq, dst_strided
    movsxd  src_strideq, src_strided
    mov     r7d, r5m
    mov     r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov     r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif

.loopy:
    lea     src2q, [srcq+src_strideq*2]
    lea     dst2q, [dstq+dst_strideq]
.loopx:
    sub     wd, mmsize
    mova    m1, [srcq +2*wq]
    mova    m2, [src2q+2*wq]
    packsswb m1, [srcq +2*wq+mmsize]
    packsswb m2, [src2q+2*wq+mmsize]
    paddb   m1, m0
    paddb   m2, m0
    mova    [dstq +wq], m1
    mova    [dst2q+wq], m2
    jg      .loopx

    lea     srcq, [srcq+src_strideq*4]
    lea     dstq, [dstq+dst_strideq*2]
    sub     hd, 2
    mov     wd, wspill
    jg      .loopy
    RET
%endm

%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
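; Adds rounded source samples ((src + 32) >> 6) to the int16_t IDWT output and
; clamps the sum to 0..255 with packuswb; one destination row per pass of the
; outer loop.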
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova    m0, [pw_32]
    add     wd, (mmsize-1)
    and     wd, ~(mmsize-1)

%if ARCH_X86_64
    movsxd  strideq, strided
    movsxd  idwt_strideq, idwt_strided
    mov     r8d, wd
    %define wspill r8d
%else
    mov     r5m, wd
    %define wspill r5m
%endif

.loop:
    sub     wd, mmsize
    movu    m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw   m1, m0
    psraw   m1, 6
    movu    m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw   m2, m0
    psraw   m2, 6
    paddw   m1, [idwtq+2*wq]
    paddw   m2, [idwtq+2*wq+mmsize]
    packuswb m1, m2
    mova    [dstq +wq], m1
    jg      .loop

    lea     srcq, [srcq + 2*strideq]
    add     dstq, strideq
    lea     idwtq, [idwtq+ 2*idwt_strideq]
    sub     hd, 1
    mov     wd, wspill
    jg      .loop
    RET
%endm

%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
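; Multiplies %1 source bytes per row by the matching OBMC weights and
; accumulates the 16-bit products into dst; the %rep block unrolls one row
; into mmsize-wide chunks, and obmcq advances by a fixed 32-byte weight stride.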
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor    m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova    m0, [srcq+i]
    mova    m1, m0
    punpcklbw m0, m4
    punpckhbw m1, m4
    mova    m2, [obmcq+i]
    mova    m3, m2
    punpcklbw m2, m4
    punpckhbw m3, m4
    pmullw  m0, m2
    pmullw  m1, m3
    movu    m2, [dstq+2*i]
    movu    m3, [dstq+2*i+mmsize]
    paddw   m0, m2
    paddw   m1, m3
    movu    [dstq+2*i], m0
    movu    [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea     srcq, [srcq+strideq]
    lea     dstq, [dstq+2*strideq]
    add     obmcq, 32
    sub     yblend, 1
    jg      .loop
    RET
%endm

INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx
HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2