;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_7:                      times 8 dw 7
convert_to_unsigned_10bit: times 4 dd 0x200
clip_10bit:                times 8 dw 0x3ff

cextern pw_3
cextern pw_16
cextern pw_32
cextern pb_80

SECTION .text
%macro UNPACK_ADD 6
    mov%5     %1, %3
    mov%6     m5, %4
    mova      m4, %1
    mova      %2, m5
    punpcklbw %1, m7
    punpcklbw m5, m7
    punpckhbw m4, m7
    punpckhbw %2, m7
    paddw     %1, m5
    paddw     %2, m4
%endmacro
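
; What UNPACK_ADD computes, as a hedged C sketch (names are illustrative
; only; shown for the SSE2 case where mmsize == 16). %1/%2 receive the
; zero-extended word sums of the two byte rows %3 and %4:
;
;   for (int i = 0; i < 8; i++) {
;       lo[i] = a[i]     + b[i];       // punpcklbw with m7 (zero) + paddw
;       hi[i] = a[i + 8] + b[i + 8];   // punpckhbw with m7 (zero) + paddw
;   }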
%macro HPEL_FILTER 1
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
    mov       src0q, srcq
    lea       stridex3q, [3*strideq]
    sub       src0q, stridex3q
    pxor      m7, m7
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
    pmullw    m0, [pw_7]
    pmullw    m1, [pw_7]
    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
    paddw     m0, m2
    paddw     m1, m3
    pmullw    m0, [pw_3]
    pmullw    m1, [pw_3]
    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
    pmullw    m2, [pw_7]
    pmullw    m3, [pw_7]
    psubw     m0, m2
    psubw     m1, m3
    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
    psubw     m0, m2
    psubw     m1, m3
    paddw     m0, [pw_16]
    paddw     m1, [pw_16]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    mova      [dstq], m0
    add       dstq, mmsize
    add       srcq, mmsize
    add       src0q, mmsize
    sub       widthd, mmsize
    jg .loop
    RET

; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
    dec       widthd
    pxor      m7, m7
    and       widthd, ~(mmsize-1)
.loop:
    ; 7*(src[0] + src[1])
    UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
    pmullw    m0, [pw_7]
    pmullw    m1, [pw_7]
    ; 3*( ... + src[-2] + src[3])
    UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
    paddw     m0, m2
    paddw     m1, m3
    pmullw    m0, [pw_3]
    pmullw    m1, [pw_3]
    ; ... - 7*(src[-1] + src[2])
    UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
    pmullw    m2, [pw_7]
    pmullw    m3, [pw_7]
    psubw     m0, m2
    psubw     m1, m3
    ; ... - (src[-3] + src[4])
    UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
    psubw     m0, m2
    psubw     m1, m3
    paddw     m0, [pw_16]
    paddw     m1, [pw_16]
    psraw     m0, 5
    psraw     m1, 5
    packuswb  m0, m1
    mova      [dstq + widthq], m0
    sub       widthd, mmsize
    jge .loop
    RET
%endmacro
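
; Per-pixel reference for the half-pel filter above (a hedged C sketch, not
; the FFmpeg C code; s[] indexes pixels along the filtered axis, i.e. rows
; for the vertical variant and columns for the horizontal one). The 7x then
; 3x multiplies fold into the 21/3 taps:
;
;   int v = 21*(s[0]  + s[1]) + 3*(s[-2] + s[3])
;         -  7*(s[-1] + s[2]) -   (s[-3] + s[4]);
;   dst[x] = av_clip_uint8((v + 16) >> 5);   // psraw 5, packuswb saturation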
%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
    mova      m0, [pb_80]
    add       wd, (mmsize-1)
    and       wd, ~(mmsize-1)
%if ARCH_X86_64
    movsxd    dst_strideq, dst_strided
    movsxd    src_strideq, src_strided
    mov       r7d, r5m
    mov       r8d, wd
    %define wspill r8d
    %define hd r7d
%else
    mov       r4m, wd
    %define wspill r4m
    %define hd r5mp
%endif
.loopy:
    lea       src2q, [srcq+src_strideq]
    lea       dst2q, [dstq+dst_strideq]
.loopx:
    sub       wd, mmsize
    mova      m1, [srcq +2*wq]
    mova      m2, [src2q+2*wq]
    packsswb  m1, [srcq +2*wq+mmsize]
    packsswb  m2, [src2q+2*wq+mmsize]
    paddb     m1, m0
    paddb     m2, m0
    mova      [dstq +wq], m1
    mova      [dst2q+wq], m2
    jg .loopx
    lea       srcq, [srcq+src_strideq*2]
    lea       dstq, [dstq+dst_strideq*2]
    sub       hd, 2
    mov       wd, wspill
    jg .loopy
    RET
%endm
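
; Scalar reference for put_signed_rect_clamped (a hedged sketch; av_clip is
; FFmpeg's usual clamping helper). packsswb saturates to [-128,127] and the
; pb_80 add rebiases to unsigned:
;
;   for (int y = 0; y < height; y++)
;       for (int x = 0; x < width; x++)
;           dst[y*dst_stride + x] = av_clip(src[y*src_stride + x], -128, 127) + 128;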
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
    mova      m0, [pw_32]
    add       wd, (mmsize-1)
    and       wd, ~(mmsize-1)
%if ARCH_X86_64
    movsxd    strideq, strided
    movsxd    idwt_strideq, idwt_strided
    mov       r8d, wd
    %define wspill r8d
%else
    mov       r5m, wd
    %define wspill r5m
%endif
.loop:
    sub       wd, mmsize
    movu      m1, [srcq +2*wq] ; FIXME: ensure alignment
    paddw     m1, m0
    psraw     m1, 6
    movu      m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
    paddw     m2, m0
    psraw     m2, 6
    paddw     m1, [idwtq+2*wq]
    paddw     m2, [idwtq+2*wq+mmsize]
    packuswb  m1, m2
    mova      [dstq +wq], m1
    jg .loop
    lea       srcq, [srcq + 2*strideq]
    add       dstq, strideq
    lea       idwtq, [idwtq+ 2*idwt_strideq]
    sub       hd, 1
    mov       wd, wspill
    jg .loop
    RET
%endm
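
; Scalar reference for add_rect_clamped (a hedged sketch). Judging from the
; "+32 >> 6" rounding, src appears to hold pixel sums scaled by 64 (e.g.
; OBMC-weighted prediction); that scaling is an assumption, not stated here:
;
;   for (int x = 0; x < width; x++)
;       dst[x] = av_clip_uint8(((src[x] + 32) >> 6) + idwt[x]);  // packuswb clip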
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
    pxor      m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
    mova      m0, [srcq+i]
    mova      m1, m0
    punpcklbw m0, m4
    punpckhbw m1, m4
    mova      m2, [obmcq+i]
    mova      m3, m2
    punpcklbw m2, m4
    punpckhbw m3, m4
    pmullw    m0, m2
    pmullw    m1, m3
    movu      m2, [dstq+2*i]
    movu      m3, [dstq+2*i+mmsize]
    paddw     m0, m2
    paddw     m1, m3
    movu      [dstq+2*i], m0
    movu      [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
    lea       srcq, [srcq+strideq]
    lea       dstq, [dstq+2*strideq]
    add       obmcq, 32
    sub       yblend, 1
    jg .loop
    RET
%endm
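
; Scalar reference for add_dirac_obmc (a hedged sketch; BLEN stands for the
; block width %1, and the products are accumulated at 16-bit precision):
;
;   for (int y = 0; y < yblen; y++) {
;       for (int x = 0; x < BLEN; x++)
;           dst[x] += src[x] * obmc_weight[x];
;       src         += stride;
;       dst         += stride;   // uint16_t elements, i.e. 2*stride bytes
;       obmc_weight += 32;
;   }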
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx
HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx

INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2

INIT_XMM sse4
; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride, const int qf, const int qs, int tot_v, int tot_h)
cglobal dequant_subband_32, 7, 7, 4, src, dst, stride, qf, qs, tot_v, tot_h
    movd      m2, qfd
    movd      m3, qsd
    SPLATD    m2
    SPLATD    m3
    mov       r4, tot_hq
    mov       r3, dstq
.loop_v:
    mov       tot_hq, r4
    mov       dstq, r3
.loop_h:
    movu      m0, [srcq]
    pabsd     m1, m0
    pmulld    m1, m2
    paddd     m1, m3
    psrld     m1, 2
    psignd    m1, m0
    movu      [dstq], m1
    add       srcq, mmsize
    add       dstq, mmsize
    sub       tot_hq, 4
    jg .loop_h
    lea       srcq, [srcq + 4*tot_hq]
    add       r3, strideq
    dec       tot_vd
    jg .loop_v
    RET
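
; Per-coefficient reference for dequant_subband_32 (a hedged C sketch of the
; pabsd/pmulld/paddd/psrld/psignd sequence above):
;
;   int32_t c = in[x];
;   int32_t m = (abs(c) * qf + qs) >> 2;      // unsigned magnitude
;   out[x] = c > 0 ? m : (c < 0 ? -m : 0);    // psignd restores the sign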
INIT_XMM sse4
; void put_signed_rect_clamped_10(uint8_t *dst, int dst_stride, const uint8_t *src, int src_stride, int width, int height)
%if ARCH_X86_64
cglobal put_signed_rect_clamped_10, 6, 8, 5, dst, dst_stride, src, src_stride, w, h, t1, t2
%else
cglobal put_signed_rect_clamped_10, 5, 7, 5, dst, dst_stride, src, src_stride, w, t1, t2
    %define hd r5mp
%endif
    shl       wd, 2
    add       srcq, wq
    neg       wq
    mov       t2q, dstq
    mov       t1q, wq
    pxor      m2, m2
    mova      m3, [clip_10bit]
    mova      m4, [convert_to_unsigned_10bit]
.loop_h:
    mov       dstq, t2q
    mov       wq, t1q
.loop_w:
    movu      m0, [srcq+wq+0*mmsize]
    movu      m1, [srcq+wq+1*mmsize]
    paddd     m0, m4
    paddd     m1, m4
    packusdw  m0, m0, m1
    CLIPW     m0, m2, m3 ; packusdw saturates so it's fine
    movu      [dstq], m0
    add       dstq, 1*mmsize
    add       wq, 2*mmsize
    jl .loop_w
    add       srcq, src_strideq
    add       t2q, dst_strideq
    sub       hd, 1
    jg .loop_h
    RET
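
; Per-sample reference for put_signed_rect_clamped_10 (a hedged sketch; the
; "shl wd, 2" indicates src is read as int32 coefficients, and dst is written
; as uint16 10-bit pixels):
;
;   for (int x = 0; x < width; x++)
;       dst16[x] = av_clip(src32[x] + 512, 0, 1023);  // paddd 0x200, CLIPW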