;; NOTE(review): the lines that originally preceded this file were web-page
;; residue from a repository listing (a Gitea topic-selection hint plus the
;; counters "291 lines" / "6.6KB") — not part of the source file itself.
;******************************************************************************
;* SIMD lossless video DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2014 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

cextern pb_15

; pshufb control masks. A control byte with its high bit set (-1) zeroes the
; destination lane, so the "zz" positions in the names mark zeroed bytes.
; NOTE: pb_zzzzzzzz77777777 and pb_7 are deliberately adjacent: the 16-byte
; load `mova m6, [pb_zzzzzzzz77777777]` in the sse4 add_left_pred spans both,
; yielding {8 x -1, 8 x 7}, while the mmx code loads pb_7 alone (8 bytes).
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
; Likewise pb_ef / pb_67 each emit 16 bytes via `times 8 db a,b`.
pb_ef:               times 8 db 14,15
pb_67:               times 8 db  6, 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1, 3, 3, 3, 3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1, 1, 1,-1,-1, 5, 5,-1,-1, 9, 9,-1,-1,13,13
pb_zzzz2323zzzzabab: db -1,-1,-1,-1, 2, 3, 2, 3,-1,-1,-1,-1,10,11,10,11
pb_zzzzzzzz67676767: db -1,-1,-1,-1,-1,-1,-1,-1, 6, 7, 6, 7, 6, 7, 6, 7

SECTION .text
; void ff_add_median_pred_mmxext(uint8_t *dst, const uint8_t *top,
;                                const uint8_t *diff, int w,
;                                int *left, int *left_top)
; Median prediction: for each byte,
;   dst[i] = median(t, l, t - tl + l) + diff[i]
; where t = top[i], tl = top[i-1], l = previous output byte.
; The left predictor is inherently serial, so each vector is processed one
; byte lane at a time inside the %rep loop, accumulating results into m7.
%macro MEDIAN_PRED 0
cglobal add_median_pred, 6,6,8, dst, top, diff, w, left, left_top
    movu    m0, [topq]              ; first vector of the top row (t)
    mova    m2, m0
    movd    m4, [left_topq]         ; seed the top-left neighbour from *left_top
    LSHIFT  m2, 1                   ; t shifted up one byte lane
    mova    m1, m0                  ; keep t for the loop (previous-vector carry)
    por     m4, m2                  ; tl = { *left_top, t[0..n-2] }
    movd    m3, [leftq]             ; running left value l, seeded from *left
    psubb   m0, m4                  ; m0 = t - tl (gradient term)
    add     dstq, wq                ; point all buffers past the end and ...
    add     topq, wq
    add     diffq, wq
    neg     wq                      ; ... index with wq running from -w to 0
    jmp     .skip                   ; t - tl for the first vector is ready
.loop:
    movu    m4, [topq+wq]
    mova    m0, m4
    LSHIFT  m4, 1                   ; shift t, then splice in the previous
    por     m4, m1                  ; vector's carry (m1) to form tl
    mova    m1, m0                  ; remember this t for the next iteration
    psubb   m0, m4                  ; m0 = t - tl
.skip:
    movu    m2, [diffq+wq]          ; residuals for this vector
%assign i 0
%rep mmsize
    ; One byte lane per pass: m0/m1/m2 are right-shifted each pass so the
    ; current lane is always lane 0; m3 holds l from the previous lane.
    mova    m4, m0
    paddb   m4, m3                  ; t - tl + l
    mova    m5, m3
    pmaxub  m3, m1                  ; median-of-3 via min/max network:
    pminub  m5, m1
    pminub  m3, m4
    pmaxub  m3, m5                  ; m3 = median(t, l, t - tl + l)
    paddb   m3, m2                  ; + residual -> output byte = next l
%if i==0
    mova    m7, m3
    LSHIFT  m7, mmsize-1            ; deposit output byte 0 at the top of m7
%else
    mova    m6, m3
    RSHIFT  m7, 1                   ; shift accumulated bytes down and
    LSHIFT  m6, mmsize-1            ; append output byte i at the top
    por     m7, m6
%endif
%if i<mmsize-1
    RSHIFT  m0, 1                   ; advance t-tl, t and diff to the next lane
    RSHIFT  m1, 1
    RSHIFT  m2, 1
%endif
%assign i i+1
%endrep
    movu    [dstq+wq], m7           ; store the assembled output vector
    add     wq, mmsize
    jl      .loop
    movzx   r2d, byte [dstq-1]      ; write back final left pixel ...
    mov     [leftq], r2d            ; ... through *left
    movzx   r2d, byte [topq-1]      ; and final top pixel ...
    mov     [left_topq], r2d        ; ... through *left_top
    RET
%endmacro
; The mmxext version is only useful on x86_32: every x86_64 CPU has SSE2.
%if ARCH_X86_32
INIT_MMX mmxext
MEDIAN_PRED
%endif
INIT_XMM sse2
MEDIAN_PRED
; Shared loop body for ff_add_left_pred(): left-prediction of bytes, i.e. an
; in-vector prefix sum of src plus the running carry, stored to dst.
; %1 = dst_is_aligned, %2 = src_is_aligned
; Expects the caller to have preloaded the pshufb masks (m3, m4, m5 and,
; for xmm, m6) and to have placed the initial left value in the top byte
; of m0. Ends in RET, so each expansion is a complete tail of the function.
%macro ADD_LEFT_LOOP 2
    add     srcq, wq                ; index from the end with negative wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    ; Prefix sum by doubling: after step k each byte holds the sum of the
    ; 2^k bytes ending at its position.
    mova    m2, m1
    psllw   m1, 8                   ; step 1: add the preceding byte
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3                  ; step 2: add the sum of the preceding pair
    paddb   m1, m2
    pshufb  m0, m5                  ; broadcast previous vector's last output
    mova    m2, m1
    pshufb  m1, m4                  ; step 3: add the sum of the preceding four
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6                  ; step 4 (xmm only): add the low half's sum
    paddb   m1, m2
%endif
    paddb   m0, m1                  ; add the carried-in left value
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0           ; unaligned store done as two 8-byte halves
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl      %%.loop
    ; Return the last output byte (the new left value): after the loop
    ; wq >= 0 is the overshoot, so mmsize-1-wq indexes the final byte in m0.
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1                  ; move that byte into lane 0
    movd    eax, m0
    RET
%endmacro
; int ff_add_left_pred(uint8_t *dst, const uint8_t *src, int w, int left)
; Returns the last reconstructed byte (the caller's next left value).
INIT_MMX ssse3
cglobal add_left_pred, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]              ; broadcast-last-byte mask (8-byte regs)
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56                  ; seed: left value into the top byte of m0
    ADD_LEFT_LOOP 1, 1              ; mmx loads/stores need no 16B alignment

INIT_XMM sse4
cglobal add_left_pred, 3,3,7, dst, src, w, left
    mova    m5, [pb_15]             ; broadcast-last-byte mask (16-byte regs)
    mova    m6, [pb_zzzzzzzz77777777] ; spans pb_7 too: {8 x -1, 8 x 7}
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15                  ; seed: left value into the top byte of m0
    test    srcq, 15                ; dispatch on 16-byte alignment; each
    jnz     .src_unaligned          ; ADD_LEFT_LOOP expansion ends in RET
    test    dstq, 15
    jnz     .dst_unaligned
    ADD_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_LEFT_LOOP 0, 0
; void ff_add_bytes(uint8_t *dst, uint8_t *src, ptrdiff_t w)
; dst[i] += src[i] for 0 <= i < w: a 2x-unrolled vector loop for the bulk,
; then a scalar byte loop for the tail.
; NOTE(review): the mova loads/stores assume dst and src are mmsize-aligned
; for the vector part — confirm callers guarantee this.
%macro ADD_BYTES 0
cglobal add_bytes, 3,4,2, dst, src, w, size
    mov     sizeq, wq
    and     sizeq, -2*mmsize        ; byte count handled by the vector loop
    jz      .2                      ; less than 2*mmsize bytes: tail only
    add     dstq, sizeq             ; end-pointer + negative index idiom
    add     srcq, sizeq
    neg     sizeq
.1:
    mova    m0, [srcq + sizeq]
    mova    m1, [srcq + sizeq + mmsize]
    paddb   m0, [dstq + sizeq]
    paddb   m1, [dstq + sizeq + mmsize]
    mova    [dstq + sizeq], m0
    mova    [dstq + sizeq + mmsize], m1
    add     sizeq, 2*mmsize
    jl      .1
.2:
    and     wq, 2*mmsize-1          ; remaining tail length
    jz      .end
    add     dstq, wq
    add     srcq, wq
    neg     wq
.3:
    mov     sizeb, [srcq + wq]      ; scalar tail: sizeb reuses the size reg
    add     [dstq + wq], sizeb
    inc     wq
    jl      .3
.end:
    REP_RET
%endmacro
; The mmx version is only useful on x86_32: every x86_64 CPU has SSE2.
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES
%endif
INIT_XMM sse2
ADD_BYTES
; Shared loop body for add_hfyu_left_pred_int16(): left-prediction over
; 16-bit elements, masked to the stream's bit depth.
; %1 = dst alignment (a/u), %2 = src alignment (a/u)
; Expects preloaded pshufb masks (m3, m5 and, for xmm, m4), the bit-depth
; mask splatted in m7, and the initial left value in the top word of m0.
; Ends in RET, so each expansion is a complete tail of the function.
%macro ADD_HFYU_LEFT_LOOP_INT16 2
    add     wd, wd                  ; element count -> byte count
    add     srcq, wq                ; end-pointer + negative index idiom
    add     dstq, wq
    neg     wq
%%.loop:
    mov%2   m1, [srcq+wq]
    ; Word-wise prefix sum by doubling (see ADD_LEFT_LOOP, but 16-bit).
    mova    m2, m1
    pslld   m1, 16                  ; step 1: add the preceding word
    paddw   m1, m2
    mova    m2, m1
    pshufb  m1, m3                  ; step 2: add the sum of the preceding pair
    paddw   m1, m2
    pshufb  m0, m5                  ; broadcast previous vector's last word
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m4                  ; step 3 (xmm only): add the low half's sum
    paddw   m1, m2
%endif
    paddw   m0, m1                  ; add the carried-in left value
    pand    m0, m7                  ; clamp each word to the bit-depth mask
%ifidn %1, a
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0           ; unaligned store done as two 8-byte halves
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl      %%.loop
    ; Return the last output word: build a two-byte pshufb control
    ; {idx-1, idx} where idx = mmsize-1-wq is the word's high byte.
    mov     eax, mmsize-1
    sub     eax, wd
    mov     wd, eax
    shl     wd, 8
    lea     eax, [wd+eax-1]         ; eax = (idx << 8) | (idx - 1)
    movd    m1, eax
    pshufb  m0, m1                  ; move that word into lanes 0..1
    movd    eax, m0
    RET
%endmacro
  246. ; int add_hfyu_left_pred_int16(uint16_t *dst, const uint16_t *src, unsigned mask, int w, int left)
  247. INIT_MMX ssse3
  248. cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
  249. .skip_prologue:
  250. mova m5, [pb_67]
  251. mova m3, [pb_zzzz2323zzzzabab]
  252. movd m0, leftm
  253. psllq m0, 48
  254. movd m7, maskm
  255. SPLATW m7 ,m7
  256. ADD_HFYU_LEFT_LOOP_INT16 a, a
  257. INIT_XMM sse4
  258. cglobal add_hfyu_left_pred_int16, 4,4,8, dst, src, mask, w, left
  259. mova m5, [pb_ef]
  260. mova m4, [pb_zzzzzzzz67676767]
  261. mova m3, [pb_zzzz2323zzzzabab]
  262. movd m0, leftm
  263. pslldq m0, 14
  264. movd m7, maskm
  265. SPLATW m7 ,m7
  266. test srcq, 15
  267. jnz .src_unaligned
  268. test dstq, 15
  269. jnz .dst_unaligned
  270. ADD_HFYU_LEFT_LOOP_INT16 a, a
  271. .dst_unaligned:
  272. ADD_HFYU_LEFT_LOOP_INT16 u, a
  273. .src_unaligned:
  274. ADD_HFYU_LEFT_LOOP_INT16 u, u