;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
%define W6sh2  8867 ; W6 = 35468 = 8867<<2
%define W7sh2  4520 ; W7 = 18081 = 4520<<2 + 1
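; note: the W*sh2 values above are the simple-IDCT cosine weights W1..W7
; shifted right by 2 bits so that coefficient pairs fit into signed 16-bit
; words for pmaddwd; the comments show the resulting rounding error per weight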
%if ARCH_X86_64

SECTION_RODATA

w4_plus_w2: times 4 dw W4sh2, +W2sh2
w4_min_w2:  times 4 dw W4sh2, -W2sh2
w4_plus_w6: times 4 dw W4sh2, +W6sh2
w4_min_w6:  times 4 dw W4sh2, -W6sh2
w1_plus_w3: times 4 dw W1sh2, +W3sh2
w3_min_w1:  times 4 dw W3sh2, -W1sh2
w7_plus_w3: times 4 dw W7sh2, +W3sh2
w3_min_w7:  times 4 dw W3sh2, -W7sh2
w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1:  times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5:  times 4 dw W7sh2, -W5sh2
pw_88:      times 8 dw 0x2008

cextern pw_1
cextern pw_4
cextern pw_512
cextern pw_1019

section .text align=16

; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
    punpckl%1 m%2, m%4, m%5
    punpckh%1 m%3, m%4, m%5
%endmacro

; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
    psubd    %3, %1, %5 ; { a0 - b0 }[0-3]
    psubd    %4, %2, %6 ; { a0 - b0 }[4-7]
    paddd    %1, %5     ; { a0 + b0 }[0-3]
    paddd    %2, %6     ; { a0 + b0 }[4-7]
    psrad    %1, %7
    psrad    %2, %7
    psrad    %3, %7
    psrad    %4, %7
    packssdw %1, %2     ; row[0]
    packssdw %3, %4     ; row[7]
%endmacro

; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
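; (the row pass below uses %2 = 15, the column pass %2 = 18; see idct_put_fn)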
%macro IDCT_1D 2
    ; a0 = (W4 * row[0]) + (1 << (15 - 1));
    ; a1 = a0;
    ; a2 = a0;
    ; a3 = a0;
    ; a0 += W2 * row[2];
    ; a1 += W6 * row[2];
    ; a2 -= W6 * row[2];
    ; a3 -= W2 * row[2];
%ifidn %1, col
    paddw m10, [pw_88]
%endif
%ifidn %1, row
    paddw m10, [pw_1]
%endif
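    ; pw_1 added to row[0] becomes W4sh2 (16384 = 1<<(15-1)) after the
    ; multiply, i.e. the rounding term for the >>15 row shift; pw_88
    ; (0x2008 = 8192 + 8) likewise yields the 1<<(18-1) rounding for the
    ; >>18 column shift plus a +512 offset on the output samples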
    SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
    pmaddwd m2, m0, [w4_plus_w6]
    pmaddwd m3, m1, [w4_plus_w6]
    pmaddwd m4, m0, [w4_min_w6]
    pmaddwd m5, m1, [w4_min_w6]
    pmaddwd m6, m0, [w4_min_w2]
    pmaddwd m7, m1, [w4_min_w2]
    pmaddwd m0, [w4_plus_w2]
    pmaddwd m1, [w4_plus_w2]

    ; a0: -1*row[0]-1*row[2]
    ; a1: -1*row[0]
    ; a2: -1*row[0]
    ; a3: -1*row[0]+1*row[2]

    ; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
    ; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
    ; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
    ; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
    SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
    pmaddwd m10, m8, [w4_plus_w6]
    pmaddwd m11, m9, [w4_plus_w6]
    paddd   m0, m10 ; a0[0-3]
    paddd   m1, m11 ; a0[4-7]
    pmaddwd m10, m8, [w4_min_w6]
    pmaddwd m11, m9, [w4_min_w6]
    paddd   m6, m10 ; a3[0-3]
    paddd   m7, m11 ; a3[4-7]
    pmaddwd m10, m8, [w4_min_w2]
    pmaddwd m11, m9, [w4_min_w2]
    pmaddwd m8, [w4_plus_w2]
    pmaddwd m9, [w4_plus_w2]
    psubd   m4, m10 ; a2[0-3] intermediate
    psubd   m5, m11 ; a2[4-7] intermediate
    psubd   m2, m8  ; a1[0-3] intermediate
    psubd   m3, m9  ; a1[4-7] intermediate

    ; load/store
    mova [r2+  0], m0
    mova [r2+ 32], m2
    mova [r2+ 64], m4
    mova [r2+ 96], m6
    mova m10, [r2+ 16] ; { row[1] }[0-7]
    mova m8,  [r2+ 48] ; { row[3] }[0-7]
    mova m13, [r2+ 80] ; { row[5] }[0-7]
    mova m14, [r2+112] ; { row[7] }[0-7]
    mova [r2+ 16], m1
    mova [r2+ 48], m3
    mova [r2+ 80], m5
    mova [r2+112], m7
%ifidn %1, row
    pmullw m10, [r3+ 16]
    pmullw m8,  [r3+ 48]
    pmullw m13, [r3+ 80]
    pmullw m14, [r3+112]
%endif
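    ; (row pass only) the pmullw block above dequantizes rows 1/3/5/7 with
    ; qmat (r3); rows 0/2/4/6 were already dequantized in idct_put_fn before
    ; the first IDCT_1D call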
    ; b0 = MUL(W1, row[1]);
    ; MAC(b0, W3, row[3]);
    ; b1 = MUL(W3, row[1]);
    ; MAC(b1, -W7, row[3]);
    ; b2 = MUL(W5, row[1]);
    ; MAC(b2, -W1, row[3]);
    ; b3 = MUL(W7, row[1]);
    ; MAC(b3, -W5, row[3]);
    SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
    pmaddwd m2, m0, [w3_min_w7]
    pmaddwd m3, m1, [w3_min_w7]
    pmaddwd m4, m0, [w5_min_w1]
    pmaddwd m5, m1, [w5_min_w1]
    pmaddwd m6, m0, [w7_min_w5]
    pmaddwd m7, m1, [w7_min_w5]
    pmaddwd m0, [w1_plus_w3]
    pmaddwd m1, [w1_plus_w3]

    ; b0: +1*row[1]+2*row[3]
    ; b1: +2*row[1]-1*row[3]
    ; b2: -1*row[1]-1*row[3]
    ; b3: +1*row[1]+1*row[3]

    ; MAC(b0, W5, row[5]);
    ; MAC(b0, W7, row[7]);
    ; MAC(b1, -W1, row[5]);
    ; MAC(b1, -W5, row[7]);
    ; MAC(b2, W7, row[5]);
    ; MAC(b2, W3, row[7]);
    ; MAC(b3, W3, row[5]);
    ; MAC(b3, -W1, row[7]);
    SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]

    ; b0: -1*row[5]+1*row[7]
    ; b1: -1*row[5]+1*row[7]
    ; b2: +1*row[5]+2*row[7]
    ; b3: +2*row[5]-1*row[7]
    pmaddwd m10, m8, [w1_plus_w5]
    pmaddwd m11, m9, [w1_plus_w5]
    pmaddwd m12, m8, [w5_plus_w7]
    pmaddwd m13, m9, [w5_plus_w7]
    psubd   m2, m10 ; b1[0-3]
    psubd   m3, m11 ; b1[4-7]
    paddd   m0, m12 ; b0[0-3]
    paddd   m1, m13 ; b0[4-7]
    pmaddwd m12, m8, [w7_plus_w3]
    pmaddwd m13, m9, [w7_plus_w3]
    pmaddwd m8, [w3_min_w1]
    pmaddwd m9, [w3_min_w1]
    paddd   m4, m12 ; b2[0-3]
    paddd   m5, m13 ; b2[4-7]
    paddd   m6, m8  ; b3[0-3]
    paddd   m7, m9  ; b3[4-7]

    ; row[0] = (a0 + b0) >> 15;
    ; row[7] = (a0 - b0) >> 15;
    ; row[1] = (a1 + b1) >> 15;
    ; row[6] = (a1 - b1) >> 15;
    ; row[2] = (a2 + b2) >> 15;
    ; row[5] = (a2 - b2) >> 15;
    ; row[3] = (a3 + b3) >> 15;
    ; row[4] = (a3 - b3) >> 15;
    mova m8, [r2+  0] ; a0[0-3]
    mova m9, [r2+ 16] ; a0[4-7]
    SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
    mova m0, [r2+ 32] ; a1[0-3]
    mova m1, [r2+ 48] ; a1[4-7]
    SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
    mova m1, [r2+ 64] ; a2[0-3]
    mova m2, [r2+ 80] ; a2[4-7]
    SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
    mova m2, [r2+ 96] ; a3[0-3]
    mova m3, [r2+112] ; a3[4-7]
    SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
%endmacro
; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
;                               int16_t *block, const int16_t *qmat);
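; %1 = number of XMM registers used (passed through to cglobal)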
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
    movsxd r1, r1d
    pxor   m15, m15 ; zero

    ; for (i = 0; i < 8; i++)
    ;     idctRowCondDC(block + i*8);
    mova m10, [r2+ 0] ; { row[0] }[0-7]
    mova m8,  [r2+32] ; { row[2] }[0-7]
    mova m13, [r2+64] ; { row[4] }[0-7]
    mova m12, [r2+96] ; { row[6] }[0-7]
    pmullw m10, [r3+ 0]
    pmullw m8,  [r3+32]
    pmullw m13, [r3+64]
    pmullw m12, [r3+96]
    IDCT_1D row, 15

    ; transpose for second part of IDCT
    TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
    mova [r2+ 16], m0
    mova [r2+ 48], m2
    mova [r2+ 80], m11
    mova [r2+112], m10
    SWAP 8, 10
    SWAP 1, 8
    SWAP 4, 13
    SWAP 9, 12

    ; for (i = 0; i < 8; i++)
    ;     idctSparseColAdd(dest + i, line_size, block + i);
    IDCT_1D col, 18

    ; clip/store
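    ; clamp the output samples to [4, 1019], the 10-bit ProRes clip range
    ; (this clip range is what distinguishes this routine from "simple" IDCT)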
    mova m3, [pw_4]
    mova m5, [pw_1019]
    pmaxsw m8,  m3
    pmaxsw m0,  m3
    pmaxsw m1,  m3
    pmaxsw m2,  m3
    pmaxsw m4,  m3
    pmaxsw m11, m3
    pmaxsw m9,  m3
    pmaxsw m10, m3
    pminsw m8,  m5
    pminsw m0,  m5
    pminsw m1,  m5
    pminsw m2,  m5
    pminsw m4,  m5
    pminsw m11, m5
    pminsw m9,  m5
    pminsw m10, m5
    lea r2, [r1*3]
    mova [r0     ], m8
    mova [r0+r1  ], m0
    mova [r0+r1*2], m1
    mova [r0+r2  ], m2
    lea r0, [r0+r1*4]
    mova [r0     ], m4
    mova [r0+r1  ], m11
    mova [r0+r1*2], m9
    mova [r0+r2  ], m10
    RET
%endmacro
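; sign-extend 8 packed words into two registers of 4 packed dwords
; (%1 = source and low-half dst, %2 = high-half dst, %3 = scratch for SSE2)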
%macro SIGNEXTEND 2-3
%if cpuflag(sse4) ; dstlow, dsthigh
    movhlps  %2, %1
    pmovsxwd %1, %1
    pmovsxwd %2, %2
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
    pxor      %3, %3
    pcmpgtw   %3, %1
    mova      %2, %1
    punpcklwd %1, %3
    punpckhwd %2, %3
%endif
%endmacro

INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
%endif

%endif