;*******************************************************************************
;* SIMD-optimized IDCT functions for HEVC decoding
;* Copyright (c) 2014 Pierre-Edouard LEPERE
;* Copyright (c) 2014 James Almer
;* Copyright (c) 2016 Alexandra Hájková
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pd_64:   times 4 dd 64
pd_2048: times 4 dd 2048
pd_512:  times 4 dd 512
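
; all transform coeffs below are stored as interleaved word pairs (a, b),
; so a single pmaddwd against a register of interleaved source words
; computes a*src_even + b*src_odd in every dword lane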

; 4x4 transform coeffs
cextern pw_64
pw_64_m64:  times 4 dw 64, -64
pw_83_36:   times 4 dw 83, 36
pw_36_m83:  times 4 dw 36, -83

; 8x8 transform coeffs
pw_89_75:   times 4 dw 89, 75
pw_50_18:   times 4 dw 50, 18
pw_75_m18:  times 4 dw 75, -18
pw_m89_m50: times 4 dw -89, -50
pw_50_m89:  times 4 dw 50, -89
pw_18_75:   times 4 dw 18, 75
pw_18_m50:  times 4 dw 18, -50
pw_75_m89:  times 4 dw 75, -89

; 16x16 transform coeffs
trans_coeffs16: times 4 dw 90, 87
                times 4 dw 80, 70
                times 4 dw 57, 43
                times 4 dw 25, 9
                times 4 dw 87, 57
                times 4 dw 9, -43
                times 4 dw -80, -90
                times 4 dw -70, -25
                times 4 dw 80, 9
                times 4 dw -70, -87
                times 4 dw -25, 57
                times 4 dw 90, 43
                times 4 dw 70, -43
                times 4 dw -87, 9
                times 4 dw 90, 25
                times 4 dw -80, -57
                times 4 dw 57, -80
                times 4 dw -25, 90
                times 4 dw -9, -87
                times 4 dw 43, 70
                times 4 dw 43, -90
                times 4 dw 57, 25
                times 4 dw -87, 70
                times 4 dw 9, -80
                times 4 dw 25, -70
                times 4 dw 90, -80
                times 4 dw 43, 9
                times 4 dw -57, 87
                times 4 dw 9, -25
                times 4 dw 43, -57
                times 4 dw 70, -80
                times 4 dw 87, -90

; 32x32 transform coeffs
trans_coeff32: times 8 dw 90
               times 4 dw 88, 85
               times 4 dw 82, 78
               times 4 dw 73, 67
               times 4 dw 61, 54
               times 4 dw 46, 38
               times 4 dw 31, 22
               times 4 dw 13, 4
               times 4 dw 90, 82
               times 4 dw 67, 46
               times 4 dw 22, -4
               times 4 dw -31, -54
               times 4 dw -73, -85
               times 4 dw -90, -88
               times 4 dw -78, -61
               times 4 dw -38, -13
               times 4 dw 88, 67
               times 4 dw 31, -13
               times 4 dw -54, -82
               times 4 dw -90, -78
               times 4 dw -46, -4
               times 4 dw 38, 73
               times 4 dw 90, 85
               times 4 dw 61, 22
               times 4 dw 85, 46
               times 4 dw -13, -67
               times 4 dw -90, -73
               times 4 dw -22, 38
               times 4 dw 82, 88
               times 4 dw 54, -4
               times 4 dw -61, -90
               times 4 dw -78, -31
               times 4 dw 82, 22
               times 4 dw -54, -90
               times 4 dw -61, 13
               times 4 dw 78, 85
               times 4 dw 31, -46
               times 4 dw -90, -67
               times 4 dw 4, 73
               times 4 dw 88, 38
               times 4 dw 78, -4
               times 4 dw -82, -73
               times 4 dw 13, 85
               times 4 dw 67, -22
               times 4 dw -88, -61
               times 4 dw 31, 90
               times 4 dw 54, -38
               times 4 dw -90, -46
               times 4 dw 73, -31
               times 4 dw -90, -22
               times 4 dw 78, 67
               times 4 dw -38, -90
               times 4 dw -13, 82
               times 4 dw 61, -46
               times 4 dw -88, -4
               times 4 dw 85, 54
               times 4 dw 67, -54
               times 4 dw -78, 38
               times 4 dw 85, -22
               times 4 dw -90, 4
               times 4 dw 90, 13
               times 4 dw -88, -31
               times 4 dw 82, 46
               times 4 dw -73, -61
               times 4 dw 61, -73
               times 4 dw -46, 82
               times 4 dw 31, -88
               times 4 dw -13, 90
               times 4 dw -4, -90
               times 4 dw 22, 85
               times 4 dw -38, -78
               times 4 dw 54, 67
               times 4 dw 54, -85
               times 4 dw -4, 88
               times 4 dw -46, -61
               times 4 dw 82, 13
               times 4 dw -90, 38
               times 4 dw 67, -78
               times 4 dw -22, 90
               times 4 dw -31, -73
               times 4 dw 46, -90
               times 4 dw 38, 54
               times 4 dw -90, 31
               times 4 dw 61, -88
               times 4 dw 22, 67
               times 4 dw -85, 13
               times 4 dw 73, -82
               times 4 dw 4, 78
               times 4 dw 38, -88
               times 4 dw 73, -4
               times 4 dw -67, 90
               times 4 dw -46, -31
               times 4 dw 85, -78
               times 4 dw 13, 61
               times 4 dw -90, 54
               times 4 dw 22, -82
               times 4 dw 31, -78
               times 4 dw 90, -61
               times 4 dw 4, 54
               times 4 dw -88, 82
               times 4 dw -38, -22
               times 4 dw 73, -90
               times 4 dw 67, -13
               times 4 dw -46, 85
               times 4 dw 22, -61
               times 4 dw 85, -90
               times 4 dw 73, -38
               times 4 dw -4, 46
               times 4 dw -78, 90
               times 4 dw -82, 54
               times 4 dw -13, -31
               times 4 dw 67, -88
               times 4 dw 13, -38
               times 4 dw 61, -78
               times 4 dw 88, -90
               times 4 dw 85, -73
               times 4 dw 54, -31
               times 4 dw 4, 22
               times 4 dw -46, 67
               times 4 dw -82, 90
               times 4 dw 4, -13
               times 4 dw 22, -31
               times 4 dw 38, -46
               times 4 dw 54, -61
               times 4 dw 67, -73
               times 4 dw 78, -82
               times 4 dw 85, -88
               times 4 dw 90, -90

SECTION .text

; void ff_hevc_idct_HxW_dc_{8,10}_<opt>(int16_t *coeffs)
; %1 = HxW
; %2 = number of loops
; %3 = bitdepth
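; for a dc-only block the two scaling passes,
; (src[0]*64 + 64) >> 7 and then (dc*64 + (1 << (19 - %3))) >> (20 - %3),
; fold into the single add + sar below:
; dc = (src[0] + (1 << (14 - %3)) + 1) >> (15 - %3)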
%macro IDCT_DC 3
cglobal hevc_idct_%1x%1_dc_%3, 1, 2, 1, coeff, tmp
    movsx  tmpd, word [coeffq]
    add    tmpd, (1 << (14 - %3)) + 1
    sar    tmpd, (15 - %3)
    movd   xm0, tmpd
    SPLATW m0, xm0
    DEFINE_ARGS coeff, cnt
    mov    cntd, %2
.loop:
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
    add  coeffq, mmsize*8
    mova [coeffq+mmsize*-4], m0
    mova [coeffq+mmsize*-3], m0
    mova [coeffq+mmsize*-2], m0
    mova [coeffq+mmsize*-1], m0
    dec  cntd
    jg .loop
    RET
%endmacro

; %1 = HxW
; %2 = bitdepth
%macro IDCT_DC_NL 2 ; No loop
cglobal hevc_idct_%1x%1_dc_%2, 1, 2, 1, coeff, tmp
    movsx  tmpd, word [coeffq]
    add    tmpd, (1 << (14 - %2)) + 1
    sar    tmpd, (15 - %2)
    movd   m0, tmpd
    SPLATW m0, xm0
    mova [coeffq+mmsize*0], m0
    mova [coeffq+mmsize*1], m0
    mova [coeffq+mmsize*2], m0
    mova [coeffq+mmsize*3], m0
%if mmsize == 16
    mova [coeffq+mmsize*4], m0
    mova [coeffq+mmsize*5], m0
    mova [coeffq+mmsize*6], m0
    mova [coeffq+mmsize*7], m0
%endif
    RET
%endmacro

; IDCT 4x4, expects input in m0, m1
; %1 - shift
; %2 - 1/0 - scale and transpose or not
; %3 - 1/0 - add the rounding constant or not
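; the partial butterfly computed below:
;   e0 = 64*src0 + 64*src2         o0 = 83*src1 + 36*src3
;   e1 = 64*src0 - 64*src2         o1 = 36*src1 - 83*src3
;   dst = { e0 + o0, e1 + o1, e1 - o1, e0 - o0 }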
%macro TR_4x4 3
    ; interleaves src0 with src2 into m0
    ; and src1 with src3 into m1
    ; src0: 00 01 02 03     m0: 00 20 01 21 02 22 03 23
    ; src1: 10 11 12 13     -->
    ; src2: 20 21 22 23     m1: 10 30 11 31 12 32 13 33
    ; src3: 30 31 32 33
    SBUTTERFLY wd, 0, 1, 2

    pmaddwd m2, m0, [pw_64]    ; e0
    pmaddwd m3, m1, [pw_83_36] ; o0
    pmaddwd m0, [pw_64_m64]    ; e1
    pmaddwd m1, [pw_36_m83]    ; o1
%if %3 == 1
    %assign %%add 1 << (%1 - 1)
    mova  m4, [pd_ %+ %%add]
    paddd m2, m4
    paddd m0, m4
%endif
    SUMSUB_BADC d, 3, 2, 1, 0, 4
%if %2 == 1
    psrad m3, %1 ; e0 + o0
    psrad m1, %1 ; e1 + o1
    psrad m2, %1 ; e0 - o0
    psrad m0, %1 ; e1 - o1
    ; clip16
    packssdw m3, m1
    packssdw m0, m2
    ; transpose
    SBUTTERFLY wd, 3, 0, 1
    SBUTTERFLY wd, 3, 0, 1
    SWAP 3, 1, 0
%else
    SWAP 3, 2, 0
%endif
%endmacro

%macro DEFINE_BIAS 1
    %assign shift (20 - %1)
    %assign c_add (1 << (shift - 1))
    %define arr_add pd_ %+ c_add
%endmacro

; %1 - bit depth
; %2 - register the add constant is loaded into
; shift = 20 - bit_depth
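; e.g. bit depth 8:  shift = 12, add = 2048 (pd_2048)
;      bit depth 10: shift = 10, add = 512  (pd_512)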
%macro LOAD_BIAS 2
    DEFINE_BIAS %1
    mova %2, [arr_add]
%endmacro

; %1, %2 - registers to load packed 16 bit values to
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset
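; result: %1 = row %3 (low qword) | row %5 (high qword)
;         %2 = row %4 (low qword) | row %6 (high qword)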
%macro LOAD_BLOCK 7
    movq   %1, [r0 + %3 + %7]
    movhps %1, [r0 + %5 + %7]
    movq   %2, [r0 + %4 + %7]
    movhps %2, [r0 + %6 + %7]
%endmacro

; void ff_hevc_idct_4x4_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
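; two 1-D passes: shift 7 for the first, 20 - bitdepth for the second
; (12 at 8 bit, 10 at 10 bit); each TR_4x4 scales, clips and transposes
; its result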
%macro IDCT_4x4 1
cglobal hevc_idct_4x4_%1, 1, 1, 5, coeffs
    mova m0, [coeffsq]
    mova m1, [coeffsq + 16]

    TR_4x4 7, 1, 1
    TR_4x4 20 - %1, 1, 1

    mova [coeffsq],      m0
    mova [coeffsq + 16], m1
    RET
%endmacro

; scale, pack (clip16) and store the residuals     0 e8[0] + o8[0] --> + %1
; 4 at one time (4 columns)                        1 e8[1] + o8[1]
; from %5: e8/16 + o8/16, with %1 offset           ...
; and  %3: e8/16 - o8/16, with %2 offset           6 e8[1] - o8[1]
; %4 - shift                                       7 e8[0] - o8[0] --> + %2
%macro STORE_8 7
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq   [coeffsq + %1], %5
    movhps [coeffsq + %2], %5
%endmacro

; %1 - horizontal offset
; %2 - shift
; %3, %4 - transform coeffs
; %5 - vertical offset for e8 + o8
; %6 - vertical offset for e8 - o8
; %7 - register with e8 inside
; %8 - block size
; %9 - register to store e8 + o8
; %10 - register to store e8 - o8
%macro E8_O8 10
    pmaddwd m6, m4, %3
    pmaddwd m7, m5, %4
    paddd   m6, m7
    paddd   m7, m6, %7 ; o8 + e8
    psubd   %7, m6     ; e8 - o8
%if %8 == 8
    STORE_8 %5 + %1, %6 + %1, %7, %2, m7, 0, 0
%else
    SWAP m7, %9
    SWAP %7, %10
%endif
%endmacro

; 8x4 residuals are processed and stored
; %1 - horizontal offset
; %2 - shift
; %3 - offset of the even row
; %4 - step: 1 for 8x8, 2 for 16x16, 4 for 32x32
; %5 - offset of the odd row
; %6 - block size
; %7 - 1/0 - add the constant in TR_4x4 or not
; (the constant is added for the 8x8 transform but not for 16x16 and 32x32)
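; the odd part computed by the four E8_O8 calls below:
;   o8[0] = 89*s1 + 75*s3 + 50*s5 + 18*s7
;   o8[1] = 75*s1 - 18*s3 - 89*s5 - 50*s7
;   o8[2] = 50*s1 - 89*s3 + 18*s5 + 75*s7
;   o8[3] = 18*s1 - 50*s3 + 75*s5 - 89*s7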
%macro TR_8x4 7
    ; load 4 columns of even rows
    LOAD_BLOCK m0, m1, 0, 2 * %4 * %3, %4 * %3, 3 * %4 * %3, %1

    TR_4x4 %2, 0, %7 ; e8: m0, m1, m2, m3, for 4 columns only

    ; load 4 columns of odd rows
    LOAD_BLOCK m4, m5, %4 * %5, 3 * %4 * %5, 5 * %4 * %5, 7 * %4 * %5, %1

    ; 00 01 02 03
    ; 10 11 12 13      m4: 10 30 11 31 12 32 13 33
    ; ...          -->
    ;                  m5: 50 70 51 71 52 72 53 73
    ; 70 71 72 73
    SBUTTERFLY wd, 4, 5, 6

    E8_O8 %1, %2, [pw_89_75],  [pw_50_18],   0,      %5 * 7, m0, %6, m8,  m15
    E8_O8 %1, %2, [pw_75_m18], [pw_m89_m50], %5,     %5 * 6, m1, %6, m9,  m14
    E8_O8 %1, %2, [pw_50_m89], [pw_18_75],   %5 * 2, %5 * 5, m2, %6, m10, m13
    E8_O8 %1, %2, [pw_18_m50], [pw_75_m89],  %5 * 3, %5 * 4, m3, %6, m11, m12
%endmacro
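
; the store counterpart of LOAD_BLOCK
; %1, %2 - registers with packed 16 bit values to store
; %3, %4, %5, %6 - vertical offsets
; %7 - horizontal offset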
%macro STORE_PACKED 7
    movq   [r0 + %3 + %7], %1
    movhps [r0 + %4 + %7], %1
    movq   [r0 + %5 + %7], %2
    movhps [r0 + %6 + %7], %2
%endmacro

; transpose a 4x4 block packed
; in %1 and %2 registers
; %3 - temporary register
%macro TRANSPOSE_4x4 3
    SBUTTERFLY wd, %1, %2, %3
    SBUTTERFLY dq, %1, %2, %3
%endmacro

; %1 - horizontal offset of the block i
; %2 - vertical offset of the block i
; %3 - width in bytes
; %4 - vertical offset for the block j
; %5 - horizontal offset for the block j
%macro SWAP_BLOCKS 5
    ; M_j
    LOAD_BLOCK m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
    TRANSPOSE_4x4 4, 5, 6

    ; M_i
    LOAD_BLOCK m6, m7, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1

    ; transpose and store M_i
    SWAP m6, m4
    SWAP m7, m5
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %4, %4 + %3, %4 + 2 * %3, %4 + 3 * %3, %5
%endmacro

; %1 - horizontal offset
; %2 - vertical offset of the block
; %3 - width in bytes
%macro TRANSPOSE_BLOCK 3
    LOAD_BLOCK m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
    TRANSPOSE_4x4 4, 5, 6
    STORE_PACKED m4, m5, %2, %2 + %3, %2 + 2 * %3, %2 + 3 * %3, %1
%endmacro

%macro TRANSPOSE_8x8 0
cglobal hevc_idct_transpose_8x8, 0, 0, 0
    ; M1 M2 ^T = M1^T M3^T
    ; M3 M4      M2^T M4^T

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 16
    ; M2 and M3
    SWAP_BLOCKS 0, 64, 16, 0, 8
    ; M4
    TRANSPOSE_BLOCK 8, 64, 16
    ret
%endmacro

; void ff_hevc_idct_8x8_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
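; each pass handles the block as two 8x4 halves (horizontal byte
; offsets 0 and 8) and transposes in between; the final transpose
; is reached via TAIL_CALL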
%macro IDCT_8x8 1
cglobal hevc_idct_8x8_%1, 1, 1, 8, coeffs
    TR_8x4 0, 7, 32, 1, 16, 8, 1
    TR_8x4 8, 7, 32, 1, 16, 8, 1

    call hevc_idct_transpose_8x8_ %+ cpuname

    DEFINE_BIAS %1
    TR_8x4 0, shift, 32, 1, 16, 8, 1
    TR_8x4 8, shift, 32, 1, 16, 8, 1

    TAIL_CALL hevc_idct_transpose_8x8_ %+ cpuname, 1
%endmacro

; store the intermediate e32 coeffs on the stack
; as a 16x4 matrix
; from %5: e16 + o16, with %6 offset
; and  %3: e16 - o16, with %7 offset
; %4 - shift, unused here
%macro STORE_16 7
    mova [rsp + %6], %5
    mova [rsp + %7], %3
%endmacro

; %1, %2 - transform constants
; %3, %4 - regs with interleaved coeffs
; %5 - 1/0 - SWAP or add
; %6, %7 - registers for intermediate sums
; %8 - accumulator register
%macro ADD_ROWS 8
    pmaddwd %6, %3, %1
    pmaddwd %7, %4, %2
    paddd   %6, %7
%if %5 == 1
    SWAP %6, %8
%else
    paddd %8, %6
%endif
%endmacro

; %1 - transform coeffs
; %2, %3 - offsets for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - add
; %6 - block size
; %7 - register with e16
; %8, %9 - stack offsets for storing e+o/e-o
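; the two ADD_ROWS calls below accumulate, per output row j,
;   o16[j] = c[j][0]*s1 + c[j][1]*s3 + ... + c[j][7]*s15
; from four consecutive 16-byte rows of coefficient pairs at %1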
%macro E16_O16 9
    ADD_ROWS [%1],          [%1 + 16],     m0, m1, 1, m5, m6, m7
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m5, m6, m7

%if %6 == 8
    paddd %7, %5
%endif
    paddd m4, m7, %7 ; o16 + e16
    psubd %7, m7     ; e16 - o16
    STORE_%6 %2, %3, %7, %4, m4, %8, %9
%endmacro
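
; parameters, as used below:
; %1 - horizontal offset
; %2 - shift
; %3 - add constant (memory operand)
; %4 - offset of the even rows (for TR_8x4)
; %5 - step (for TR_8x4)
; %6 - width of a row in bytes
; %7 - block size for the E16_O16 stores (8: to coeffsq, 16: to the stack)
; %8 - block size for TR_8x4
; %9 - row offset multiplier for the odd-row loads (1 for 16x16, 2 for 32x32)
; %10 - not referenced in the body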
%macro TR_16x4 10
    ; produce the 8x4 matrix of e16 coeffs
    ; for the first 4 columns and keep it in m8-m15 (128 bytes)
    TR_8x4 %1, 7, %4, %5, %6, %8, 0

    ; load 8 odd rows
    LOAD_BLOCK m0, m1, %9 * %6,     %9 * 3 * %6,  %9 * 5 * %6,  %9 * 7 * %6,  %1
    LOAD_BLOCK m2, m3, %9 * 9 * %6, %9 * 11 * %6, %9 * 13 * %6, %9 * 15 * %6, %1

    SBUTTERFLY wd, 0, 1, 4
    SBUTTERFLY wd, 2, 3, 4

    E16_O16 trans_coeffs16,          0 + %1,      15 * %6 + %1, %2, %3, %7, m8,  0,      15 * 16
    mova m8, %3
    E16_O16 trans_coeffs16 + 64,     %6 + %1,     14 * %6 + %1, %2, m8, %7, m9,  16,     14 * 16
    E16_O16 trans_coeffs16 + 2 * 64, 2 * %6 + %1, 13 * %6 + %1, %2, m8, %7, m10, 2 * 16, 13 * 16
    E16_O16 trans_coeffs16 + 3 * 64, 3 * %6 + %1, 12 * %6 + %1, %2, m8, %7, m11, 3 * 16, 12 * 16
    E16_O16 trans_coeffs16 + 4 * 64, 4 * %6 + %1, 11 * %6 + %1, %2, m8, %7, m12, 4 * 16, 11 * 16
    E16_O16 trans_coeffs16 + 5 * 64, 5 * %6 + %1, 10 * %6 + %1, %2, m8, %7, m13, 5 * 16, 10 * 16
    E16_O16 trans_coeffs16 + 6 * 64, 6 * %6 + %1, 9 * %6 + %1,  %2, m8, %7, m14, 6 * 16, 9 * 16
    E16_O16 trans_coeffs16 + 7 * 64, 7 * %6 + %1, 8 * %6 + %1,  %2, m8, %7, m15, 7 * 16, 8 * 16
%endmacro

%macro TRANSPOSE_16x16 0
cglobal hevc_idct_transpose_16x16, 0, 0, 0
    ; M1  M2  M3  M4  ^T     m1 m5 m9  m13     M_i^T = m_i
    ; M5  M6  M7  M8    -->  m2 m6 m10 m14
    ; M9  M10 M11 M12        m3 m7 m11 m15
    ; M13 M14 M15 M16        m4 m8 m12 m16

    ; M1 4x4 block
    TRANSPOSE_BLOCK 0, 0, 32
    ; M5, M2
    SWAP_BLOCKS 0, 128, 32, 0, 8
    ; M9, M3
    SWAP_BLOCKS 0, 256, 32, 0, 16
    ; M13, M4
    SWAP_BLOCKS 0, 384, 32, 0, 24

    ; M6
    TRANSPOSE_BLOCK 8, 128, 32
    ; M10, M7
    SWAP_BLOCKS 8, 256, 32, 128, 16
    ; M14, M8
    SWAP_BLOCKS 8, 384, 32, 128, 24

    ; M11
    TRANSPOSE_BLOCK 16, 256, 32
    ; M15, M12
    SWAP_BLOCKS 16, 384, 32, 256, 24

    ; M16
    TRANSPOSE_BLOCK 24, 384, 32
    ret
%endmacro

; void ff_hevc_idct_16x16_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
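; each pass loops r1d from 3 down to 0 over the four 4-column strips
; (horizontal byte offsets 24, 16, 8, 0), with a full transpose between passes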
%macro IDCT_16x16 1
cglobal hevc_idct_16x16_%1, 1, 2, 16, coeffs
    mov r1d, 3
.loop16:
    TR_16x4 8 * r1, 7, [pd_64], 64, 2, 32, 8, 16, 1, 0
    dec r1d
    jge .loop16

    call hevc_idct_transpose_16x16_ %+ cpuname

    DEFINE_BIAS %1
    mov r1d, 3
.loop16_2:
    TR_16x4 8 * r1, shift, [arr_add], 64, 2, 32, 8, 16, 1, 1
    dec r1d
    jge .loop16_2

    TAIL_CALL hevc_idct_transpose_16x16_ %+ cpuname, 1
%endmacro

; scale, pack (clip16) and store the residuals     0 e32[0] + o32[0] --> %1
; 4 at one time (4 columns)                        1 e32[1] + o32[1]
; %1 - address to store e32 + o32
; %2 - address to store e32 - o32
; %5 - reg with e32 + o32                          ...
; %3 - reg with e32 - o32                          30 e32[1] - o32[1]
; %4 - shift                                       31 e32[0] - o32[0] --> %2
%macro STORE_32 5
    psrad    %5, %4
    psrad    %3, %4
    packssdw %5, %3
    movq   [%1], %5
    movhps [%2], %5
%endmacro

; %1 - transform coeffs
; %2, %3 - addresses for storing e+o/e-o back to coeffsq
; %4 - shift
; %5 - stack offset of e32
%macro E32_O32 5
    ADD_ROWS [%1],          [%1 + 16],     m0, m1, 1, m8, m9, m10
    ADD_ROWS [%1 + 2 * 16], [%1 + 3 * 16], m2, m3, 0, m8, m9, m10
    ADD_ROWS [%1 + 4 * 16], [%1 + 5 * 16], m4, m5, 0, m8, m9, m10
    ADD_ROWS [%1 + 6 * 16], [%1 + 7 * 16], m6, m7, 0, m8, m9, m10

    paddd m11, m14, [rsp + %5]
    paddd m12, m10, m11 ; o32 + e32
    psubd m11, m10      ; e32 - o32
    STORE_32 %2, %3, m11, %4, m12
%endmacro

; %1 - horizontal offset
; %2 - bitdepth
; %3 - 1/0 - first pass (shift 7, add 64) or second pass (bias from %2)
%macro TR_32x4 3
    TR_16x4 %1, 7, [pd_64], 128, 4, 64, 16, 16, 2, 0

    LOAD_BLOCK m0, m1, 64,      3 * 64,  5 * 64,  7 * 64,  %1
    LOAD_BLOCK m2, m3, 9 * 64,  11 * 64, 13 * 64, 15 * 64, %1
    LOAD_BLOCK m4, m5, 17 * 64, 19 * 64, 21 * 64, 23 * 64, %1
    LOAD_BLOCK m6, m7, 25 * 64, 27 * 64, 29 * 64, 31 * 64, %1

    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8

%if %3 == 1
    %assign shift 7
    mova m14, [pd_64]
%else
    LOAD_BIAS %2, m14
%endif

    lea r2, [trans_coeff32 + 15 * 128]
    lea r3, [coeffsq + %1]
    lea r4, [r3 + 16 * 64]
    mov r5d, 15 * 16
%%loop:
    E32_O32 r2, r3 + r5 * 4, r4, shift, r5
    sub r2, 128
    add r4, 64
    sub r5d, 16
    jge %%loop
%endmacro

%macro TRANSPOSE_32x32 0
cglobal hevc_idct_transpose_32x32, 0, 0, 0
    ; M0  M1 ... M7
    ; M8         M15
    ;
    ; ...
    ;
    ; M56        M63
    TRANSPOSE_BLOCK 0, 0, 64 ; M0
    mov r1d, 7
    mov r2d, 7 * 256
.loop_transpose:
    SWAP_BLOCKS 0, r2, 64, 0, r1 * 8
    sub r2d, 256
    dec r1d
    jg .loop_transpose

    TRANSPOSE_BLOCK 8, 256, 64 ; M9
    mov r1d, 6
    mov r2d, 512
    mov r3d, 16
.loop_transpose2:
    SWAP_BLOCKS 8, r2, 64, 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose2

    TRANSPOSE_BLOCK 2 * 8, 2 * 256, 64 ; M18
    mov r1d, 5
    mov r2d, 768
    mov r3d, 24
.loop_transpose3:
    SWAP_BLOCKS 2 * 8, r2, 64, 2 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose3

    TRANSPOSE_BLOCK 3 * 8, 3 * 256, 64 ; M27
    mov r1d, 4
    mov r2d, 1024
    mov r3d, 32
.loop_transpose4:
    SWAP_BLOCKS 3 * 8, r2, 64, 3 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose4

    TRANSPOSE_BLOCK 4 * 8, 4 * 256, 64 ; M36
    mov r1d, 3
    mov r2d, 1280
    mov r3d, 40
.loop_transpose5:
    SWAP_BLOCKS 4 * 8, r2, 64, 4 * 256, r3
    add r3d, 8
    add r2d, 256
    dec r1d
    jg .loop_transpose5

    TRANSPOSE_BLOCK 5 * 8, 5 * 256, 64 ; M45
    SWAP_BLOCKS 5 * 8, 6 * 256, 64, 5 * 256, 6 * 8
    SWAP_BLOCKS 5 * 8, 7 * 256, 64, 5 * 256, 7 * 8

    TRANSPOSE_BLOCK 6 * 8, 6 * 256, 64 ; M54
    SWAP_BLOCKS 6 * 8, 7 * 256, 64, 6 * 256, 7 * 8

    TRANSPOSE_BLOCK 7 * 8, 7 * 256, 64 ; M63
    ret
%endmacro

; void ff_hevc_idct_32x32_{8,10}_<opt>(int16_t *coeffs, int col_limit)
; %1 = bitdepth
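; the 256 bytes of stack space declared below hold the 16x4 matrix of
; intermediate e32 coeffs that TR_32x4 stores via STORE_16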
%macro IDCT_32x32 1
cglobal hevc_idct_32x32_%1, 1, 6, 16, 256, coeffs
    mov r1d, 7
.loop32:
    TR_32x4 8 * r1, %1, 1
    dec r1d
    jge .loop32

    call hevc_idct_transpose_32x32_ %+ cpuname

    mov r1d, 7
.loop32_2:
    TR_32x4 8 * r1, %1, 0
    dec r1d
    jge .loop32_2

    TAIL_CALL hevc_idct_transpose_32x32_ %+ cpuname, 1
%endmacro

%macro INIT_IDCT_DC 1
INIT_MMX mmxext
IDCT_DC_NL 4, %1
IDCT_DC    8, 2, %1

INIT_XMM sse2
IDCT_DC_NL 8, %1
IDCT_DC    16, 4, %1
IDCT_DC    32, 16, %1

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
IDCT_DC    16, 2, %1
IDCT_DC    32, 8, %1
%endif ; HAVE_AVX2_EXTERNAL
%endmacro

%macro INIT_IDCT 2
INIT_XMM %2
%if %1 == 8
TRANSPOSE_8x8
%if ARCH_X86_64
TRANSPOSE_16x16
TRANSPOSE_32x32
%endif
%endif
%if ARCH_X86_64
IDCT_32x32 %1
IDCT_16x16 %1
%endif
IDCT_8x8 %1
IDCT_4x4 %1
%endmacro

INIT_IDCT_DC 8
INIT_IDCT_DC 10
INIT_IDCT 8, sse2
INIT_IDCT 8, avx
INIT_IDCT 10, sse2
INIT_IDCT 10, avx