;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
%if mmsize == 8
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%else
    punpcklwd    m0, m1
    punpcklwd    m2, m3
    SBUTTERFLY   dq, 0, 2, 4
    MOVHL        m1, m0
    MOVHL        m3, m2
%endif
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    movsxdifnidn r2, r2d
    IDCT4_ADD    r0, r1, r2
    RET
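
; 1-D 8-point iDCT pass
; in:  m1-m3 and m5-m7 hold lines 1-3 and 5-7; %1, %2 are memory operands with lines 0 and 4
; out: m0-m7 = the eight transformed lines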
%macro IDCT8_1D 2
    psraw        m0, m1, 1
    SWAP 0, 1
    psraw        m4, m5, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3
    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7
    psraw        m7, m1, 2
    SWAP 7, 1
    psraw        m3, m4, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5
    psraw        m5, m6, 1
    SWAP 5, 6
    psraw        m4, m2, 1
    paddw        m6, m2
    psubw        m4, m5
    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7
    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    movsxdifnidn r2, r2d
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1,   rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0,   rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3,   rsp+8, r2

    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP 0, 8
    SWAP 4, 9
    IDCT8_1D     m8, m9
    SWAP 6, 8
    SWAP 7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP 0, 8
    SWAP 1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    movsxdifnidn r2, r2d
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
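
; compute a rounded DC pixel offset for DC-only blocks
; in:  %1 = DC coefficient (GPR), %2 = stride
; out: m0 = packed +DC, m1 = packed -DC (both saturated to bytes), %1 = 3*stride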
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro
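
; add the DC offset prepared by DC_ADD_MMXEXT_INIT to four rows of pixels
; %1 = load/store instruction (movd/movh/movq/mova), %2 = dst, %3 = stride, %4 = 3*stride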
%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov   dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsxd       r2, r2d
    movsx        r3, word [r1]
    mov   dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov   dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov   dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2,   rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6,   rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6,   rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2,   rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6,   rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6,   rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
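; helper: adds four consecutive 4x4 blocks of one chroma plane
; in: r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc, r5 = index of the first block;
;     the plane's dst pointer is taken from dst2q on x86-64 or from the caller's
;     stack arguments on x86-32; r0 and r6 are clobbered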
h264_idct_add8_mmx_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET ; TODO: check rep ret after a function call

cglobal h264_idct_add8_422_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
; dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif

    mov          r5, 16  ; i
    add          r2, 512 ; i * 16 * sizeof(dctcoef) ; #define dctcoef int16_t

    call h264_idct_add8_mmx_plane
    add          r5, 4
    call h264_idct_add8_mmx_plane

%if ARCH_X86_64
    add       dst2q, gprsize ; dest[1]
%else
    add        r0mp, gprsize
%endif

    add          r5, 4   ; set to 32
    add          r2, 256 ; set to i * 16 * sizeof(dctcoef)

    call h264_idct_add8_mmx_plane
    add          r5, 4
    call h264_idct_add8_mmx_plane
    RET ; TODO: check rep ret after a function call
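
; same register contract as h264_idct_add8_mmx_plane, with a separate fast path
; for blocks that contain only a DC coefficient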
h264_idct_add8_mmxext_plane:
    movsxdifnidn r3, r3d
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    movsxdifnidn r3, r3d
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET ; TODO: check rep ret after a function call

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmxext:
    movsxdifnidn r3, r3d
    movd         m0, [r2   ]    ;   0 0 X D
    mov   word [r2+ 0], 0
    punpcklwd    m0, [r2+32]    ;   x X d D
    mov   word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0         ;  d d D D
    pxor         m1, m1         ;  0 0 0 0
    psubw        m1, m0         ; -d-d-D-D
    packuswb     m0, m1         ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA   ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0         ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movsxdifnidn r3, r3d
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    REP_RET

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    REP_RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    movsxdifnidn r3, r3d
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    REP_RET

; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
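
; 1-D 4-point Walsh-Hadamard transform on registers %1-%4 (%5 = temporary)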
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro
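
; multiply the coefficients by the dequant constant held in t3d (qmul in the
; low word, rounding constant in the high word) and shift the products right
; sse2 version: %1 = shift; mmx version: %1, %2 = coefficient registers, %3 = shift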
%macro DEQUANT 1-3
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
%else
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endif
%endmacro
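
; scatter the packed words of %1 to [t2 + n*32] for the given indices n
; (the DC position of each 4x4 block in the output)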
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov   [t2+%2*32], t0w
    mov   [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov   [t2+%3*32], t0w
    mov   [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov   [t2+%6*32], t0w
    mov   [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov   [t2+%7*32], t0w
    mov   [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov   [t2+%2*32], t0w
    mov   [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov   [t2+%3*32], t0w
    mov   [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    DEQUANT      %1
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT      m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7

    DEQUANT      m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7

; %unmacro STORE_DIFFx2 8 ; remove macro from x86util.asm but yasm doesn't have this yet
%macro STORE_DIFFx2 8 ; add1, add2, reg1, reg2, zero, shift, source, stride
    movd         %3, [%7]
    movd         %4, [%7+%8]
    psraw        %1, %6
    psraw        %2, %6
    punpcklbw    %3, %5
    punpcklbw    %4, %5
    paddw        %3, %1
    paddw        %4, %2
    packuswb     %3, %5
    packuswb     %4, %5
    movd       [%7], %3
    movd    [%7+%8], %4
%endmacro

%macro DC_ADD_INIT 1
    add         %1d, 32
    sar         %1d, 6
    movd         m0, %1d
    pshuflw      m0, m0, 0
    lea          %1, [3*stride_q]
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro IDCT_XMM 1
INIT_XMM %1
cglobal h264_idct_add_8, 3, 3, 8, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    IDCT4_ADD    dst_q, block_q, stride_q
    RET

cglobal h264_idct_dc_add_8, 3, 4, 6, dst_, block_, stride_
    movsxdifnidn stride_q, stride_d
    movsx       r3d, word [block_q]
    mov   dword [block_q], 0
    DC_ADD_INIT  r3
    DC_ADD_MMXEXT_OP movd, dst_q, stride_q, r3
    RET
%endmacro

IDCT_XMM sse2
IDCT_XMM avx