You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1015 lines
26KB

;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001@163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
  28. %include "libavutil/x86/x86util.asm"
  29. SECTION_RODATA
  30. ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
  31. scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
  32. db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
  33. db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
  34. db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
  35. db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
  36. db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
  37. db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
  38. db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
  39. db 4+11*8, 5+11*8, 4+12*8, 5+12*8
  40. db 6+11*8, 7+11*8, 6+12*8, 7+12*8
  41. db 4+13*8, 5+13*8, 4+14*8, 5+14*8
  42. db 6+13*8, 7+13*8, 6+14*8, 7+14*8
  43. %ifdef PIC
  44. %define npicregs 1
  45. %define scan8 picregq
  46. %else
  47. %define npicregs 0
  48. %define scan8 scan8_mem
  49. %endif
  50. cextern pw_32
  51. cextern pw_1
  52. SECTION .text
  53. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  54. %macro IDCT4_ADD 3
  55. ; Load dct coeffs
  56. movq m0, [%2]
  57. movq m1, [%2+8]
  58. movq m2, [%2+16]
  59. movq m3, [%2+24]
  60. IDCT4_1D w, 0, 1, 2, 3, 4, 5
  61. mova m6, [pw_32]
  62. TRANSPOSE4x4W 0, 1, 2, 3, 4
  63. paddw m0, m6
  64. IDCT4_1D w, 0, 1, 2, 3, 4, 5
  65. pxor m7, m7
  66. STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
  67. lea %1, [%1+%3*2]
  68. STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
  69. %endmacro
  70. INIT_MMX
  71. ; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
  72. cglobal h264_idct_add_8_mmx, 3, 3, 0
  73. IDCT4_ADD r0, r1, r2
  74. RET
  75. %macro IDCT8_1D 2
  76. mova m0, m1
  77. psraw m1, 1
  78. mova m4, m5
  79. psraw m4, 1
  80. paddw m4, m5
  81. paddw m1, m0
  82. paddw m4, m7
  83. paddw m1, m5
  84. psubw m4, m0
  85. paddw m1, m3
  86. psubw m0, m3
  87. psubw m5, m3
  88. psraw m3, 1
  89. paddw m0, m7
  90. psubw m5, m7
  91. psraw m7, 1
  92. psubw m0, m3
  93. psubw m5, m7
  94. mova m7, m1
  95. psraw m1, 2
  96. mova m3, m4
  97. psraw m3, 2
  98. paddw m3, m0
  99. psraw m0, 2
  100. paddw m1, m5
  101. psraw m5, 2
  102. psubw m0, m4
  103. psubw m7, m5
  104. mova m5, m6
  105. psraw m6, 1
  106. mova m4, m2
  107. psraw m4, 1
  108. paddw m6, m2
  109. psubw m4, m5
  110. mova m2, %1
  111. mova m5, %2
  112. SUMSUB_BA w, 5, 2
  113. SUMSUB_BA w, 6, 5
  114. SUMSUB_BA w, 4, 2
  115. SUMSUB_BA w, 7, 6
  116. SUMSUB_BA w, 0, 4
  117. SUMSUB_BA w, 3, 2
  118. SUMSUB_BA w, 1, 5
  119. SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
  120. %endmacro
  121. %macro IDCT8_1D_FULL 1
  122. mova m7, [%1+112]
  123. mova m6, [%1+ 96]
  124. mova m5, [%1+ 80]
  125. mova m3, [%1+ 48]
  126. mova m2, [%1+ 32]
  127. mova m1, [%1+ 16]
  128. IDCT8_1D [%1], [%1+ 64]
  129. %endmacro
  130. ; %1=int16_t *block, %2=int16_t *dstblock
  131. %macro IDCT8_ADD_MMX_START 2
  132. IDCT8_1D_FULL %1
  133. mova [%1], m7
  134. TRANSPOSE4x4W 0, 1, 2, 3, 7
  135. mova m7, [%1]
  136. mova [%2 ], m0
  137. mova [%2+16], m1
  138. mova [%2+32], m2
  139. mova [%2+48], m3
  140. TRANSPOSE4x4W 4, 5, 6, 7, 3
  141. mova [%2+ 8], m4
  142. mova [%2+24], m5
  143. mova [%2+40], m6
  144. mova [%2+56], m7
  145. %endmacro
  146. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  147. %macro IDCT8_ADD_MMX_END 3
  148. IDCT8_1D_FULL %2
  149. mova [%2 ], m5
  150. mova [%2+16], m6
  151. mova [%2+32], m7
  152. pxor m7, m7
  153. STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
  154. lea %1, [%1+%3*2]
  155. STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
  156. mova m0, [%2 ]
  157. mova m1, [%2+16]
  158. mova m2, [%2+32]
  159. lea %1, [%1+%3*2]
  160. STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
  161. lea %1, [%1+%3*2]
  162. STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
  163. %endmacro
  164. INIT_MMX
  165. ; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
  166. cglobal h264_idct8_add_8_mmx, 3, 4, 0
  167. %assign pad 128+4-(stack_offset&7)
  168. SUB rsp, pad
  169. add word [r1], 32
  170. IDCT8_ADD_MMX_START r1 , rsp
  171. IDCT8_ADD_MMX_START r1+8, rsp+64
  172. lea r3, [r0+4]
  173. IDCT8_ADD_MMX_END r0 , rsp, r2
  174. IDCT8_ADD_MMX_END r3 , rsp+8, r2
  175. ADD rsp, pad
  176. RET
  177. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  178. %macro IDCT8_ADD_SSE 4
  179. IDCT8_1D_FULL %2
  180. %if ARCH_X86_64
  181. TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
  182. %else
  183. TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
  184. %endif
  185. paddw m0, [pw_32]
  186. %if ARCH_X86_64 == 0
  187. mova [%2 ], m0
  188. mova [%2+16], m4
  189. IDCT8_1D [%2], [%2+ 16]
  190. mova [%2 ], m6
  191. mova [%2+16], m7
  192. %else
  193. SWAP 0, 8
  194. SWAP 4, 9
  195. IDCT8_1D m8, m9
  196. SWAP 6, 8
  197. SWAP 7, 9
  198. %endif
  199. pxor m7, m7
  200. lea %4, [%3*3]
  201. STORE_DIFF m0, m6, m7, [%1 ]
  202. STORE_DIFF m1, m6, m7, [%1+%3 ]
  203. STORE_DIFF m2, m6, m7, [%1+%3*2]
  204. STORE_DIFF m3, m6, m7, [%1+%4 ]
  205. %if ARCH_X86_64 == 0
  206. mova m0, [%2 ]
  207. mova m1, [%2+16]
  208. %else
  209. SWAP 0, 8
  210. SWAP 1, 9
  211. %endif
  212. lea %1, [%1+%3*4]
  213. STORE_DIFF m4, m6, m7, [%1 ]
  214. STORE_DIFF m5, m6, m7, [%1+%3 ]
  215. STORE_DIFF m0, m6, m7, [%1+%3*2]
  216. STORE_DIFF m1, m6, m7, [%1+%4 ]
  217. %endmacro
  218. INIT_XMM
  219. ; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
  220. cglobal h264_idct8_add_8_sse2, 3, 4, 10
  221. IDCT8_ADD_SSE r0, r1, r2, r3
  222. RET
  223. %macro DC_ADD_MMXEXT_INIT 2-3
  224. %if %0 == 2
  225. movsx %1, word [%1]
  226. add %1, 32
  227. sar %1, 6
  228. movd m0, %1d
  229. lea %1, [%2*3]
  230. %else
  231. add %3, 32
  232. sar %3, 6
  233. movd m0, %3d
  234. lea %3, [%2*3]
  235. %endif
  236. pshufw m0, m0, 0
  237. pxor m1, m1
  238. psubw m1, m0
  239. packuswb m0, m0
  240. packuswb m1, m1
  241. %endmacro
  242. %macro DC_ADD_MMXEXT_OP 4
  243. %1 m2, [%2 ]
  244. %1 m3, [%2+%3 ]
  245. %1 m4, [%2+%3*2]
  246. %1 m5, [%2+%4 ]
  247. paddusb m2, m0
  248. paddusb m3, m0
  249. paddusb m4, m0
  250. paddusb m5, m0
  251. psubusb m2, m1
  252. psubusb m3, m1
  253. psubusb m4, m1
  254. psubusb m5, m1
  255. %1 [%2 ], m2
  256. %1 [%2+%3 ], m3
  257. %1 [%2+%3*2], m4
  258. %1 [%2+%4 ], m5
  259. %endmacro
  260. INIT_MMX
  261. ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
  262. cglobal h264_idct_dc_add_8_mmxext, 3, 3, 0
  263. DC_ADD_MMXEXT_INIT r1, r2
  264. DC_ADD_MMXEXT_OP movh, r0, r2, r1
  265. RET
  266. ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
  267. cglobal h264_idct8_dc_add_8_mmxext, 3, 3, 0
  268. DC_ADD_MMXEXT_INIT r1, r2
  269. DC_ADD_MMXEXT_OP mova, r0, r2, r1
  270. lea r0, [r0+r2*4]
  271. DC_ADD_MMXEXT_OP mova, r0, r2, r1
  272. RET
  273. ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
  274. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  275. cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
  276. xor r5, r5
  277. %ifdef PIC
  278. lea picregq, [scan8_mem]
  279. %endif
  280. .nextblock:
  281. movzx r6, byte [scan8+r5]
  282. movzx r6, byte [r4+r6]
  283. test r6, r6
  284. jz .skipblock
  285. mov r6d, dword [r1+r5*4]
  286. lea r6, [r0+r6]
  287. IDCT4_ADD r6, r2, r3
  288. .skipblock:
  289. inc r5
  290. add r2, 32
  291. cmp r5, 16
  292. jl .nextblock
  293. REP_RET
  294. ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
  295. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  296. cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
  297. %assign pad 128+4-(stack_offset&7)
  298. SUB rsp, pad
  299. xor r5, r5
  300. %ifdef PIC
  301. lea picregq, [scan8_mem]
  302. %endif
  303. .nextblock:
  304. movzx r6, byte [scan8+r5]
  305. movzx r6, byte [r4+r6]
  306. test r6, r6
  307. jz .skipblock
  308. mov r6d, dword [r1+r5*4]
  309. add r6, r0
  310. add word [r2], 32
  311. IDCT8_ADD_MMX_START r2 , rsp
  312. IDCT8_ADD_MMX_START r2+8, rsp+64
  313. IDCT8_ADD_MMX_END r6 , rsp, r3
  314. mov r6d, dword [r1+r5*4]
  315. lea r6, [r0+r6+4]
  316. IDCT8_ADD_MMX_END r6 , rsp+8, r3
  317. .skipblock:
  318. add r5, 4
  319. add r2, 128
  320. cmp r5, 16
  321. jl .nextblock
  322. ADD rsp, pad
  323. RET
  324. ; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
  325. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  326. cglobal h264_idct_add16_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  327. xor r5, r5
  328. %ifdef PIC
  329. lea picregq, [scan8_mem]
  330. %endif
  331. .nextblock:
  332. movzx r6, byte [scan8+r5]
  333. movzx r6, byte [r4+r6]
  334. test r6, r6
  335. jz .skipblock
  336. cmp r6, 1
  337. jnz .no_dc
  338. movsx r6, word [r2]
  339. test r6, r6
  340. jz .no_dc
  341. DC_ADD_MMXEXT_INIT r2, r3, r6
  342. %if ARCH_X86_64 == 0
  343. %define dst2q r1
  344. %define dst2d r1d
  345. %endif
  346. mov dst2d, dword [r1+r5*4]
  347. lea dst2q, [r0+dst2q]
  348. DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
  349. %if ARCH_X86_64 == 0
  350. mov r1, r1m
  351. %endif
  352. inc r5
  353. add r2, 32
  354. cmp r5, 16
  355. jl .nextblock
  356. REP_RET
  357. .no_dc:
  358. mov r6d, dword [r1+r5*4]
  359. add r6, r0
  360. IDCT4_ADD r6, r2, r3
  361. .skipblock:
  362. inc r5
  363. add r2, 32
  364. cmp r5, 16
  365. jl .nextblock
  366. REP_RET
  367. ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
  368. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  369. cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
  370. xor r5, r5
  371. %ifdef PIC
  372. lea picregq, [scan8_mem]
  373. %endif
  374. .nextblock:
  375. movzx r6, byte [scan8+r5]
  376. movzx r6, byte [r4+r6]
  377. or r6w, word [r2]
  378. test r6, r6
  379. jz .skipblock
  380. mov r6d, dword [r1+r5*4]
  381. add r6, r0
  382. IDCT4_ADD r6, r2, r3
  383. .skipblock:
  384. inc r5
  385. add r2, 32
  386. cmp r5, 16
  387. jl .nextblock
  388. REP_RET
  389. ; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
  390. ; DCTELEM *block, int stride,
  391. ; const uint8_t nnzc[6*8])
  392. cglobal h264_idct_add16intra_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  393. xor r5, r5
  394. %ifdef PIC
  395. lea picregq, [scan8_mem]
  396. %endif
  397. .nextblock:
  398. movzx r6, byte [scan8+r5]
  399. movzx r6, byte [r4+r6]
  400. test r6, r6
  401. jz .try_dc
  402. mov r6d, dword [r1+r5*4]
  403. lea r6, [r0+r6]
  404. IDCT4_ADD r6, r2, r3
  405. inc r5
  406. add r2, 32
  407. cmp r5, 16
  408. jl .nextblock
  409. REP_RET
  410. .try_dc:
  411. movsx r6, word [r2]
  412. test r6, r6
  413. jz .skipblock
  414. DC_ADD_MMXEXT_INIT r2, r3, r6
  415. %if ARCH_X86_64 == 0
  416. %define dst2q r1
  417. %define dst2d r1d
  418. %endif
  419. mov dst2d, dword [r1+r5*4]
  420. add dst2q, r0
  421. DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
  422. %if ARCH_X86_64 == 0
  423. mov r1, r1m
  424. %endif
  425. .skipblock:
  426. inc r5
  427. add r2, 32
  428. cmp r5, 16
  429. jl .nextblock
  430. REP_RET
  431. ; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
  432. ; DCTELEM *block, int stride,
  433. ; const uint8_t nnzc[6*8])
  434. cglobal h264_idct8_add4_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  435. %assign pad 128+4-(stack_offset&7)
  436. SUB rsp, pad
  437. xor r5, r5
  438. %ifdef PIC
  439. lea picregq, [scan8_mem]
  440. %endif
  441. .nextblock:
  442. movzx r6, byte [scan8+r5]
  443. movzx r6, byte [r4+r6]
  444. test r6, r6
  445. jz .skipblock
  446. cmp r6, 1
  447. jnz .no_dc
  448. movsx r6, word [r2]
  449. test r6, r6
  450. jz .no_dc
  451. DC_ADD_MMXEXT_INIT r2, r3, r6
  452. %if ARCH_X86_64 == 0
  453. %define dst2q r1
  454. %define dst2d r1d
  455. %endif
  456. mov dst2d, dword [r1+r5*4]
  457. lea dst2q, [r0+dst2q]
  458. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  459. lea dst2q, [dst2q+r3*4]
  460. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  461. %if ARCH_X86_64 == 0
  462. mov r1, r1m
  463. %endif
  464. add r5, 4
  465. add r2, 128
  466. cmp r5, 16
  467. jl .nextblock
  468. ADD rsp, pad
  469. RET
  470. .no_dc:
  471. mov r6d, dword [r1+r5*4]
  472. add r6, r0
  473. add word [r2], 32
  474. IDCT8_ADD_MMX_START r2 , rsp
  475. IDCT8_ADD_MMX_START r2+8, rsp+64
  476. IDCT8_ADD_MMX_END r6 , rsp, r3
  477. mov r6d, dword [r1+r5*4]
  478. lea r6, [r0+r6+4]
  479. IDCT8_ADD_MMX_END r6 , rsp+8, r3
  480. .skipblock:
  481. add r5, 4
  482. add r2, 128
  483. cmp r5, 16
  484. jl .nextblock
  485. ADD rsp, pad
  486. RET
  487. INIT_XMM
  488. ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
  489. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  490. cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  491. xor r5, r5
  492. %ifdef PIC
  493. lea picregq, [scan8_mem]
  494. %endif
  495. .nextblock:
  496. movzx r6, byte [scan8+r5]
  497. movzx r6, byte [r4+r6]
  498. test r6, r6
  499. jz .skipblock
  500. cmp r6, 1
  501. jnz .no_dc
  502. movsx r6, word [r2]
  503. test r6, r6
  504. jz .no_dc
  505. INIT_MMX
  506. DC_ADD_MMXEXT_INIT r2, r3, r6
  507. %if ARCH_X86_64 == 0
  508. %define dst2q r1
  509. %define dst2d r1d
  510. %endif
  511. mov dst2d, dword [r1+r5*4]
  512. add dst2q, r0
  513. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  514. lea dst2q, [dst2q+r3*4]
  515. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  516. %if ARCH_X86_64 == 0
  517. mov r1, r1m
  518. %endif
  519. add r5, 4
  520. add r2, 128
  521. cmp r5, 16
  522. jl .nextblock
  523. REP_RET
  524. .no_dc:
  525. INIT_XMM
  526. mov dst2d, dword [r1+r5*4]
  527. add dst2q, r0
  528. IDCT8_ADD_SSE dst2q, r2, r3, r6
  529. %if ARCH_X86_64 == 0
  530. mov r1, r1m
  531. %endif
  532. .skipblock:
  533. add r5, 4
  534. add r2, 128
  535. cmp r5, 16
  536. jl .nextblock
  537. REP_RET
  538. INIT_MMX
  539. h264_idct_add8_mmx_plane:
  540. .nextblock:
  541. movzx r6, byte [scan8+r5]
  542. movzx r6, byte [r4+r6]
  543. or r6w, word [r2]
  544. test r6, r6
  545. jz .skipblock
  546. %if ARCH_X86_64
  547. mov r0d, dword [r1+r5*4]
  548. add r0, [dst2q]
  549. %else
  550. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  551. mov r0, [r0]
  552. add r0, dword [r1+r5*4]
  553. %endif
  554. IDCT4_ADD r0, r2, r3
  555. .skipblock:
  556. inc r5
  557. add r2, 32
  558. test r5, 3
  559. jnz .nextblock
  560. rep ret
  561. ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
  562. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  563. cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  564. mov r5, 16
  565. add r2, 512
  566. %ifdef PIC
  567. lea picregq, [scan8_mem]
  568. %endif
  569. %if ARCH_X86_64
  570. mov dst2q, r0
  571. %endif
  572. call h264_idct_add8_mmx_plane
  573. mov r5, 32
  574. add r2, 384
  575. %if ARCH_X86_64
  576. add dst2q, gprsize
  577. %else
  578. add r0mp, gprsize
  579. %endif
  580. call h264_idct_add8_mmx_plane
  581. RET
  582. h264_idct_add8_mmxext_plane:
  583. .nextblock:
  584. movzx r6, byte [scan8+r5]
  585. movzx r6, byte [r4+r6]
  586. test r6, r6
  587. jz .try_dc
  588. %if ARCH_X86_64
  589. mov r0d, dword [r1+r5*4]
  590. add r0, [dst2q]
  591. %else
  592. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  593. mov r0, [r0]
  594. add r0, dword [r1+r5*4]
  595. %endif
  596. IDCT4_ADD r0, r2, r3
  597. inc r5
  598. add r2, 32
  599. test r5, 3
  600. jnz .nextblock
  601. rep ret
  602. .try_dc:
  603. movsx r6, word [r2]
  604. test r6, r6
  605. jz .skipblock
  606. DC_ADD_MMXEXT_INIT r2, r3, r6
  607. %if ARCH_X86_64
  608. mov r0d, dword [r1+r5*4]
  609. add r0, [dst2q]
  610. %else
  611. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  612. mov r0, [r0]
  613. add r0, dword [r1+r5*4]
  614. %endif
  615. DC_ADD_MMXEXT_OP movh, r0, r3, r6
  616. .skipblock:
  617. inc r5
  618. add r2, 32
  619. test r5, 3
  620. jnz .nextblock
  621. rep ret
  622. ; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
  623. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  624. cglobal h264_idct_add8_8_mmxext, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  625. mov r5, 16
  626. add r2, 512
  627. %if ARCH_X86_64
  628. mov dst2q, r0
  629. %endif
  630. %ifdef PIC
  631. lea picregq, [scan8_mem]
  632. %endif
  633. call h264_idct_add8_mmxext_plane
  634. mov r5, 32
  635. add r2, 384
  636. %if ARCH_X86_64
  637. add dst2q, gprsize
  638. %else
  639. add r0mp, gprsize
  640. %endif
  641. call h264_idct_add8_mmxext_plane
  642. RET
  643. INIT_MMX
  644. ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
  645. h264_idct_dc_add8_mmxext:
  646. movd m0, [r2 ] ; 0 0 X D
  647. punpcklwd m0, [r2+32] ; x X d D
  648. paddsw m0, [pw_32]
  649. psraw m0, 6
  650. punpcklwd m0, m0 ; d d D D
  651. pxor m1, m1 ; 0 0 0 0
  652. psubw m1, m0 ; -d-d-D-D
  653. packuswb m0, m1 ; -d-d-D-D d d D D
  654. pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
  655. punpcklwd m0, m0 ; d d d d D D D D
  656. lea r6, [r3*3]
  657. DC_ADD_MMXEXT_OP movq, r0, r3, r6
  658. ret
  659. ALIGN 16
  660. INIT_XMM
  661. ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
  662. h264_add8x4_idct_sse2:
  663. movq m0, [r2+ 0]
  664. movq m1, [r2+ 8]
  665. movq m2, [r2+16]
  666. movq m3, [r2+24]
  667. movhps m0, [r2+32]
  668. movhps m1, [r2+40]
  669. movhps m2, [r2+48]
  670. movhps m3, [r2+56]
  671. IDCT4_1D w,0,1,2,3,4,5
  672. TRANSPOSE2x4x4W 0,1,2,3,4
  673. paddw m0, [pw_32]
  674. IDCT4_1D w,0,1,2,3,4,5
  675. pxor m7, m7
  676. STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
  677. lea r0, [r0+r3*2]
  678. STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
  679. ret
  680. %macro add16_sse2_cycle 2
  681. movzx r0, word [r4+%2]
  682. test r0, r0
  683. jz .cycle%1end
  684. mov r0d, dword [r1+%1*8]
  685. %if ARCH_X86_64
  686. add r0, r5
  687. %else
  688. add r0, r0m
  689. %endif
  690. call h264_add8x4_idct_sse2
  691. .cycle%1end:
  692. %if %1 < 7
  693. add r2, 64
  694. %endif
  695. %endmacro
  696. ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
  697. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  698. cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
  699. %if ARCH_X86_64
  700. mov r5, r0
  701. %endif
  702. ; unrolling of the loop leads to an average performance gain of
  703. ; 20-25%
  704. add16_sse2_cycle 0, 0xc
  705. add16_sse2_cycle 1, 0x14
  706. add16_sse2_cycle 2, 0xe
  707. add16_sse2_cycle 3, 0x16
  708. add16_sse2_cycle 4, 0x1c
  709. add16_sse2_cycle 5, 0x24
  710. add16_sse2_cycle 6, 0x1e
  711. add16_sse2_cycle 7, 0x26
  712. RET
  713. %macro add16intra_sse2_cycle 2
  714. movzx r0, word [r4+%2]
  715. test r0, r0
  716. jz .try%1dc
  717. mov r0d, dword [r1+%1*8]
  718. %if ARCH_X86_64
  719. add r0, r7
  720. %else
  721. add r0, r0m
  722. %endif
  723. call h264_add8x4_idct_sse2
  724. jmp .cycle%1end
  725. .try%1dc:
  726. movsx r0, word [r2 ]
  727. or r0w, word [r2+32]
  728. jz .cycle%1end
  729. mov r0d, dword [r1+%1*8]
  730. %if ARCH_X86_64
  731. add r0, r7
  732. %else
  733. add r0, r0m
  734. %endif
  735. call h264_idct_dc_add8_mmxext
  736. .cycle%1end:
  737. %if %1 < 7
  738. add r2, 64
  739. %endif
  740. %endmacro
  741. ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
  742. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  743. cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
  744. %if ARCH_X86_64
  745. mov r7, r0
  746. %endif
  747. add16intra_sse2_cycle 0, 0xc
  748. add16intra_sse2_cycle 1, 0x14
  749. add16intra_sse2_cycle 2, 0xe
  750. add16intra_sse2_cycle 3, 0x16
  751. add16intra_sse2_cycle 4, 0x1c
  752. add16intra_sse2_cycle 5, 0x24
  753. add16intra_sse2_cycle 6, 0x1e
  754. add16intra_sse2_cycle 7, 0x26
  755. RET
  756. %macro add8_sse2_cycle 2
  757. movzx r0, word [r4+%2]
  758. test r0, r0
  759. jz .try%1dc
  760. %if ARCH_X86_64
  761. mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  762. add r0, [r7]
  763. %else
  764. mov r0, r0m
  765. mov r0, [r0]
  766. add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  767. %endif
  768. call h264_add8x4_idct_sse2
  769. jmp .cycle%1end
  770. .try%1dc:
  771. movsx r0, word [r2 ]
  772. or r0w, word [r2+32]
  773. jz .cycle%1end
  774. %if ARCH_X86_64
  775. mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  776. add r0, [r7]
  777. %else
  778. mov r0, r0m
  779. mov r0, [r0]
  780. add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  781. %endif
  782. call h264_idct_dc_add8_mmxext
  783. .cycle%1end:
  784. %if %1 == 1
  785. add r2, 384+64
  786. %elif %1 < 3
  787. add r2, 64
  788. %endif
  789. %endmacro
  790. ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
  791. ; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
  792. cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
  793. add r2, 512
  794. %if ARCH_X86_64
  795. mov r7, r0
  796. %endif
  797. add8_sse2_cycle 0, 0x34
  798. add8_sse2_cycle 1, 0x3c
  799. %if ARCH_X86_64
  800. add r7, gprsize
  801. %else
  802. add r0mp, gprsize
  803. %endif
  804. add8_sse2_cycle 2, 0x5c
  805. add8_sse2_cycle 3, 0x64
  806. RET
  807. ;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
  808. %macro WALSH4_1D 5
  809. SUMSUB_BADC w, %4, %3, %2, %1, %5
  810. SUMSUB_BADC w, %4, %2, %3, %1, %5
  811. SWAP %1, %4, %3
  812. %endmacro
  813. %macro DEQUANT_MMX 3
  814. mova m7, [pw_1]
  815. mova m4, %1
  816. punpcklwd %1, m7
  817. punpckhwd m4, m7
  818. mova m5, %2
  819. punpcklwd %2, m7
  820. punpckhwd m5, m7
  821. movd m7, t3d
  822. punpckldq m7, m7
  823. pmaddwd %1, m7
  824. pmaddwd %2, m7
  825. pmaddwd m4, m7
  826. pmaddwd m5, m7
  827. psrad %1, %3
  828. psrad %2, %3
  829. psrad m4, %3
  830. psrad m5, %3
  831. packssdw %1, m4
  832. packssdw %2, m5
  833. %endmacro
  834. %macro STORE_WORDS_MMX 5
  835. movd t0d, %1
  836. psrlq %1, 32
  837. movd t1d, %1
  838. mov [t2+%2*32], t0w
  839. mov [t2+%4*32], t1w
  840. shr t0d, 16
  841. shr t1d, 16
  842. mov [t2+%3*32], t0w
  843. mov [t2+%5*32], t1w
  844. %endmacro
  845. %macro DEQUANT_STORE_MMX 1
  846. DEQUANT_MMX m0, m1, %1
  847. STORE_WORDS_MMX m0, 0, 1, 4, 5
  848. STORE_WORDS_MMX m1, 2, 3, 6, 7
  849. DEQUANT_MMX m2, m3, %1
  850. STORE_WORDS_MMX m2, 8, 9, 12, 13
  851. STORE_WORDS_MMX m3, 10, 11, 14, 15
  852. %endmacro
  853. %macro STORE_WORDS_SSE 9
  854. movd t0d, %1
  855. psrldq %1, 4
  856. movd t1d, %1
  857. psrldq %1, 4
  858. mov [t2+%2*32], t0w
  859. mov [t2+%4*32], t1w
  860. shr t0d, 16
  861. shr t1d, 16
  862. mov [t2+%3*32], t0w
  863. mov [t2+%5*32], t1w
  864. movd t0d, %1
  865. psrldq %1, 4
  866. movd t1d, %1
  867. mov [t2+%6*32], t0w
  868. mov [t2+%8*32], t1w
  869. shr t0d, 16
  870. shr t1d, 16
  871. mov [t2+%7*32], t0w
  872. mov [t2+%9*32], t1w
  873. %endmacro
  874. %macro DEQUANT_STORE_SSE2 1
  875. movd xmm4, t3d
  876. movq xmm5, [pw_1]
  877. pshufd xmm4, xmm4, 0
  878. movq2dq xmm0, m0
  879. movq2dq xmm1, m1
  880. movq2dq xmm2, m2
  881. movq2dq xmm3, m3
  882. punpcklwd xmm0, xmm5
  883. punpcklwd xmm1, xmm5
  884. punpcklwd xmm2, xmm5
  885. punpcklwd xmm3, xmm5
  886. pmaddwd xmm0, xmm4
  887. pmaddwd xmm1, xmm4
  888. pmaddwd xmm2, xmm4
  889. pmaddwd xmm3, xmm4
  890. psrad xmm0, %1
  891. psrad xmm1, %1
  892. psrad xmm2, %1
  893. psrad xmm3, %1
  894. packssdw xmm0, xmm1
  895. packssdw xmm2, xmm3
  896. STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
  897. STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
  898. %endmacro
  899. %macro IDCT_DC_DEQUANT 2
  900. cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
  901. ; manually spill XMM registers for Win64 because
  902. ; the code here is initialized with INIT_MMX
  903. WIN64_SPILL_XMM %2
  904. movq m3, [r1+24]
  905. movq m2, [r1+16]
  906. movq m1, [r1+ 8]
  907. movq m0, [r1+ 0]
  908. WALSH4_1D 0,1,2,3,4
  909. TRANSPOSE4x4W 0,1,2,3,4
  910. WALSH4_1D 0,1,2,3,4
  911. ; shift, tmp, output, qmul
  912. %if WIN64
  913. DECLARE_REG_TMP 0,3,1,2
  914. ; we can't avoid this, because r0 is the shift register (ecx) on win64
  915. xchg r0, t2
  916. %elif ARCH_X86_64
  917. DECLARE_REG_TMP 3,1,0,2
  918. %else
  919. DECLARE_REG_TMP 1,3,0,2
  920. %endif
  921. cmp t3d, 32767
  922. jg .big_qmul
  923. add t3d, 128 << 16
  924. %ifidn %1,mmx
  925. DEQUANT_STORE_MMX 8
  926. %else
  927. DEQUANT_STORE_SSE2 8
  928. %endif
  929. RET
  930. .big_qmul:
  931. bsr t0d, t3d
  932. add t3d, 128 << 16
  933. mov t1d, 7
  934. cmp t0d, t1d
  935. cmovg t0d, t1d
  936. inc t1d
  937. shr t3d, t0b
  938. sub t1d, t0d
  939. %ifidn %1,mmx
  940. movd m6, t1d
  941. DEQUANT_STORE_MMX m6
  942. %else
  943. movd xmm6, t1d
  944. DEQUANT_STORE_SSE2 xmm6
  945. %endif
  946. RET
  947. %endmacro
  948. INIT_MMX
  949. IDCT_DC_DEQUANT mmx, 0
  950. IDCT_DC_DEQUANT sse2, 7