You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1014 lines
26KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2-optimized H.264 iDCT
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2003-2008 x264 project
  6. ;*
  7. ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
  8. ;* Loren Merritt <lorenm@u.washington.edu>
  9. ;* Holger Lubitz <hal@duncan.ol.sub.de>
  10. ;* Min Chen <chenm001.163.com>
  11. ;*
  12. ;* This file is part of Libav.
  13. ;*
  14. ;* Libav is free software; you can redistribute it and/or
  15. ;* modify it under the terms of the GNU Lesser General Public
  16. ;* License as published by the Free Software Foundation; either
  17. ;* version 2.1 of the License, or (at your option) any later version.
  18. ;*
  19. ;* Libav is distributed in the hope that it will be useful,
  20. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  22. ;* Lesser General Public License for more details.
  23. ;*
  24. ;* You should have received a copy of the GNU Lesser General Public
  25. ;* License along with Libav; if not, write to the Free Software
  26. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  27. ;*****************************************************************************
  28. %include "x86inc.asm"
  29. %include "x86util.asm"
  30. SECTION_RODATA
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
; scan8_mem: per-4x4-block lookup into the non-zero-count cache.
; Each byte is x + y*8, i.e. an offset into the 8-wide scan8/nnzc grid
; (same layout as ff_h264_scan8[] in h264data.h).
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
db 4+11*8, 5+11*8, 4+12*8, 5+12*8
db 6+11*8, 7+11*8, 6+12*8, 7+12*8
db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
; PIC builds cannot address scan8_mem absolutely: reserve one extra GPR
; (picregq) and index the table through it.
%define npicregs 1
%define scan8 picregq
%else
; Non-PIC: address the table directly, no extra register needed.
%define npicregs 0
%define scan8 scan8_mem
%endif
; Shared constants defined elsewhere in the project (x86util/constants).
cextern pw_32
cextern pw_1
  53. SECTION .text
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; 4x4 iDCT of one block, result added to dst with saturation.
; Clobbers m0-m7 and advances %1 by 2 rows.
%macro IDCT4_ADD 3
; Load dct coeffs
movq m0, [%2]
movq m1, [%2+8]
movq m2, [%2+16]
movq m3, [%2+24]
IDCT4_1D w, 0, 1, 2, 3, 4, 5 ; 1-D transform on rows
mova m6, [pw_32]
TRANSPOSE4x4W 0, 1, 2, 3, 4
paddw m0, m6 ; +32 for rounding before the final >>6
IDCT4_1D w, 0, 1, 2, 3, 4, 5 ; 1-D transform on columns
pxor m7, m7 ; zero register for STORE_DIFF unpacking
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
; Single 4x4 iDCT+add; thin wrapper around IDCT4_ADD.
cglobal h264_idct_add_8_mmx, 3, 3, 0
IDCT4_ADD r0, r1, r2
RET
; One 1-D pass of the 8-point H.264 iDCT.
; Rows 1,2,3,5,6,7 are expected in m1,m2,m3,m5,m6,m7; rows 0 and 4 are
; read from memory operands %1 and %2 (only 8 mm registers available).
; Output rows end up in m0..m7 after the final SWAP.
; Statement order is significant throughout (registers are reused
; aggressively); do not reorder.
%macro IDCT8_1D 2
; Odd half: combine rows 1,3,5,7 (shifts implement the 1/2 and 1/4
; scale factors of the H.264 8x8 transform).
mova m0, m1
psraw m1, 1
mova m4, m5
psraw m4, 1
paddw m4, m5
paddw m1, m0
paddw m4, m7
paddw m1, m5
psubw m4, m0
paddw m1, m3
psubw m0, m3
psubw m5, m3
psraw m3, 1
paddw m0, m7
psubw m5, m7
psraw m7, 1
psubw m0, m3
psubw m5, m7
mova m7, m1
psraw m1, 2
mova m3, m4
psraw m3, 2
paddw m3, m0
psraw m0, 2
paddw m1, m5
psraw m5, 2
psubw m0, m4
psubw m7, m5
; Even half: rows 2 and 6 ...
mova m5, m6
psraw m6, 1
mova m4, m2
psraw m4, 1
paddw m6, m2
psubw m4, m5
; ... then rows 0 and 4 loaded from memory.
mova m2, %1
mova m5, %2
SUMSUB_BA w, 5, 2
SUMSUB_BA w, 6, 5
; Butterfly even and odd halves together.
SUMSUB_BA w, 4, 2
SUMSUB_BA w, 7, 6
SUMSUB_BA w, 0, 4
SUMSUB_BA w, 3, 2
SUMSUB_BA w, 1, 5
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
; Load rows 1,2,3,5,6,7 of an 8x8 block from %1 and run IDCT8_1D
; (which itself reads rows 0 and 4 directly from memory).
%macro IDCT8_1D_FULL 1
mova m7, [%1+112]
mova m6, [%1+ 96]
mova m5, [%1+ 80]
mova m3, [%1+ 48]
mova m2, [%1+ 32]
mova m1, [%1+ 16]
IDCT8_1D [%1], [%1+ 64]
%endmacro
; %1=int16_t *block, %2=int16_t *dstblock
; First (row) pass of the MMX 8x8 iDCT on a 4-wide half of the block:
; 1-D transform, then transpose the two 4x4 quadrants into %2.
%macro IDCT8_ADD_MMX_START 2
IDCT8_1D_FULL %1
; Spill m7: TRANSPOSE4x4W needs a scratch register.
mova [%1], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16], m1
mova [%2+32], m2
mova [%2+48], m3
TRANSPOSE4x4W 4, 5, 6, 7, 3
mova [%2+ 8], m4
mova [%2+24], m5
mova [%2+40], m6
mova [%2+56], m7
%endmacro
  147. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  148. %macro IDCT8_ADD_MMX_END 3
  149. IDCT8_1D_FULL %2
  150. mova [%2 ], m5
  151. mova [%2+16], m6
  152. mova [%2+32], m7
  153. pxor m7, m7
  154. STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
  155. lea %1, [%1+%3*2]
  156. STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
  157. mova m0, [%2 ]
  158. mova m1, [%2+16]
  159. mova m2, [%2+32]
  160. lea %1, [%1+%3*2]
  161. STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
  162. lea %1, [%1+%3*2]
  163. STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
  164. %endmacro
INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
; 8x8 iDCT+add done as two 4-wide halves through a 128-byte stack
; scratch buffer (MMX regs are only 64 bits wide).
cglobal h264_idct8_add_8_mmx, 3, 4, 0
; 128B scratch + padding so rsp ends up 8-byte aligned.
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
add word [r1], 32 ; fold the +32 rounding term into DC
IDCT8_ADD_MMX_START r1 , rsp
IDCT8_ADD_MMX_START r1+8, rsp+64
lea r3, [r0+4] ; dst pointer for the right half
IDCT8_ADD_MMX_END r0 , rsp, r2
IDCT8_ADD_MMX_END r3 , rsp+8, r2
ADD rsp, pad
RET
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; (%4 = scratch GPR used for stride*3)
; Full 8x8 iDCT+add in SSE2: row pass, transpose, column pass, store.
; x86-32 spills rows 0/4 (then 6/7) to the block buffer; x86-64 keeps
; them in m8/m9 instead.
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
paddw m0, [pw_32] ; rounding before the final >>6
%if ARCH_X86_64 == 0
; IDCT8_1D reads rows 0/4 from memory: stage them in the block buffer.
mova [%2 ], m0
mova [%2+16], m4
IDCT8_1D [%2], [%2+ 16]
; Save m6/m7 (rows 6/7) for the second store batch below.
mova [%2 ], m6
mova [%2+16], m7
%else
SWAP 0, 8
SWAP 4, 9
IDCT8_1D m8, m9
SWAP 6, 8
SWAP 7, 9
%endif
pxor m7, m7 ; zero register for STORE_DIFF
lea %4, [%3*3]
STORE_DIFF m0, m6, m7, [%1 ]
STORE_DIFF m1, m6, m7, [%1+%3 ]
STORE_DIFF m2, m6, m7, [%1+%3*2]
STORE_DIFF m3, m6, m7, [%1+%4 ]
%if ARCH_X86_64 == 0
; Reload the spilled rows 6/7.
mova m0, [%2 ]
mova m1, [%2+16]
%else
SWAP 0, 8
SWAP 1, 9
%endif
lea %1, [%1+%3*4]
STORE_DIFF m4, m6, m7, [%1 ]
STORE_DIFF m5, m6, m7, [%1+%3 ]
STORE_DIFF m0, m6, m7, [%1+%3*2]
STORE_DIFF m1, m6, m7, [%1+%4 ]
%endmacro
INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
; Single 8x8 iDCT+add; wrapper around IDCT8_ADD_SSE (needs 10 xmm regs
; on x86-64 for the register-only path).
cglobal h264_idct8_add_8_sse2, 3, 4, 10
IDCT8_ADD_SSE r0, r1, r2, r3
RET
; Prepare a DC-only add: compute dc = (coeff+32)>>6, broadcast +dc into
; m0 and -dc into m1 (both packed to bytes), and set the reg used for
; the dc to stride*3 for DC_ADD_MMX2_OP addressing.
; 2-arg form: %1 = block pointer (clobbered: reused for the dc value),
;             %2 = stride.
; 3-arg form: %3 already holds the sign-extended dc, %2 = stride.
%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
movsx %1, word [%1]
add %1, 32
sar %1, 6
movd m0, %1d
lea %1, [%2*3]
%else
add %3, 32
sar %3, 6
movd m0, %3d
lea %3, [%2*3]
%endif
pshufw m0, m0, 0 ; broadcast dc to all 4 words
pxor m1, m1
psubw m1, m0 ; m1 = -dc
; Pack to bytes: paddusb m0 / psubusb m1 in DC_ADD_MMX2_OP together
; implement a signed add with unsigned-saturating ops.
packuswb m0, m0
packuswb m1, m1
%endmacro
; Add the broadcast DC (m0 = +dc, m1 = -dc, from DC_ADD_MMX2_INIT) to
; 4 rows of pixels. %1 = mov macro (movh/mova/movq), %2 = dst,
; %3 = stride, %4 = stride*3. Clobbers m2-m5.
%macro DC_ADD_MMX2_OP 4
%1 m2, [%2 ]
%1 m3, [%2+%3 ]
%1 m4, [%2+%3*2]
%1 m5, [%2+%4 ]
; Saturating +dc then -(-dc): net signed add clamped to [0,255].
paddusb m2, m0
paddusb m3, m0
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
psubusb m4, m1
psubusb m5, m1
%1 [%2 ], m2
%1 [%2+%3 ], m3
%1 [%2+%3*2], m4
%1 [%2+%4 ], m5
%endmacro
INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
; DC-only 4x4 add (4 rows, 4 pixels wide via movh).
cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
DC_ADD_MMX2_INIT r1, r2 ; r1 is clobbered; reused as stride*3
DC_ADD_MMX2_OP movh, r0, r2, r1
RET
; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
; DC-only 8x8 add: two 4-row passes, 8 pixels wide via mova.
cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
DC_ADD_MMX2_INIT r1, r2
DC_ADD_MMX2_OP mova, r0, r2, r1
lea r0, [r0+r2*4] ; advance to the lower 4 rows
DC_ADD_MMX2_OP mova, r0, r2, r1
RET
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Loop over the 16 luma 4x4 blocks; skip blocks whose nnz cache entry
; is zero.
cglobal h264_idct_add16_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5 ; r5 = block counter 0..15
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5] ; scan8 position of block r5
movzx r6, byte [r4+r6] ; its nnz count
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4] ; dst offset of this block
lea r6, [r0+r6]
IDCT4_ADD r6, r2, r3
.skipblock
inc r5
add r2, 32 ; next block's coefficients
cmp r5, 16
jl .nextblock
REP_RET
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Loop over the 4 luma 8x8 blocks (r5 steps by 4 since scan8 is indexed
; per 4x4 block); each 8x8 is done as two MMX halves via stack scratch.
cglobal h264_idct8_add4_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6] ; nnz count; 0 => skip whole 8x8
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
add r6, r0
add word [r2], 32 ; fold rounding into DC
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3
mov r6d, dword [r1+r5*4] ; recompute dst (r6 was advanced)
lea r6, [r0+r6+4] ; right half of the 8x8
IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock
add r5, 4
add r2, 128 ; next 8x8's coefficients
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Like add16_mmx, but blocks with nnz==1 and a nonzero DC take the
; cheap DC-only path.
cglobal h264_idct_add16_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1 ; nnz==1 may be DC-only
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
DC_ADD_MMX2_INIT r2, r3, r6
%if ARCH_X86_64 == 0
; x86-32 has no spare reg: borrow r1 (block_offset) as dst2,
; restored from the stack arg after use.
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMX2_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m ; restore block_offset
%endif
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
.no_dc
mov r6d, dword [r1+r5*4]
add r6, r0
IDCT4_ADD r6, r2, r3
.skipblock
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Intra variant: a block is processed when either its nnz entry or its
; DC coefficient is nonzero (DC may come from the separate luma DC
; transform).
cglobal h264_idct_add16intra_8_mmx, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
or r6w, word [r2] ; nnz | DC coefficient
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
add r6, r0
IDCT4_ADD r6, r2, r3
.skipblock
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Intra variant with a DC-only fast path: nnz!=0 => full iDCT;
; nnz==0 but DC!=0 => DC-only add; both zero => skip.
cglobal h264_idct_add16intra_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6]
IDCT4_ADD r6, r2, r3
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
.try_dc
movsx r6, word [r2]
test r6, r6
jz .skipblock
DC_ADD_MMX2_INIT r2, r3, r6
%if ARCH_X86_64 == 0
; x86-32: borrow r1 (block_offset) as dst2, restore from stack after.
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
add dst2q, r0
DC_ADD_MMX2_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m ; restore block_offset
%endif
.skipblock
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; 4x 8x8 loop like idct8_add4_mmx, plus a DC-only fast path for
; nnz==1 blocks with a nonzero DC.
cglobal h264_idct8_add4_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
; DC-only: add the broadcast DC to all 8 rows.
DC_ADD_MMX2_INIT r2, r3, r6
%if ARCH_X86_64 == 0
; x86-32: borrow r1 (block_offset) as dst2, restore after.
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMX2_OP mova, dst2q, r3, r6
lea dst2q, [dst2q+r3*4]
DC_ADD_MMX2_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
.no_dc
mov r6d, dword [r1+r5*4]
add r6, r0
add word [r2], 32 ; fold rounding into DC
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4] ; right half
IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; 4x 8x8 loop; full blocks use the SSE2 iDCT, DC-only blocks drop to
; the MMX DC path (INIT_MMX/INIT_XMM switch mm vs xmm register names).
cglobal h264_idct8_add4_8_sse2, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
INIT_MMX ; DC path uses MMX registers
DC_ADD_MMX2_INIT r2, r3, r6
%if ARCH_X86_64 == 0
; x86-32: borrow r1 (block_offset) as dst2, restore after.
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
add dst2q, r0
DC_ADD_MMX2_OP mova, dst2q, r3, r6
lea dst2q, [dst2q+r3*4]
DC_ADD_MMX2_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
REP_RET
.no_dc
INIT_XMM ; back to xmm names for the full iDCT
mov dst2d, dword [r1+r5*4]
add dst2q, r0
IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
.skipblock
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
REP_RET
INIT_MMX
; Helper for h264_idct_add8_8_mmx: process one chroma plane (4 blocks,
; loop ends when r5 hits a multiple of 4). Expects r5 = start index,
; r2 = coeffs, r3 = stride, r4 = nnzc; dst taken from the uint8_t**
; dest array (dst2q on x86-64, via r0m on x86-32). Clobbers r0, r6.
.nextblock
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
or r6w, word [r2] ; nnz | DC (intra-style check)
test r6, r6
jz .skipblock
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q] ; dst2q points into the dest[] array
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
IDCT4_ADD r0, r2, r3
.skipblock
inc r5
add r2, 32
test r5, 3 ; stop at the next multiple of 4
jnz .nextblock
rep ret
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Chroma add: run the 4-block plane helper once per chroma plane,
; stepping the dest pointer between calls.
cglobal h264_idct_add8_8_mmx, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16 ; chroma blocks start at scan8 index 16
add r2, 512 ; ... and at coefficient offset 16*32
%ifdef PIC
lea picregq, [scan8_mem]
%endif
%if ARCH_X86_64
mov dst2q, r0 ; keep &dest[0] across the helper calls
%endif
call h264_idct_add8_mmx_plane
mov r5, 32 ; second chroma plane
add r2, 384 ; 512+384 = 28*32
%if ARCH_X86_64
add dst2q, gprsize ; advance to &dest[1]
%else
add r0mp, gprsize ; advance the stack copy instead
%endif
call h264_idct_add8_mmx_plane
RET
  581. h264_idct_add8_mmx2_plane
  582. .nextblock
  583. movzx r6, byte [scan8+r5]
  584. movzx r6, byte [r4+r6]
  585. test r6, r6
  586. jz .try_dc
  587. %if ARCH_X86_64
  588. mov r0d, dword [r1+r5*4]
  589. add r0, [dst2q]
  590. %else
  591. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  592. mov r0, [r0]
  593. add r0, dword [r1+r5*4]
  594. %endif
  595. IDCT4_ADD r0, r2, r3
  596. inc r5
  597. add r2, 32
  598. test r5, 3
  599. jnz .nextblock
  600. rep ret
  601. .try_dc
  602. movsx r6, word [r2]
  603. test r6, r6
  604. jz .skipblock
  605. DC_ADD_MMX2_INIT r2, r3, r6
  606. %if ARCH_X86_64
  607. mov r0d, dword [r1+r5*4]
  608. add r0, [dst2q]
  609. %else
  610. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  611. mov r0, [r0]
  612. add r0, dword [r1+r5*4]
  613. %endif
  614. DC_ADD_MMX2_OP movh, r0, r3, r6
  615. .skipblock
  616. inc r5
  617. add r2, 32
  618. test r5, 3
  619. jnz .nextblock
  620. rep ret
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Chroma add with DC fast path: run the mmx2 plane helper once per
; chroma plane, stepping the dest pointer between calls.
cglobal h264_idct_add8_8_mmx2, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16 ; chroma blocks start at scan8 index 16
add r2, 512 ; ... and at coefficient offset 16*32
%if ARCH_X86_64
mov dst2q, r0 ; keep &dest[0] across the helper calls
%endif
%ifdef PIC
lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmx2_plane
mov r5, 32 ; second chroma plane
add r2, 384
%if ARCH_X86_64
add dst2q, gprsize ; advance to &dest[1]
%else
add r0mp, gprsize
%endif
call h264_idct_add8_mmx2_plane
RET
INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
; DC-only add for an 8x4 pair: takes the DCs of two adjacent 4x4
; blocks ([r2] and [r2+32]) and adds each to its own 4-pixel half of
; 4 rows. m0 = +dc pairs, m1 = -dc pairs, as DC_ADD_MMX2_OP expects.
h264_idct_dc_add8_mmx2:
movd m0, [r2 ] ; 0 0 X D
punpcklwd m0, [r2+32] ; x X d D
paddsw m0, [pw_32]
psraw m0, 6 ; (dc+32)>>6 for both blocks
punpcklwd m0, m0 ; d d D D
pxor m1, m1 ; 0 0 0 0
psubw m1, m0 ; -d-d-D-D
packuswb m0, m1 ; -d-d-D-D d d D D
pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
punpcklwd m0, m0 ; d d d d D D D D
lea r6, [r3*3]
DC_ADD_MMX2_OP movq, r0, r3, r6
ret
ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
; iDCT+add of two horizontally adjacent 4x4 blocks at once: each xmm
; row holds block0's row in the low half and block1's in the high half.
x264_add8x4_idct_sse2:
movq m0, [r2+ 0]
movq m1, [r2+ 8]
movq m2, [r2+16]
movq m3, [r2+24]
movhps m0, [r2+32] ; second block into high halves
movhps m1, [r2+40]
movhps m2, [r2+48]
movhps m3, [r2+56]
IDCT4_1D w,0,1,2,3,4,5 ; row pass, both blocks in parallel
TRANSPOSE2x4x4W 0,1,2,3,4 ; transpose each 4x4 independently
paddw m0, [pw_32] ; rounding before the final >>6
IDCT4_1D w,0,1,2,3,4,5 ; column pass
pxor m7, m7
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
lea r0, [r0+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
ret
; One unrolled iteration of add16_sse2: handle one 8x4 block pair.
; %1 = iteration index (0..7), %2 = byte offset into nnzc for the
; 16-bit nnz pair of these two blocks.
%macro add16_sse2_cycle 2
movzx r0, word [r4+%2] ; nnz of both blocks at once
test r0, r0
jz .cycle%1end
mov r0d, dword [r1+%1*8] ; block_offset of the first block
%if ARCH_X86_64
add r0, r5 ; r5 = saved dst base
%else
add r0, r0m
%endif
call x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
add r2, 64 ; two blocks' worth of coefficients
%endif
%endmacro
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; 16 luma blocks as 8 pairwise 8x4 iDCTs; r0 is clobbered per cycle,
; so x86-64 keeps the dst base in r5 (x86-32 reloads it from r0m).
cglobal h264_idct_add16_8_sse2, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
mov r5, r0 ; save dst base
%endif
; unrolling of the loop leads to an average performance gain of
; 20-25%
; (offsets below index the 16-bit nnz pairs in scan8 layout)
add16_sse2_cycle 0, 0xc
add16_sse2_cycle 1, 0x14
add16_sse2_cycle 2, 0xe
add16_sse2_cycle 3, 0x16
add16_sse2_cycle 4, 0x1c
add16_sse2_cycle 5, 0x24
add16_sse2_cycle 6, 0x1e
add16_sse2_cycle 7, 0x26
RET
; One unrolled iteration of add16intra_sse2: one 8x4 block pair.
; nnz!=0 => full pairwise iDCT; else if either DC is nonzero => DC-only
; add via h264_idct_dc_add8_mmx2. %1 = iteration index, %2 = nnzc
; byte offset of the pair.
%macro add16intra_sse2_cycle 2
movzx r0, word [r4+%2]
test r0, r0
jz .try%1dc
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
add r0, r7 ; r7 = saved dst base
%else
add r0, r0m
%endif
call x264_add8x4_idct_sse2
jmp .cycle%1end
.try%1dc
movsx r0, word [r2 ]
or r0w, word [r2+32] ; either block's DC nonzero?
jz .cycle%1end
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
add r0, r7
%else
add r0, r0m
%endif
call h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
add r2, 64
%endif
%endmacro
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Intra variant of add16_sse2; dst base kept in r7 on x86-64.
cglobal h264_idct_add16intra_8_sse2, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
mov r7, r0 ; save dst base
%endif
add16intra_sse2_cycle 0, 0xc
add16intra_sse2_cycle 1, 0x14
add16intra_sse2_cycle 2, 0xe
add16intra_sse2_cycle 3, 0x16
add16intra_sse2_cycle 4, 0x1c
add16intra_sse2_cycle 5, 0x24
add16intra_sse2_cycle 6, 0x1e
add16intra_sse2_cycle 7, 0x26
RET
; One unrolled iteration of add8_sse2: one chroma 8x4 block pair.
; %1 = iteration index (0..3; two per chroma plane), %2 = nnzc byte
; offset. dst comes from the uint8_t** dest array ([r7] on x86-64,
; via r0m on x86-32).
%macro add8_sse2_cycle 2
movzx r0, word [r4+%2]
test r0, r0
jz .try%1dc
%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
call x264_add8x4_idct_sse2
jmp .cycle%1end
.try%1dc
movsx r0, word [r2 ]
or r0w, word [r2+32] ; either block's DC nonzero?
jz .cycle%1end
%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
call h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 == 1
add r2, 384+64 ; jump from first to second chroma plane
%elif %1 < 3
add r2, 64
%endif
%endmacro
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
; DCTELEM *block, int stride, const uint8_t nnzc[6*8])
; Chroma add: two 8x4 pair cycles per plane, advancing the dest
; pointer between planes.
cglobal h264_idct_add8_8_sse2, 5, 7 + ARCH_X86_64, 8
add r2, 512 ; chroma coefficients start at 16*32
%if ARCH_X86_64
mov r7, r0 ; keep &dest[0]
%endif
add8_sse2_cycle 0, 0x34
add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
add r7, gprsize ; advance to &dest[1]
%else
add r0mp, gprsize
%endif
add8_sse2_cycle 2, 0x5c
add8_sse2_cycle 3, 0x64
RET
;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
; One 1-D pass of the 4-point Hadamard (Walsh) transform on registers
; %1-%4, using %5 as scratch; SWAP restores row order afterwards.
%macro WALSH4_1D 5
SUMSUB_BADC w, %4, %3, %2, %1, %5
SUMSUB_BADC w, %4, %2, %3, %1, %5
SWAP %1, %4, %3
%endmacro
; Dequantize 8 words held in %1 and %2: widen to dwords (interleave
; with pw_1), multiply-accumulate with the broadcast qmul+rounding
; value from t3d, shift right by %3, and pack back to words.
; Clobbers m4, m5, m7.
%macro DEQUANT_MMX 3
mova m7, [pw_1]
mova m4, %1
; punpck*wd with pw_1 pairs each coeff with 1, so pmaddwd computes
; coeff*qmul + 1*rounding (rounding lives in t3d's high word).
punpcklwd %1, m7
punpckhwd m4, m7
mova m5, %2
punpcklwd %2, m7
punpckhwd m5, m7
movd m7, t3d ; t3d = qmul | (rounding << 16)
punpckldq m7, m7 ; broadcast to both dwords
pmaddwd %1, m7
pmaddwd %2, m7
pmaddwd m4, m7
pmaddwd m5, m7
psrad %1, %3
psrad %2, %3
psrad m4, %3
psrad m5, %3
packssdw %1, m4
packssdw %2, m5
%endmacro
; Scatter the 4 words of mm register %1 to output slots %2..%5, each at
; t2 + slot*32 (one DC per 4x4 block, blocks 32 bytes apart).
; Clobbers %1, t0, t1.
%macro STORE_WORDS_MMX 5
movd t0d, %1
psrlq %1, 32
movd t1d, %1
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16 ; high word of each dword
shr t1d, 16
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
%endmacro
; Dequantize all 16 DC coefficients (in m0-m3) with shift %1 and
; scatter them to the per-block DC slots via STORE_WORDS_MMX.
%macro DEQUANT_STORE_MMX 1
DEQUANT_MMX m0, m1, %1
STORE_WORDS_MMX m0, 0, 1, 4, 5
STORE_WORDS_MMX m1, 2, 3, 6, 7
DEQUANT_MMX m2, m3, %1
STORE_WORDS_MMX m2, 8, 9, 12, 13
STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro
; Scatter the 8 words of xmm register %1 to output slots %2..%9, each
; at t2 + slot*32. SSE counterpart of STORE_WORDS_MMX.
; Clobbers %1, t0, t1.
%macro STORE_WORDS_SSE 9
movd t0d, %1
psrldq %1, 4
movd t1d, %1
psrldq %1, 4
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
; second half of the register
movd t0d, %1
psrldq %1, 4
movd t1d, %1
mov [t2+%6*32], t0w
mov [t2+%8*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%7*32], t0w
mov [t2+%9*32], t1w
%endmacro
; SSE2 dequant+store of all 16 DC coefficients: move m0-m3 (mm regs)
; into xmm, widen with pw_1, pmaddwd with the broadcast qmul+rounding
; from t3d, shift by %1, pack, and scatter via STORE_WORDS_SSE.
%macro DEQUANT_STORE_SSE2 1
movd xmm4, t3d ; qmul | (rounding << 16)
movq xmm5, [pw_1]
pshufd xmm4, xmm4, 0 ; broadcast to all dwords
movq2dq xmm0, m0
movq2dq xmm1, m1
movq2dq xmm2, m2
movq2dq xmm3, m3
; pair each coeff with 1 so pmaddwd adds the rounding term
punpcklwd xmm0, xmm5
punpcklwd xmm1, xmm5
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm5
pmaddwd xmm0, xmm4
pmaddwd xmm1, xmm4
pmaddwd xmm2, xmm4
pmaddwd xmm3, xmm4
psrad xmm0, %1
psrad xmm1, %1
psrad xmm2, %1
psrad xmm3, %1
packssdw xmm0, xmm1
packssdw xmm2, xmm3
STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro
; Emit h264_luma_dc_dequant_idct_%1 (%1 = mmx/sse2, %2 = xmm count):
; 4x4 Hadamard transform of the 16 luma DC coefficients, then
; dequantize and scatter each DC to its block's slot in the output.
; C signature: (DCTELEM *output, DCTELEM *input, int qmul).
%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3,4,%2
; manually spill XMM registers for Win64 because
; the code here is initialized with INIT_MMX
WIN64_SPILL_XMM %2
movq m3, [r1+24]
movq m2, [r1+16]
movq m1, [r1+ 8]
movq m0, [r1+ 0]
WALSH4_1D 0,1,2,3,4 ; rows
TRANSPOSE4x4W 0,1,2,3,4
WALSH4_1D 0,1,2,3,4 ; columns
; shift, tmp, output, qmul
%if WIN64
DECLARE_REG_TMP 0,3,1,2
; we can't avoid this, because r0 is the shift register (ecx) on win64
xchg r0, t2
%elif ARCH_X86_64
DECLARE_REG_TMP 3,1,0,2
%else
DECLARE_REG_TMP 1,3,0,2
%endif
; qmul <= 0x7fff fits the 16-bit pmaddwd path directly.
cmp t3d, 32767
jg .big_qmul
add t3d, 128 << 16 ; rounding term in the high word
%ifidn %1,mmx
DEQUANT_STORE_MMX 8
%else
DEQUANT_STORE_SSE2 8
%endif
RET
.big_qmul:
; qmul too large for 16-bit multiply: pre-shift it down by up to 7
; bits and reduce the final shift accordingly.
bsr t0d, t3d
add t3d, 128 << 16
mov t1d, 7
cmp t0d, t1d
cmovg t0d, t1d ; clamp pre-shift to 7
inc t1d
shr t3d, t0b
sub t1d, t0d ; remaining shift = 8 - pre-shift
%ifidn %1,mmx
movd m6, t1d
DEQUANT_STORE_MMX m6
%else
movd xmm6, t1d
DEQUANT_STORE_SSE2 xmm6
%endif
RET
%endmacro
INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7