;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
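
; Each scan8 entry is an x + y*8 position in the caller's non-zero-count
; cache, which is laid out 8 entries per row; the add16/add8/add4 loops below
; fetch scan8[blockidx] and use it to index the nnzc argument when deciding
; whether a 4x4 block has any coefficients to transform.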

%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
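
; IDCT4_1D (from x86util.asm) is the 4-point H.264 core transform butterfly:
;   z0 = x0 + x2          z1 = x0 - x2
;   z2 = (x1 >> 1) - x3   z3 = x1 + (x3 >> 1)
;   out = { z0+z3, z1+z2, z1-z2, z0-z3 }
; IDCT4_ADD applies it to the rows, transposes, adds the rounding bias of 32
; (for the final >>6) to the first row, applies it to the columns, and
; STORE_DIFFx2 shifts right by 6 and adds the result to the destination
; pixels with unsigned saturation.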

INIT_MMX mmx
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3
    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7
    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5
    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5
    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
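
; One 1D pass of the 8-point H.264 transform: the first half computes the odd
; outputs from inputs 1/3/5/7 (m1/m3/m5/m7), the second half the even outputs
; from inputs 0/2/4/6, with rows 0 and 4 taken from the memory operands %1/%2
; because not everything fits in eight registers at once; the final
; SUMSUB_BA/SWAP sequence merges the two halves into outputs 0-7.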

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova      [%2 ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova      [%2 ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7
    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2 ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
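
; An MMX register holds only four coefficients, so the 8x8 transform is done
; in 4-column halves: IDCT8_ADD_MMX_START runs the first 1D pass on one half
; and transposes it into a scratch buffer, IDCT8_ADD_MMX_END runs the second
; pass from that buffer and adds the result to dst. The callers reserve a
; 128-byte stack scratch buffer (padded for alignment) and add the rounding
; bias of 32 to the DC coefficient before the first pass.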

INIT_MMX mmx
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad
    add  word  [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2
    ADD         rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]
%if ARCH_X86_64 == 0
    mova      [%2 ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova      [%2 ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif
    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2 ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMXEXT_INIT 2-3
%if %0 == 2
    movsx        %1, word [%1]
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
%else
    add          %3, 32
    sar          %3, 6
    movd         m0, %3d
    lea          %3, [%2*3]
%endif
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
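
; DC-only blocks: DC_ADD_MMXEXT_INIT computes dc = (block[0] + 32) >> 6 (the
; three-operand form takes a coefficient the caller already sign-extended
; into %3), then builds one byte vector with the clipped positive part and
; one with the clipped negative part; DC_ADD_MMXEXT_OP adds them to four rows
; of pixels with paddusb/psubusb, i.e. a saturated signed add on unsigned
; pixel data.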

INIT_MMX mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 3, 3, 0
    DC_ADD_MMXEXT_INIT r1, r2
    DC_ADD_MMXEXT_OP   movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 3, 0
    DC_ADD_MMXEXT_INIT r1, r2
    DC_ADD_MMXEXT_OP   mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP   mova, r0, r2, r1
    RET

INIT_MMX mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add  word  [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMXEXT_INIT r2, r3, r6
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP   movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMXEXT_INIT r2, r3, r6
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP   movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMXEXT_INIT r2, r3, r6
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP   mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP   mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add  word  [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_XMM sse2
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    DC_ADD_MMXEXT_INIT r2, r3, r6
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP   mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP   mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMXEXT_INIT r2, r3, r6
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP   movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
;                          int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call         h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]     ;  0 0 X D
    punpcklwd    m0, [r2+32]     ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0          ;  d d D D
    pxor         m1, m1          ;  0 0 0 0
    psubw        m1, m0          ; -d-d-D-D
    packuswb     m0, m1          ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA    ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0          ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret
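
; Adds the DC terms of two horizontally adjacent 4x4 blocks in one call: the
; DCs of the current block and of the next one (32 bytes further on) are
; rounded, packed into positive/negative byte vectors and applied to an
; 8-pixel-wide, 4-row area. Used by the SSE2 add16intra/add8 loops below.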

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W  0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
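
; Two horizontally adjacent 4x4 blocks (32 bytes apart in the coefficient
; buffer) are packed into the low and high halves of each xmm register,
; transformed together (TRANSPOSE2x4x4W transposes the two 4x4 sub-blocks
; independently) and written out as 8-pixel-wide rows.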

%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r5
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro
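
; Each cycle tests one 16-bit word of the nnzc cache, i.e. two horizontally
; adjacent scan8 entries at once; the offsets passed below (0xc, 0x14, ...)
; are the scan8 byte positions of the even-numbered 4x4 blocks. If either
; block of a pair is coded, both are transformed by h264_add8x4_idct_sse2.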

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call         h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro
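
; One butterfly pass of the 4-point Hadamard (Walsh) transform;
; h264_luma_dc_dequant_idct below applies it to the rows, transposes, and
; applies it again to the columns of the 4x4 luma DC block before
; dequantization.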

%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7
    punpckhwd    m4, m7
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7
    pmaddwd      %1, m7
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4
    packssdw     %2, m5
%endmacro

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro
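
; Scatters the transformed DC values back into the block array: destination
; words are 32 bytes (one 16-coefficient block) apart, so each value lands in
; the DC slot of its 4x4 block.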

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0
    movq2dq    xmm0, m0
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7
    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro
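
; Dequantization: each DC value is interleaved with a 1 from pw_1 and fed to
; pmaddwd against a dword holding qmul in its low word and the rounding bias
; (128) in its high word, giving dc*qmul + 128 per lane; the result is then
; shifted right arithmetically by %1 (8 in the normal case) and stored via
; STORE_WORDS.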

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4
    ; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif
    cmp         t3d, 32767
    jg .big_qmul
    add         t3d, 128 << 16
    DEQUANT_STORE 8
    RET
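
    ; qmul values above 32767 do not fit in a signed 16-bit pmaddwd operand,
    ; so the multiplier (and the rounding bias with it) is pre-shifted right
    ; (by up to 7 bits, derived from bsr) and the final shift is reduced by
    ; the same amount, keeping the overall (dc*qmul + 128) >> 8 scaling.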
.big_qmul:
    bsr         t0d, t3d
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d
    inc         t1d
    shr         t3d, t0b
    sub         t1d, t0d
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7