You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

1073 lines
27KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2-optimized H.264 iDCT
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2003-2008 x264 project
  6. ;*
  7. ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
  8. ;* Loren Merritt <lorenm@u.washington.edu>
  9. ;* Holger Lubitz <hal@duncan.ol.sub.de>
  10. ;* Min Chen <chenm001.163.com>
  11. ;*
  12. ;* This file is part of Libav.
  13. ;*
  14. ;* Libav is free software; you can redistribute it and/or
  15. ;* modify it under the terms of the GNU Lesser General Public
  16. ;* License as published by the Free Software Foundation; either
  17. ;* version 2.1 of the License, or (at your option) any later version.
  18. ;*
  19. ;* Libav is distributed in the hope that it will be useful,
  20. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  22. ;* Lesser General Public License for more details.
  23. ;*
  24. ;* You should have received a copy of the GNU Lesser General Public
  25. ;* License along with Libav; if not, write to the Free Software
  26. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  27. ;*****************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; scan8 table: byte offset (x + y*8) of each 4x4 block inside the
; decoder's nnzc[6*8] non-zero-count cache, indexed by block number
; (16 luma entries first, then the chroma entries used by the add8 code).
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db 4+11*8, 5+11*8, 4+12*8, 5+12*8
           db 6+11*8, 7+11*8, 6+12*8, 7+12*8
           db 4+13*8, 5+13*8, 4+14*8, 5+14*8
           db 6+13*8, 7+13*8, 6+14*8, 7+14*8

%ifdef PIC
; PIC builds cannot address scan8_mem with an absolute displacement, so
; each function reserves one extra GPR (picregq) and points it at the table.
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32   ; packed int16 {32,...}: rounding bias before the final >>6
cextern pw_1    ; packed int16 {1,...}: widening constant for pmaddwd

SECTION .text
  52. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
; IDCT4_ADD dst, block, stride
; 4x4 H.264 inverse transform of one coefficient block; the result is
; added to the destination pixels (clipped to 0..255 by STORE_DIFFx2)
; and the coefficient block is cleared afterwards.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5   ; 1-D transform (rows)
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6                 ; add rounding bias 32 before the final >>6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5   ; 1-D transform (columns)

    pxor         m7, m7
    movq    [%2+ 0], m7                 ; clear the consumed coefficient block
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3  ; rows 0-1: dst = clip(dst + (res>>6))
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3  ; rows 2-3
%endmacro
INIT_MMX mmx
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
; Thin entry point: idct+add of a single 4x4 block.
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD r0, r1, r2
    RET
; IDCT8_1D row0mem, row4mem
; One 1-D pass of the 8-point H.264 inverse transform.
; In:  m1,m2,m3,m5,m6,m7 = coefficient rows 1,2,3,5,6,7;
;      rows 0 and 4 come from the memory operands %1/%2 and are loaded
;      late because only eight mm/xmm registers are available here.
; Out: m0..m7 = transformed rows 0..7 (order fixed up by the final SWAP).
%macro IDCT8_1D 2
    ; odd half: combine rows 1,3,5,7 (shift-by-1 approximations of the
    ; H.264 odd butterfly; see spec 8.5.10 inverse 8x8 transform)
    mova m0, m1
    psraw m1, 1
    mova m4, m5
    psraw m4, 1
    paddw m4, m5
    paddw m1, m0
    paddw m4, m7
    paddw m1, m5
    psubw m4, m0
    paddw m1, m3
    psubw m0, m3
    psubw m5, m3
    psraw m3, 1
    paddw m0, m7
    psubw m5, m7
    psraw m7, 1
    psubw m0, m3
    psubw m5, m7

    ; second stage of the odd half (>>2 cross terms)
    mova m7, m1
    psraw m1, 2
    mova m3, m4
    psraw m3, 2
    paddw m3, m0
    psraw m0, 2
    paddw m1, m5
    psraw m5, 2
    psubw m0, m4
    psubw m7, m5

    ; even half: rows 2 and 6
    mova m5, m6
    psraw m6, 1
    mova m4, m2
    psraw m4, 1
    paddw m6, m2
    psubw m4, m5

    ; rows 0 and 4 from memory, then butterflies merging even and odd halves
    mova m2, %1
    mova m5, %2
    SUMSUB_BA w, 5, 2
    SUMSUB_BA w, 6, 5
    SUMSUB_BA w, 4, 2
    SUMSUB_BA w, 7, 6
    SUMSUB_BA w, 0, 4
    SUMSUB_BA w, 3, 2
    SUMSUB_BA w, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

; IDCT8_1D_FULL block
; Load rows 1,2,3,5,6,7 of an 8x8 coefficient block (row stride = 16 bytes)
; and run IDCT8_1D with rows 0 and 4 referenced directly from memory.
%macro IDCT8_1D_FULL 1
    mova m7, [%1+112]
    mova m6, [%1+ 96]
    mova m5, [%1+ 80]
    mova m3, [%1+ 48]
    mova m2, [%1+ 32]
    mova m1, [%1+ 16]
    IDCT8_1D [%1], [%1+ 64]
%endmacro
; IDCT8_ADD_MMX_START block, dstblock
; First (row) pass of the MMX 8x8 idct on a 4-column slice: transform,
; then transpose the 8x4 intermediate as two 4x4 quadrants into the
; temp buffer at %2 (row stride 16 bytes).
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova [%1], m7              ; spill m7: TRANSPOSE4x4W needs a scratch reg
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova m7, [%1]
    mova [%2 ], m0             ; upper 4x4 quadrant
    mova [%2+16], m1
    mova [%2+32], m2
    mova [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova [%2+ 8], m4           ; lower 4x4 quadrant
    mova [%2+24], m5
    mova [%2+40], m6
    mova [%2+56], m7
%endmacro

; IDCT8_ADD_MMX_END dst, tmpblock, stride [, origblock]
; Second (column) pass on a 4-column slice and store: dst = clip(dst + res>>6)
; across 8 rows. With a 4th argument, also zero the original 128-byte
; coefficient block (done once, on the first of the two slices).
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova [%2 ], m5             ; spill rows 5-7: STORE_DIFFx2 needs scratch regs
    mova [%2+16], m6
    mova [%2+32], m7

    pxor m7, m7
%if %0 == 4
    movq [%4+ 0], m7           ; clear the consumed input block
    movq [%4+ 8], m7
    movq [%4+ 16], m7
    movq [%4+ 24], m7
    movq [%4+ 32], m7
    movq [%4+ 40], m7
    movq [%4+ 48], m7
    movq [%4+ 56], m7
    movq [%4+ 64], m7
    movq [%4+ 72], m7
    movq [%4+ 80], m7
    movq [%4+ 88], m7
    movq [%4+ 96], m7
    movq [%4+104], m7
    movq [%4+112], m7
    movq [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3   ; rows 0-1
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3   ; rows 2-3
    mova m0, [%2 ]             ; reload spilled rows 5-7
    mova m1, [%2+16]
    mova m2, [%2+32]
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3   ; rows 4-5
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3   ; rows 6-7
%endmacro
INIT_MMX mmx
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
; 8x8 idct+add done as two 4-column slices through a 128-byte stack temp.
cglobal h264_idct8_add_8, 3, 4, 0
    ; 128 bytes of temp space, with rsp brought to 8-byte alignment
    %assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    add word [r1], 32          ; fold the +32 rounding bias into the DC coeff
    IDCT8_ADD_MMX_START r1 , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea r3, [r0+4]             ; dst pointer for the right 4-column half
    IDCT8_ADD_MMX_END r0 , rsp, r2, r1   ; 4th arg: also clears the input block
    IDCT8_ADD_MMX_END r3 , rsp+8, r2

    ADD rsp, pad
    RET
; IDCT8_ADD_SSE dst, block, stride, tmpreg
; Full 8x8 idct+add in XMM registers. On x86-64, rows 0/4 are parked in
; m8/m9 across the second pass; on x86-32 (only 8 xmm regs) they are
; spilled into the block buffer instead. Clears the 128-byte block.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=GPR scratch (set to 3*stride)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2           ; first (row) pass
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]  ; memory-assisted transpose
%endif
    paddw m0, [pw_32]          ; rounding bias before the final >>6

%if ARCH_X86_64 == 0
    mova [%2 ], m0             ; spill rows 0/4 for the second pass
    mova [%2+16], m4
    IDCT8_1D [%2], [%2+ 16]
    mova [%2 ], m6             ; spill rows 6/7 for the stores below
    mova [%2+16], m7
%else
    SWAP 0, 8                  ; keep rows 0/4 in m8/m9 instead of memory
    SWAP 4, 9
    IDCT8_1D m8, m9
    SWAP 6, 8                  ; rows 6/7 parked in m8/m9
    SWAP 7, 9
%endif

    pxor m7, m7
    lea %4, [%3*3]             ; %4 = 3*stride for single-instruction addressing
    STORE_DIFF m0, m6, m7, [%1 ]
    STORE_DIFF m1, m6, m7, [%1+%3 ]
    STORE_DIFF m2, m6, m7, [%1+%3*2]
    STORE_DIFF m3, m6, m7, [%1+%4 ]

%if ARCH_X86_64 == 0
    mova m0, [%2 ]             ; reload rows 6/7
    mova m1, [%2+16]
%else
    SWAP 0, 8
    SWAP 1, 9
%endif
    mova [%2+ 0], m7           ; clear the consumed coefficient block
    mova [%2+ 16], m7
    mova [%2+ 32], m7
    mova [%2+ 48], m7
    mova [%2+ 64], m7
    mova [%2+ 80], m7
    mova [%2+ 96], m7
    mova [%2+112], m7
    lea %1, [%1+%3*4]
    STORE_DIFF m4, m6, m7, [%1 ]
    STORE_DIFF m5, m6, m7, [%1+%3 ]
    STORE_DIFF m0, m6, m7, [%1+%3*2]
    STORE_DIFF m1, m6, m7, [%1+%4 ]
%endmacro

INIT_XMM sse2
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
; DC_ADD_MMXEXT_INIT dcreg, stridereg
; Prepare a DC-only add: dc = (dc + 32) >> 6, then broadcast
;   m0 = packed bytes of  dc (clamped to >= 0 by packuswb)
;   m1 = packed bytes of -dc (clamped likewise)
; so paddusb m0 / psubusb m1 together implement clip(dst + dc, 0, 255).
; Also repurposes %1 as 3*stride for DC_ADD_MMXEXT_OP's addressing.
%macro DC_ADD_MMXEXT_INIT 2
    add %1, 32
    sar %1, 6
    movd m0, %1d
    lea %1, [%2*3]             ; %1 now holds 3*stride
    pshufw m0, m0, 0           ; broadcast dc to all 4 words
    pxor m1, m1
    psubw m1, m0               ; m1 = -dc (words)
    packuswb m0, m0            ; max(dc, 0) as bytes
    packuswb m1, m1            ; max(-dc, 0) as bytes
%endmacro

; DC_ADD_MMXEXT_OP mov_op, dst, stride, stride3
; Apply the prepared DC to 4 rows at %2: saturating add of m0 then
; saturating subtract of m1 yields the clipped dst + dc.
%macro DC_ADD_MMXEXT_OP 4
    %1 m2, [%2 ]
    %1 m3, [%2+%3 ]
    %1 m4, [%2+%3*2]
    %1 m5, [%2+%4 ]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    %1 [%2 ], m2
    %1 [%2+%3 ], m3
    %1 [%2+%3*2], m4
    %1 [%2+%4 ], m5
%endmacro
INIT_MMX mmxext
; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
; DC-only fast path: add (dc+32)>>6 to a 4x4 (or 8x8) region and clear the DC.
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx r3, word [r1]        ; r3 = DC coefficient
    mov dword [r1], 0          ; block is consumed: clear it
    DC_ADD_MMXEXT_INIT r3, r2  ; r3 becomes 3*stride
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx r3, word [r1]
    mov dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3  ; rows 0-3 (8 bytes wide)
    lea r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3  ; rows 4-7
    RET
%else
; x86-32: only dst and block are auto-loaded; stride is fetched from the
; stack into r1 once the block pointer is no longer needed.
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx r2, word [r1]
    mov dword [r1], 0
    mov r1, r2m                ; r1 = stride
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx r2, word [r1]
    mov dword [r1], 0
    mov r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif
INIT_MMX mmx
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
; For each of the 16 luma 4x4 blocks: skip if its non-zero count is 0,
; otherwise idct+add at dst + block_offset[i].
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor r5, r5                 ; r5 = block index 0..15
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]  ; r6 = this block's offset in the nnzc cache
    movzx r6, byte [r4+r6]     ; r6 = non-zero count
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]   ; r6 = block_offset[r5]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock:
    inc r5
    add r2, 32                 ; advance to next 4x4 block (16 int16 coeffs)
    cmp r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
; Four 8x8 blocks (indices 0,4,8,12): each nonzero one goes through the
; two-slice MMX 8x8 idct using a 128-byte stack temp.
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    xor r5, r5                 ; r5 = block index, stepped by 4
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count for this 8x8 block
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    add r6, r0                 ; r6 = dst + block_offset
    add word [r2], 32          ; fold rounding bias into DC
    IDCT8_ADD_MMX_START r2 , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END r6 , rsp, r3, r2   ; left half; also clears the block
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6+4]          ; right half of the 8x8 dst region
    IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock:
    add r5, 4
    add r2, 128                ; next 8x8 coefficient block (64 int16)
    cmp r5, 16
    jl .nextblock

    ADD rsp, pad
    RET
INIT_MMX mmxext
; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride, const uint8_t nnzc[6*8])
; Like the MMX version, but when nnzc == 1 and the DC is nonzero it takes
; the cheap DC-only path instead of a full 4x4 idct.
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor r5, r5                 ; r5 = block index 0..15
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc                 ; more than one coeff: full idct
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc                  ; single coeff but DC is zero: full idct path
    mov word [r2], 0           ; consume the DC
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
; x86-32 has no spare reg: temporarily reuse r1 (block_offset) as dst2,
; restored from the stack (r1m) afterwards.
%define dst2q r1
%define dst2d r1d
%endif
    mov dst2d, dword [r1+r5*4]
    lea dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov r1, r1m
%endif
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov r6d, dword [r1+r5*4]
    add r6, r0
    IDCT4_ADD r6, r2, r3
.skipblock:
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
INIT_MMX mmx
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride, const uint8_t nnzc[6*8])
; Intra variant: a block is processed when EITHER its non-zero count or
; its DC coefficient is nonzero (intra blocks can have a DC-only residual
; not reflected in nnzc).
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor r5, r5                 ; r5 = block index 0..15
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    or r6w, word [r2]          ; also consider the DC coefficient
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    add r6, r0
    IDCT4_ADD r6, r2, r3
.skipblock:
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
INIT_MMX mmxext
; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6*8])
; Intra variant with DC fast path: nnzc nonzero => full 4x4 idct;
; nnzc zero but DC nonzero => cheap DC-only add.
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor r5, r5                 ; r5 = block index 0..15
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    test r6, r6
    jz .try_dc
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx r6, word [r2]        ; DC coefficient
    test r6, r6
    jz .skipblock
    mov word [r2], 0           ; consume the DC
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
; x86-32: reuse r1 (block_offset) as dst2, restored from r1m below
%define dst2q r1
%define dst2d r1d
%endif
    mov dst2d, dword [r1+r5*4]
    add dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov r1, r1m
%endif
.skipblock:
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6*8])
; Four 8x8 blocks with a DC-only fast path (nnzc == 1 and DC nonzero);
; otherwise the full two-slice MMX 8x8 idct through a stack temp.
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    xor r5, r5                 ; r5 = block index, stepped by 4
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
    ; DC-only: (dc+32)>>6 added over the whole 8x8 region
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
; x86-32: reuse r1 (block_offset) as dst2, restored from r1m below
%define dst2q r1
%define dst2d r1d
%endif
    mov dst2d, dword [r1+r5*4]
    lea dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6  ; rows 0-3
    lea dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6  ; rows 4-7
%if ARCH_X86_64 == 0
    mov r1, r1m
%endif
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock

    ADD rsp, pad
    RET
.no_dc:
    mov r6d, dword [r1+r5*4]
    add r6, r0
    add word [r2], 32          ; fold rounding bias into DC
    IDCT8_ADD_MMX_START r2 , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END r6 , rsp, r3, r2   ; left half; also clears the block
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6+4]
    IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock:
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock

    ADD rsp, pad
    RET
INIT_XMM sse2
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
; Four 8x8 blocks: DC-only fast path runs in MMX registers, the full
; 8x8 idct in XMM. INIT_MMX/INIT_XMM below only retarget the register
; templates mid-function; cpuname keeps the sse2 suffix.
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor r5, r5                 ; r5 = block index, stepped by 4
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
INIT_MMX cpuname
    ; DC-only path (MMX regs)
    mov word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
; x86-32: reuse r1 (block_offset) as dst2, restored from r1m below
%define dst2q r1
%define dst2d r1d
%endif
    mov dst2d, dword [r1+r5*4]
    add dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6  ; rows 0-3
    lea dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6  ; rows 4-7
%if ARCH_X86_64 == 0
    mov r1, r1m
%endif
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    ; full 8x8 idct path (XMM regs)
    mov dst2d, dword [r1+r5*4]
    add dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov r1, r1m
%endif
.skipblock:
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    REP_RET
INIT_MMX mmx
; Internal helper: process one chroma plane's four 4x4 blocks.
; In: r5 = starting block index (16 for U, 32 for V), r1 = block_offset,
;     r2 = coeffs, r3 = stride, r4 = nnzc; dst fetched from the uint8_t**
;     dest array (dst2q on x86-64, caller's r0m on x86-32).
; Loops while (r5 & 3) != 0, i.e. exactly 4 blocks per call.
h264_idct_add8_mmx_plane:
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    or r6w, word [r2]          ; intra-style: DC counts too
    test r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [dst2q]            ; r0 = dest[plane] + block_offset
%else
    mov r0, r1m                ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
.skipblock:
    inc r5
    add r2, 32
    test r5, 3                 ; stop after 4 blocks (r5 reaches a multiple of 4)
    jnz .nextblock
    rep ret
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       int16_t *block, int stride, const uint8_t nnzc[6*8])
; Chroma idct+add: U plane (blocks 16..19) then V plane (blocks 32..35).
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov r5, 16                 ; first U block index
    add r2, 512                ; chroma coeffs start at block 16 (16*32 bytes)
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov dst2q, r0              ; keep the dest array pointer
%endif
    call h264_idct_add8_mmx_plane
    mov r5, 32                 ; first V block index
    add r2, 384                ; skip to block 32 (12 more blocks)
%if ARCH_X86_64
    add dst2q, gprsize         ; advance to dest[1] (V plane)
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET
; Internal helper: one chroma plane (4 blocks), mmxext version with a
; DC-only fast path when nnzc is zero but the DC coefficient is not.
; Same register contract as h264_idct_add8_mmx_plane.
h264_idct_add8_mmxext_plane:
.nextblock:
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]     ; non-zero count
    test r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [dst2q]            ; r0 = dest[plane] + block_offset
%else
    mov r0, r1m                ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
    inc r5
    add r2, 32
    test r5, 3                 ; 4 blocks per call
    jnz .nextblock
    rep ret
.try_dc:
    movsx r6, word [r2]        ; DC coefficient
    test r6, r6
    jz .skipblock
    mov word [r2], 0           ; consume the DC
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [dst2q]
%else
    mov r0, r1m                ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
;                          int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov r5, 16                 ; first U block index
    add r2, 512                ; chroma coeffs start at block 16
%if ARCH_X86_64
    mov dst2q, r0
%endif
%ifdef PIC
    lea picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov r5, 32                 ; first V block index
    add r2, 384
%if ARCH_X86_64
    add dst2q, gprsize         ; dest[1] (V plane)
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
; Adds the DCs of two horizontally adjacent 4x4 blocks ([r2] and [r2+32])
; to an 8x4 pixel region in one shot, clearing both DC coefficients.
; Builds m0 = packed +dc bytes, m1 = packed -dc bytes (per block half)
; for the saturating clip done by DC_ADD_MMXEXT_OP.
h264_idct_dc_add8_mmxext:
    movd m0, [r2 ]             ; 0 0 X D
    mov word [r2+ 0], 0        ; consume first block's DC
    punpcklwd m0, [r2+32]      ; x X d D
    mov word [r2+32], 0        ; consume second block's DC
    paddsw m0, [pw_32]         ; rounding bias before >>6
    psraw m0, 6
    punpcklwd m0, m0           ; d d D D
    pxor m1, m1                ; 0 0 0 0
    psubw m1, m0               ; -d-d-D-D
    packuswb m0, m1            ; -d-d-D-D d d D D
    pshufw m1, m0, 0xFA        ; -d-d-d-d-D-D-D-D
    punpcklwd m0, m0           ; d d d d D D D D
    lea r6, [r3*3]             ; 3*stride for DC_ADD_MMXEXT_OP
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret
ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
; Inverse-transforms two horizontally adjacent 4x4 blocks ([r2] and
; [r2+32]) at once — one in each xmm half — and adds the result to an
; 8x4 pixel region. Clears both coefficient blocks.
h264_add8x4_idct_sse2:
    movq m0, [r2+ 0]           ; low halves: first block's rows
    movq m1, [r2+ 8]
    movq m2, [r2+16]
    movq m3, [r2+24]
    movhps m0, [r2+32]         ; high halves: second block's rows
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5     ; row pass, both blocks in parallel
    TRANSPOSE2x4x4W 0,1,2,3,4  ; per-half 4x4 transpose
    paddw m0, [pw_32]          ; rounding bias before the final >>6
    IDCT4_1D w,0,1,2,3,4,5     ; column pass
    pxor m7, m7
    mova [r2+ 0], m7           ; clear both coefficient blocks
    mova [r2+16], m7
    mova [r2+32], m7
    mova [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3  ; rows 0-1
    lea r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3  ; rows 2-3
    ret
; add16_sse2_cycle pairnum, nnzc_offset
; Handle one pair of adjacent 4x4 blocks: %2 is the byte offset of the
; pair's two nnzc entries, read as a single word (nonzero word => at
; least one of the two blocks has coefficients).
%macro add16_sse2_cycle 2
    movzx r0, word [r4+%2]     ; combined nnzc of the pair
    test r0, r0
    jz .cycle%1end
    mov r0d, dword [r1+%1*8]   ; block_offset of the pair's first block
%if ARCH_X86_64
    add r0, r5                 ; r5 holds dst (r0 is clobbered per cycle)
%else
    add r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add r2, 64                 ; advance past the two 4x4 blocks
%endif
%endmacro

; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov r5, r0                 ; preserve dst; r0 is used as scratch below
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET
; add16intra_sse2_cycle pairnum, nnzc_offset
; Intra variant of add16_sse2_cycle: if the pair's nnzc word is zero,
; fall back to a DC-only add when either block's DC is nonzero.
%macro add16intra_sse2_cycle 2
    movzx r0, word [r4+%2]     ; combined nnzc of the pair
    test r0, r0
    jz .try%1dc
    mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add r0, r7                 ; r7 holds dst
%else
    add r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx r0, word [r2 ]       ; DC of block 0
    or r0w, word [r2+32]       ; combined with DC of block 1
    jz .cycle%1end
    mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add r0, r7
%else
    add r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov r7, r0                 ; preserve dst; r0 is used as scratch below
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET
; add8_sse2_cycle pairnum, nnzc_offset
; One pair of chroma 4x4 blocks; dst is fetched through the uint8_t**
; dest array (r7 on x86-64, caller's r0m on x86-32). Falls back to a
; DC-only add when nnzc is zero but a DC coefficient is not.
%macro add8_sse2_cycle 2
    movzx r0, word [r4+%2]     ; combined nnzc of the pair
    test r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add r0, [r7]               ; dest[plane] + block_offset
%else
    mov r0, r0m
    mov r0, [r0]
    add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx r0, word [r2 ]       ; DC of block 0
    or r0w, word [r2+32]       ; combined with DC of block 1
    jz .cycle%1end
%if ARCH_X86_64
    mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add r0, [r7]
%else
    mov r0, r0m
    mov r0, [r0]
    add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add r2, 384+64             ; jump from the last U block to the first V block
%elif %1 < 3
    add r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        int16_t *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add r2, 512                ; chroma coeffs start at block 16
%if ARCH_X86_64
    mov r7, r0                 ; keep the dest array pointer
%endif
    add8_sse2_cycle 0, 0x34    ; U plane
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add r7, gprsize            ; advance to dest[1] (V plane)
%else
    add r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c    ; V plane
    add8_sse2_cycle 3, 0x64
    RET
;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

; WALSH4_1D r0,r1,r2,r3,tmp
; One 4-point Walsh-Hadamard transform pass (butterflies only, no scaling).
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

; DEQUANT_MMX reg0, reg1, shift
; Multiply the 8 words in %1/%2 by qmul with 32-bit intermediates:
; each word is interleaved with a 1 from pw_1, so pmaddwd against the
; broadcast t3d pair (qmul in the low word, the rounding bias 128 in the
; high word — see the "add t3d, 128 << 16" at the call sites) computes
; coeff*qmul + 128; the dwords are then shifted by %3 and repacked.
%macro DEQUANT_MMX 3
    mova m7, [pw_1]
    mova m4, %1
    punpcklwd %1, m7           ; (coeff, 1) word pairs
    punpckhwd m4, m7
    mova m5, %2
    punpcklwd %2, m7
    punpckhwd m5, m7
    movd m7, t3d               ; (qmul, bias) word pair
    punpckldq m7, m7           ; broadcast to both dword lanes
    pmaddwd %1, m7             ; coeff*qmul + bias
    pmaddwd %2, m7
    pmaddwd m4, m7
    pmaddwd m5, m7
    psrad %1, %3
    psrad %2, %3
    psrad m4, %3
    psrad m5, %3
    packssdw %1, m4            ; back to 8 words
    packssdw %2, m5
%endmacro
; STORE_WORDS reg, idx0..idx3 [, idx4..idx7]
; Scatter the 16-bit lanes of %1 to [t2 + idx*32], i.e. 16 int16s apart —
; presumably the DC slot of each 4x4 block in the output (TODO confirm
; against the caller's output layout). The SSE form stores 8 words, the
; MMX form 4.
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd t0d, %1
    psrldq %1, 4
    movd t1d, %1
    psrldq %1, 4
    mov [t2+%2*32], t0w        ; low word of each extracted dword...
    mov [t2+%4*32], t1w
    shr t0d, 16                ; ...then its high word
    shr t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd t0d, %1
    psrldq %1, 4
    movd t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr t0d, 16
    shr t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd t0d, %1
    psrlq %1, 32
    movd t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr t0d, 16
    shr t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro
; DEQUANT_STORE shift
; Dequantize the 16 transformed DC words held in m0..m3 by qmul (t3d,
; with the rounding bias in its high word) and scatter them via
; STORE_WORDS. The SSE2 path widens the four MMX regs into two XMM regs
; and processes all 16 values with two pmaddwd chains.
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd xmm4, t3d             ; (qmul, bias) pair
    movq xmm5, [pw_1]
    pshufd xmm4, xmm4, 0       ; broadcast to all dword lanes
    movq2dq xmm0, m0           ; lift the MMX Walsh result into XMM
    movq2dq xmm1, m1
    movq2dq xmm2, m2
    movq2dq xmm3, m3
    punpcklwd xmm0, xmm5       ; (coeff, 1) pairs for pmaddwd
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd xmm0, xmm4         ; coeff*qmul + bias
    pmaddwd xmm1, xmm4
    pmaddwd xmm2, xmm4
    pmaddwd xmm3, xmm4
    psrad xmm0, %1
    psrad xmm1, %1
    psrad xmm2, %1
    psrad xmm3, %1
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
    STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS m0, 0, 1, 4, 5
    STORE_WORDS m1, 2, 3, 6, 7
    DEQUANT_MMX m2, m3, %1
    STORE_WORDS m2, 8, 9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro
; IDCT_DC_DEQUANT num_xmm_regs
; void ff_h264_luma_dc_dequant_idct(int16_t *output, int16_t *input, int qmul)
; 4x4 Walsh-Hadamard transform of the 16 luma DC coefficients followed by
; dequantization by qmul. qmul <= 32767 multiplies directly with shift 8;
; larger qmul values are pre-shifted down (and the final shift reduced to
; compensate) so the 16-bit pmaddwd path cannot overflow.
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq m3, [r1+24]
    movq m2, [r1+16]
    movq m1, [r1+ 8]
    movq m0, [r1+ 0]
    WALSH4_1D 0,1,2,3,4        ; transform rows
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D 0,1,2,3,4        ; transform columns
    ; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif
    cmp t3d, 32767             ; does coeff*qmul fit the 16-bit multiply?
    jg .big_qmul
    add t3d, 128 << 16         ; rounding bias into the high word of t3d
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr t0d, t3d               ; t0 = highest set bit of qmul
    add t3d, 128 << 16
    mov t1d, 7
    cmp t0d, t1d
    cmovg t0d, t1d             ; cap the pre-shift at 7
    inc t1d                    ; t1 = 8
    shr t3d, t0b               ; qmul >>= t0
    sub t1d, t0d               ; final shift = 8 - t0 (compensates the pre-shift)
%if cpuflag(sse2)
    movd xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
; Intentionally INIT_MMX: the sse2 variant is still MMX-based and only
; uses XMM inside DEQUANT_STORE (hence 7 XMM regs to spill on Win64).
INIT_MMX sse2
IDCT_DC_DEQUANT 7