You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1074 lines
27KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2-optimized H.264 iDCT
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2003-2008 x264 project
  6. ;*
  7. ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
  8. ;* Loren Merritt <lorenm@u.washington.edu>
  9. ;* Holger Lubitz <hal@duncan.ol.sub.de>
  10. ;* Min Chen <chenm001.163.com>
  11. ;*
  12. ;* This file is part of Libav.
  13. ;*
  14. ;* Libav is free software; you can redistribute it and/or
  15. ;* modify it under the terms of the GNU Lesser General Public
  16. ;* License as published by the Free Software Foundation; either
  17. ;* version 2.1 of the License, or (at your option) any later version.
  18. ;*
  19. ;* Libav is distributed in the hope that it will be useful,
  20. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  22. ;* Lesser General Public License for more details.
  23. ;*
  24. ;* You should have received a copy of the GNU Lesser General Public
  25. ;* License along with Libav; if not, write to the Free Software
  26. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  27. ;*****************************************************************************
  28. %include "libavutil/x86/x86util.asm"
  29. SECTION_RODATA
  30. ; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from, h264 have been split
  31. scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
  32. db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
  33. db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
  34. db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
  35. db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
  36. db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
  37. db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
  38. db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
  39. db 4+11*8, 5+11*8, 4+12*8, 5+12*8
  40. db 6+11*8, 7+11*8, 6+12*8, 7+12*8
  41. db 4+13*8, 5+13*8, 4+14*8, 5+14*8
  42. db 6+13*8, 7+13*8, 6+14*8, 7+14*8
  43. %ifdef PIC
  44. %define npicregs 1
  45. %define scan8 picregq
  46. %else
  47. %define npicregs 0
  48. %define scan8 scan8_mem
  49. %endif
  50. cextern pw_32
  51. cextern pw_1
  52. SECTION .text
  53. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  54. %macro IDCT4_ADD 3
  55. ; Load dct coeffs
  56. movq m0, [%2]
  57. movq m1, [%2+8]
  58. movq m2, [%2+16]
  59. movq m3, [%2+24]
  60. IDCT4_1D w, 0, 1, 2, 3, 4, 5
  61. mova m6, [pw_32]
  62. TRANSPOSE4x4W 0, 1, 2, 3, 4
  63. paddw m0, m6
  64. IDCT4_1D w, 0, 1, 2, 3, 4, 5
  65. pxor m7, m7
  66. movq [%2+ 0], m7
  67. movq [%2+ 8], m7
  68. movq [%2+16], m7
  69. movq [%2+24], m7
  70. STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
  71. lea %1, [%1+%3*2]
  72. STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
  73. %endmacro
  74. INIT_MMX mmx
  75. ; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
  76. cglobal h264_idct_add_8, 3, 3, 0
  77. IDCT4_ADD r0, r1, r2
  78. RET
  79. %macro IDCT8_1D 2
  80. mova m0, m1
  81. psraw m1, 1
  82. mova m4, m5
  83. psraw m4, 1
  84. paddw m4, m5
  85. paddw m1, m0
  86. paddw m4, m7
  87. paddw m1, m5
  88. psubw m4, m0
  89. paddw m1, m3
  90. psubw m0, m3
  91. psubw m5, m3
  92. psraw m3, 1
  93. paddw m0, m7
  94. psubw m5, m7
  95. psraw m7, 1
  96. psubw m0, m3
  97. psubw m5, m7
  98. mova m7, m1
  99. psraw m1, 2
  100. mova m3, m4
  101. psraw m3, 2
  102. paddw m3, m0
  103. psraw m0, 2
  104. paddw m1, m5
  105. psraw m5, 2
  106. psubw m0, m4
  107. psubw m7, m5
  108. mova m5, m6
  109. psraw m6, 1
  110. mova m4, m2
  111. psraw m4, 1
  112. paddw m6, m2
  113. psubw m4, m5
  114. mova m2, %1
  115. mova m5, %2
  116. SUMSUB_BA w, 5, 2
  117. SUMSUB_BA w, 6, 5
  118. SUMSUB_BA w, 4, 2
  119. SUMSUB_BA w, 7, 6
  120. SUMSUB_BA w, 0, 4
  121. SUMSUB_BA w, 3, 2
  122. SUMSUB_BA w, 1, 5
  123. SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
  124. %endmacro
  125. %macro IDCT8_1D_FULL 1
  126. mova m7, [%1+112]
  127. mova m6, [%1+ 96]
  128. mova m5, [%1+ 80]
  129. mova m3, [%1+ 48]
  130. mova m2, [%1+ 32]
  131. mova m1, [%1+ 16]
  132. IDCT8_1D [%1], [%1+ 64]
  133. %endmacro
  134. ; %1=int16_t *block, %2=int16_t *dstblock
  135. %macro IDCT8_ADD_MMX_START 2
  136. IDCT8_1D_FULL %1
  137. mova [%1], m7
  138. TRANSPOSE4x4W 0, 1, 2, 3, 7
  139. mova m7, [%1]
  140. mova [%2 ], m0
  141. mova [%2+16], m1
  142. mova [%2+32], m2
  143. mova [%2+48], m3
  144. TRANSPOSE4x4W 4, 5, 6, 7, 3
  145. mova [%2+ 8], m4
  146. mova [%2+24], m5
  147. mova [%2+40], m6
  148. mova [%2+56], m7
  149. %endmacro
  150. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  151. %macro IDCT8_ADD_MMX_END 3-4
  152. IDCT8_1D_FULL %2
  153. mova [%2 ], m5
  154. mova [%2+16], m6
  155. mova [%2+32], m7
  156. pxor m7, m7
  157. %if %0 == 4
  158. movq [%4+ 0], m7
  159. movq [%4+ 8], m7
  160. movq [%4+ 16], m7
  161. movq [%4+ 24], m7
  162. movq [%4+ 32], m7
  163. movq [%4+ 40], m7
  164. movq [%4+ 48], m7
  165. movq [%4+ 56], m7
  166. movq [%4+ 64], m7
  167. movq [%4+ 72], m7
  168. movq [%4+ 80], m7
  169. movq [%4+ 88], m7
  170. movq [%4+ 96], m7
  171. movq [%4+104], m7
  172. movq [%4+112], m7
  173. movq [%4+120], m7
  174. %endif
  175. STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
  176. lea %1, [%1+%3*2]
  177. STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
  178. mova m0, [%2 ]
  179. mova m1, [%2+16]
  180. mova m2, [%2+32]
  181. lea %1, [%1+%3*2]
  182. STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
  183. lea %1, [%1+%3*2]
  184. STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
  185. %endmacro
  186. INIT_MMX mmx
  187. ; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
  188. cglobal h264_idct8_add_8, 3, 4, 0
  189. %assign pad 128+4-(stack_offset&7)
  190. SUB rsp, pad
  191. add word [r1], 32
  192. IDCT8_ADD_MMX_START r1 , rsp
  193. IDCT8_ADD_MMX_START r1+8, rsp+64
  194. lea r3, [r0+4]
  195. IDCT8_ADD_MMX_END r0 , rsp, r2, r1
  196. IDCT8_ADD_MMX_END r3 , rsp+8, r2
  197. ADD rsp, pad
  198. RET
  199. ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
  200. %macro IDCT8_ADD_SSE 4
  201. IDCT8_1D_FULL %2
  202. %if ARCH_X86_64
  203. TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
  204. %else
  205. TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
  206. %endif
  207. paddw m0, [pw_32]
  208. %if ARCH_X86_64 == 0
  209. mova [%2 ], m0
  210. mova [%2+16], m4
  211. IDCT8_1D [%2], [%2+ 16]
  212. mova [%2 ], m6
  213. mova [%2+16], m7
  214. %else
  215. SWAP 0, 8
  216. SWAP 4, 9
  217. IDCT8_1D m8, m9
  218. SWAP 6, 8
  219. SWAP 7, 9
  220. %endif
  221. pxor m7, m7
  222. lea %4, [%3*3]
  223. STORE_DIFF m0, m6, m7, [%1 ]
  224. STORE_DIFF m1, m6, m7, [%1+%3 ]
  225. STORE_DIFF m2, m6, m7, [%1+%3*2]
  226. STORE_DIFF m3, m6, m7, [%1+%4 ]
  227. %if ARCH_X86_64 == 0
  228. mova m0, [%2 ]
  229. mova m1, [%2+16]
  230. %else
  231. SWAP 0, 8
  232. SWAP 1, 9
  233. %endif
  234. mova [%2+ 0], m7
  235. mova [%2+ 16], m7
  236. mova [%2+ 32], m7
  237. mova [%2+ 48], m7
  238. mova [%2+ 64], m7
  239. mova [%2+ 80], m7
  240. mova [%2+ 96], m7
  241. mova [%2+112], m7
  242. lea %1, [%1+%3*4]
  243. STORE_DIFF m4, m6, m7, [%1 ]
  244. STORE_DIFF m5, m6, m7, [%1+%3 ]
  245. STORE_DIFF m0, m6, m7, [%1+%3*2]
  246. STORE_DIFF m1, m6, m7, [%1+%4 ]
  247. %endmacro
  248. INIT_XMM sse2
  249. ; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
  250. cglobal h264_idct8_add_8, 3, 4, 10
  251. IDCT8_ADD_SSE r0, r1, r2, r3
  252. RET
  253. %macro DC_ADD_MMXEXT_INIT 2
  254. add %1, 32
  255. sar %1, 6
  256. movd m0, %1d
  257. lea %1, [%2*3]
  258. pshufw m0, m0, 0
  259. pxor m1, m1
  260. psubw m1, m0
  261. packuswb m0, m0
  262. packuswb m1, m1
  263. %endmacro
  264. %macro DC_ADD_MMXEXT_OP 4
  265. %1 m2, [%2 ]
  266. %1 m3, [%2+%3 ]
  267. %1 m4, [%2+%3*2]
  268. %1 m5, [%2+%4 ]
  269. paddusb m2, m0
  270. paddusb m3, m0
  271. paddusb m4, m0
  272. paddusb m5, m0
  273. psubusb m2, m1
  274. psubusb m3, m1
  275. psubusb m4, m1
  276. psubusb m5, m1
  277. %1 [%2 ], m2
  278. %1 [%2+%3 ], m3
  279. %1 [%2+%3*2], m4
  280. %1 [%2+%4 ], m5
  281. %endmacro
  282. INIT_MMX mmxext
  283. ; ff_h264_idct_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
  284. %if ARCH_X86_64
  285. cglobal h264_idct_dc_add_8, 3, 4, 0
  286. movsx r3, word [r1]
  287. mov dword [r1], 0
  288. DC_ADD_MMXEXT_INIT r3, r2
  289. DC_ADD_MMXEXT_OP movh, r0, r2, r3
  290. RET
  291. ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
  292. cglobal h264_idct8_dc_add_8, 3, 4, 0
  293. movsx r3, word [r1]
  294. mov dword [r1], 0
  295. DC_ADD_MMXEXT_INIT r3, r2
  296. DC_ADD_MMXEXT_OP mova, r0, r2, r3
  297. lea r0, [r0+r2*4]
  298. DC_ADD_MMXEXT_OP mova, r0, r2, r3
  299. RET
  300. %else
  301. cglobal h264_idct_dc_add_8, 2, 3, 0
  302. movsx r2, word [r1]
  303. mov dword [r1], 0
  304. mov r1, r2m
  305. DC_ADD_MMXEXT_INIT r2, r1
  306. DC_ADD_MMXEXT_OP movh, r0, r1, r2
  307. RET
  308. ; ff_h264_idct8_dc_add_mmxext(uint8_t *dst, int16_t *block, int stride)
  309. cglobal h264_idct8_dc_add_8, 2, 3, 0
  310. movsx r2, word [r1]
  311. mov dword [r1], 0
  312. mov r1, r2m
  313. DC_ADD_MMXEXT_INIT r2, r1
  314. DC_ADD_MMXEXT_OP mova, r0, r1, r2
  315. lea r0, [r0+r1*4]
  316. DC_ADD_MMXEXT_OP mova, r0, r1, r2
  317. RET
  318. %endif
  319. INIT_MMX mmx
  320. ; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
  321. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  322. cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
  323. xor r5, r5
  324. %ifdef PIC
  325. lea picregq, [scan8_mem]
  326. %endif
  327. .nextblock:
  328. movzx r6, byte [scan8+r5]
  329. movzx r6, byte [r4+r6]
  330. test r6, r6
  331. jz .skipblock
  332. mov r6d, dword [r1+r5*4]
  333. lea r6, [r0+r6]
  334. IDCT4_ADD r6, r2, r3
  335. .skipblock:
  336. inc r5
  337. add r2, 32
  338. cmp r5, 16
  339. jl .nextblock
  340. REP_RET
  341. ; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
  342. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  343. cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
  344. %assign pad 128+4-(stack_offset&7)
  345. SUB rsp, pad
  346. xor r5, r5
  347. %ifdef PIC
  348. lea picregq, [scan8_mem]
  349. %endif
  350. .nextblock:
  351. movzx r6, byte [scan8+r5]
  352. movzx r6, byte [r4+r6]
  353. test r6, r6
  354. jz .skipblock
  355. mov r6d, dword [r1+r5*4]
  356. add r6, r0
  357. add word [r2], 32
  358. IDCT8_ADD_MMX_START r2 , rsp
  359. IDCT8_ADD_MMX_START r2+8, rsp+64
  360. IDCT8_ADD_MMX_END r6 , rsp, r3, r2
  361. mov r6d, dword [r1+r5*4]
  362. lea r6, [r0+r6+4]
  363. IDCT8_ADD_MMX_END r6 , rsp+8, r3
  364. .skipblock:
  365. add r5, 4
  366. add r2, 128
  367. cmp r5, 16
  368. jl .nextblock
  369. ADD rsp, pad
  370. RET
  371. INIT_MMX mmxext
  372. ; ff_h264_idct_add16_mmxext(uint8_t *dst, const int *block_offset,
  373. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  374. cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  375. xor r5, r5
  376. %ifdef PIC
  377. lea picregq, [scan8_mem]
  378. %endif
  379. .nextblock:
  380. movzx r6, byte [scan8+r5]
  381. movzx r6, byte [r4+r6]
  382. test r6, r6
  383. jz .skipblock
  384. cmp r6, 1
  385. jnz .no_dc
  386. movsx r6, word [r2]
  387. test r6, r6
  388. jz .no_dc
  389. mov word [r2], 0
  390. DC_ADD_MMXEXT_INIT r6, r3
  391. %if ARCH_X86_64 == 0
  392. %define dst2q r1
  393. %define dst2d r1d
  394. %endif
  395. mov dst2d, dword [r1+r5*4]
  396. lea dst2q, [r0+dst2q]
  397. DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
  398. %if ARCH_X86_64 == 0
  399. mov r1, r1m
  400. %endif
  401. inc r5
  402. add r2, 32
  403. cmp r5, 16
  404. jl .nextblock
  405. REP_RET
  406. .no_dc:
  407. mov r6d, dword [r1+r5*4]
  408. add r6, r0
  409. IDCT4_ADD r6, r2, r3
  410. .skipblock:
  411. inc r5
  412. add r2, 32
  413. cmp r5, 16
  414. jl .nextblock
  415. REP_RET
  416. INIT_MMX mmx
  417. ; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
  418. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  419. cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
  420. xor r5, r5
  421. %ifdef PIC
  422. lea picregq, [scan8_mem]
  423. %endif
  424. .nextblock:
  425. movzx r6, byte [scan8+r5]
  426. movzx r6, byte [r4+r6]
  427. or r6w, word [r2]
  428. test r6, r6
  429. jz .skipblock
  430. mov r6d, dword [r1+r5*4]
  431. add r6, r0
  432. IDCT4_ADD r6, r2, r3
  433. .skipblock:
  434. inc r5
  435. add r2, 32
  436. cmp r5, 16
  437. jl .nextblock
  438. REP_RET
  439. INIT_MMX mmxext
  440. ; ff_h264_idct_add16intra_mmxext(uint8_t *dst, const int *block_offset,
  441. ; int16_t *block, int stride,
  442. ; const uint8_t nnzc[6*8])
  443. cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  444. xor r5, r5
  445. %ifdef PIC
  446. lea picregq, [scan8_mem]
  447. %endif
  448. .nextblock:
  449. movzx r6, byte [scan8+r5]
  450. movzx r6, byte [r4+r6]
  451. test r6, r6
  452. jz .try_dc
  453. mov r6d, dword [r1+r5*4]
  454. lea r6, [r0+r6]
  455. IDCT4_ADD r6, r2, r3
  456. inc r5
  457. add r2, 32
  458. cmp r5, 16
  459. jl .nextblock
  460. REP_RET
  461. .try_dc:
  462. movsx r6, word [r2]
  463. test r6, r6
  464. jz .skipblock
  465. mov word [r2], 0
  466. DC_ADD_MMXEXT_INIT r6, r3
  467. %if ARCH_X86_64 == 0
  468. %define dst2q r1
  469. %define dst2d r1d
  470. %endif
  471. mov dst2d, dword [r1+r5*4]
  472. add dst2q, r0
  473. DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
  474. %if ARCH_X86_64 == 0
  475. mov r1, r1m
  476. %endif
  477. .skipblock:
  478. inc r5
  479. add r2, 32
  480. cmp r5, 16
  481. jl .nextblock
  482. REP_RET
  483. ; ff_h264_idct8_add4_mmxext(uint8_t *dst, const int *block_offset,
  484. ; int16_t *block, int stride,
  485. ; const uint8_t nnzc[6*8])
  486. cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  487. %assign pad 128+4-(stack_offset&7)
  488. SUB rsp, pad
  489. xor r5, r5
  490. %ifdef PIC
  491. lea picregq, [scan8_mem]
  492. %endif
  493. .nextblock:
  494. movzx r6, byte [scan8+r5]
  495. movzx r6, byte [r4+r6]
  496. test r6, r6
  497. jz .skipblock
  498. cmp r6, 1
  499. jnz .no_dc
  500. movsx r6, word [r2]
  501. test r6, r6
  502. jz .no_dc
  503. mov word [r2], 0
  504. DC_ADD_MMXEXT_INIT r6, r3
  505. %if ARCH_X86_64 == 0
  506. %define dst2q r1
  507. %define dst2d r1d
  508. %endif
  509. mov dst2d, dword [r1+r5*4]
  510. lea dst2q, [r0+dst2q]
  511. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  512. lea dst2q, [dst2q+r3*4]
  513. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  514. %if ARCH_X86_64 == 0
  515. mov r1, r1m
  516. %endif
  517. add r5, 4
  518. add r2, 128
  519. cmp r5, 16
  520. jl .nextblock
  521. ADD rsp, pad
  522. RET
  523. .no_dc:
  524. mov r6d, dword [r1+r5*4]
  525. add r6, r0
  526. add word [r2], 32
  527. IDCT8_ADD_MMX_START r2 , rsp
  528. IDCT8_ADD_MMX_START r2+8, rsp+64
  529. IDCT8_ADD_MMX_END r6 , rsp, r3, r2
  530. mov r6d, dword [r1+r5*4]
  531. lea r6, [r0+r6+4]
  532. IDCT8_ADD_MMX_END r6 , rsp+8, r3
  533. .skipblock:
  534. add r5, 4
  535. add r2, 128
  536. cmp r5, 16
  537. jl .nextblock
  538. ADD rsp, pad
  539. RET
  540. INIT_XMM sse2
  541. ; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
  542. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  543. cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  544. xor r5, r5
  545. %ifdef PIC
  546. lea picregq, [scan8_mem]
  547. %endif
  548. .nextblock:
  549. movzx r6, byte [scan8+r5]
  550. movzx r6, byte [r4+r6]
  551. test r6, r6
  552. jz .skipblock
  553. cmp r6, 1
  554. jnz .no_dc
  555. movsx r6, word [r2]
  556. test r6, r6
  557. jz .no_dc
  558. INIT_MMX cpuname
  559. mov word [r2], 0
  560. DC_ADD_MMXEXT_INIT r6, r3
  561. %if ARCH_X86_64 == 0
  562. %define dst2q r1
  563. %define dst2d r1d
  564. %endif
  565. mov dst2d, dword [r1+r5*4]
  566. add dst2q, r0
  567. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  568. lea dst2q, [dst2q+r3*4]
  569. DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
  570. %if ARCH_X86_64 == 0
  571. mov r1, r1m
  572. %endif
  573. add r5, 4
  574. add r2, 128
  575. cmp r5, 16
  576. jl .nextblock
  577. REP_RET
  578. .no_dc:
  579. INIT_XMM cpuname
  580. mov dst2d, dword [r1+r5*4]
  581. add dst2q, r0
  582. IDCT8_ADD_SSE dst2q, r2, r3, r6
  583. %if ARCH_X86_64 == 0
  584. mov r1, r1m
  585. %endif
  586. .skipblock:
  587. add r5, 4
  588. add r2, 128
  589. cmp r5, 16
  590. jl .nextblock
  591. REP_RET
  592. INIT_MMX mmx
  593. h264_idct_add8_mmx_plane:
  594. .nextblock:
  595. movzx r6, byte [scan8+r5]
  596. movzx r6, byte [r4+r6]
  597. or r6w, word [r2]
  598. test r6, r6
  599. jz .skipblock
  600. %if ARCH_X86_64
  601. mov r0d, dword [r1+r5*4]
  602. add r0, [dst2q]
  603. %else
  604. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  605. mov r0, [r0]
  606. add r0, dword [r1+r5*4]
  607. %endif
  608. IDCT4_ADD r0, r2, r3
  609. .skipblock:
  610. inc r5
  611. add r2, 32
  612. test r5, 3
  613. jnz .nextblock
  614. rep ret
  615. ; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
  616. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  617. cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  618. mov r5, 16
  619. add r2, 512
  620. %ifdef PIC
  621. lea picregq, [scan8_mem]
  622. %endif
  623. %if ARCH_X86_64
  624. mov dst2q, r0
  625. %endif
  626. call h264_idct_add8_mmx_plane
  627. mov r5, 32
  628. add r2, 384
  629. %if ARCH_X86_64
  630. add dst2q, gprsize
  631. %else
  632. add r0mp, gprsize
  633. %endif
  634. call h264_idct_add8_mmx_plane
  635. RET
  636. h264_idct_add8_mmxext_plane:
  637. .nextblock:
  638. movzx r6, byte [scan8+r5]
  639. movzx r6, byte [r4+r6]
  640. test r6, r6
  641. jz .try_dc
  642. %if ARCH_X86_64
  643. mov r0d, dword [r1+r5*4]
  644. add r0, [dst2q]
  645. %else
  646. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  647. mov r0, [r0]
  648. add r0, dword [r1+r5*4]
  649. %endif
  650. IDCT4_ADD r0, r2, r3
  651. inc r5
  652. add r2, 32
  653. test r5, 3
  654. jnz .nextblock
  655. rep ret
  656. .try_dc:
  657. movsx r6, word [r2]
  658. test r6, r6
  659. jz .skipblock
  660. mov word [r2], 0
  661. DC_ADD_MMXEXT_INIT r6, r3
  662. %if ARCH_X86_64
  663. mov r0d, dword [r1+r5*4]
  664. add r0, [dst2q]
  665. %else
  666. mov r0, r1m ; XXX r1m here is actually r0m of the calling func
  667. mov r0, [r0]
  668. add r0, dword [r1+r5*4]
  669. %endif
  670. DC_ADD_MMXEXT_OP movh, r0, r3, r6
  671. .skipblock:
  672. inc r5
  673. add r2, 32
  674. test r5, 3
  675. jnz .nextblock
  676. rep ret
  677. INIT_MMX mmxext
  678. ; ff_h264_idct_add8_mmxext(uint8_t **dest, const int *block_offset,
  679. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  680. cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
  681. mov r5, 16
  682. add r2, 512
  683. %if ARCH_X86_64
  684. mov dst2q, r0
  685. %endif
  686. %ifdef PIC
  687. lea picregq, [scan8_mem]
  688. %endif
  689. call h264_idct_add8_mmxext_plane
  690. mov r5, 32
  691. add r2, 384
  692. %if ARCH_X86_64
  693. add dst2q, gprsize
  694. %else
  695. add r0mp, gprsize
  696. %endif
  697. call h264_idct_add8_mmxext_plane
  698. RET
  699. ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
  700. h264_idct_dc_add8_mmxext:
  701. movd m0, [r2 ] ; 0 0 X D
  702. mov word [r2+ 0], 0
  703. punpcklwd m0, [r2+32] ; x X d D
  704. mov word [r2+32], 0
  705. paddsw m0, [pw_32]
  706. psraw m0, 6
  707. punpcklwd m0, m0 ; d d D D
  708. pxor m1, m1 ; 0 0 0 0
  709. psubw m1, m0 ; -d-d-D-D
  710. packuswb m0, m1 ; -d-d-D-D d d D D
  711. pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
  712. punpcklwd m0, m0 ; d d d d D D D D
  713. lea r6, [r3*3]
  714. DC_ADD_MMXEXT_OP movq, r0, r3, r6
  715. ret
  716. ALIGN 16
  717. INIT_XMM sse2
  718. ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
  719. h264_add8x4_idct_sse2:
  720. movq m0, [r2+ 0]
  721. movq m1, [r2+ 8]
  722. movq m2, [r2+16]
  723. movq m3, [r2+24]
  724. movhps m0, [r2+32]
  725. movhps m1, [r2+40]
  726. movhps m2, [r2+48]
  727. movhps m3, [r2+56]
  728. IDCT4_1D w,0,1,2,3,4,5
  729. TRANSPOSE2x4x4W 0,1,2,3,4
  730. paddw m0, [pw_32]
  731. IDCT4_1D w,0,1,2,3,4,5
  732. pxor m7, m7
  733. mova [r2+ 0], m7
  734. mova [r2+16], m7
  735. mova [r2+32], m7
  736. mova [r2+48], m7
  737. STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
  738. lea r0, [r0+r3*2]
  739. STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
  740. ret
  741. %macro add16_sse2_cycle 2
  742. movzx r0, word [r4+%2]
  743. test r0, r0
  744. jz .cycle%1end
  745. mov r0d, dword [r1+%1*8]
  746. %if ARCH_X86_64
  747. add r0, r5
  748. %else
  749. add r0, r0m
  750. %endif
  751. call h264_add8x4_idct_sse2
  752. .cycle%1end:
  753. %if %1 < 7
  754. add r2, 64
  755. %endif
  756. %endmacro
  757. ; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
  758. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  759. cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
  760. %if ARCH_X86_64
  761. mov r5, r0
  762. %endif
  763. ; unrolling of the loop leads to an average performance gain of
  764. ; 20-25%
  765. add16_sse2_cycle 0, 0xc
  766. add16_sse2_cycle 1, 0x14
  767. add16_sse2_cycle 2, 0xe
  768. add16_sse2_cycle 3, 0x16
  769. add16_sse2_cycle 4, 0x1c
  770. add16_sse2_cycle 5, 0x24
  771. add16_sse2_cycle 6, 0x1e
  772. add16_sse2_cycle 7, 0x26
  773. RET
  774. %macro add16intra_sse2_cycle 2
  775. movzx r0, word [r4+%2]
  776. test r0, r0
  777. jz .try%1dc
  778. mov r0d, dword [r1+%1*8]
  779. %if ARCH_X86_64
  780. add r0, r7
  781. %else
  782. add r0, r0m
  783. %endif
  784. call h264_add8x4_idct_sse2
  785. jmp .cycle%1end
  786. .try%1dc:
  787. movsx r0, word [r2 ]
  788. or r0w, word [r2+32]
  789. jz .cycle%1end
  790. mov r0d, dword [r1+%1*8]
  791. %if ARCH_X86_64
  792. add r0, r7
  793. %else
  794. add r0, r0m
  795. %endif
  796. call h264_idct_dc_add8_mmxext
  797. .cycle%1end:
  798. %if %1 < 7
  799. add r2, 64
  800. %endif
  801. %endmacro
  802. ; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
  803. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  804. cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
  805. %if ARCH_X86_64
  806. mov r7, r0
  807. %endif
  808. add16intra_sse2_cycle 0, 0xc
  809. add16intra_sse2_cycle 1, 0x14
  810. add16intra_sse2_cycle 2, 0xe
  811. add16intra_sse2_cycle 3, 0x16
  812. add16intra_sse2_cycle 4, 0x1c
  813. add16intra_sse2_cycle 5, 0x24
  814. add16intra_sse2_cycle 6, 0x1e
  815. add16intra_sse2_cycle 7, 0x26
  816. RET
  817. %macro add8_sse2_cycle 2
  818. movzx r0, word [r4+%2]
  819. test r0, r0
  820. jz .try%1dc
  821. %if ARCH_X86_64
  822. mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  823. add r0, [r7]
  824. %else
  825. mov r0, r0m
  826. mov r0, [r0]
  827. add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  828. %endif
  829. call h264_add8x4_idct_sse2
  830. jmp .cycle%1end
  831. .try%1dc:
  832. movsx r0, word [r2 ]
  833. or r0w, word [r2+32]
  834. jz .cycle%1end
  835. %if ARCH_X86_64
  836. mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  837. add r0, [r7]
  838. %else
  839. mov r0, r0m
  840. mov r0, [r0]
  841. add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
  842. %endif
  843. call h264_idct_dc_add8_mmxext
  844. .cycle%1end:
  845. %if %1 == 1
  846. add r2, 384+64
  847. %elif %1 < 3
  848. add r2, 64
  849. %endif
  850. %endmacro
  851. ; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
  852. ; int16_t *block, int stride, const uint8_t nnzc[6*8])
  853. cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
  854. add r2, 512
  855. %if ARCH_X86_64
  856. mov r7, r0
  857. %endif
  858. add8_sse2_cycle 0, 0x34
  859. add8_sse2_cycle 1, 0x3c
  860. %if ARCH_X86_64
  861. add r7, gprsize
  862. %else
  863. add r0mp, gprsize
  864. %endif
  865. add8_sse2_cycle 2, 0x5c
  866. add8_sse2_cycle 3, 0x64
  867. RET
  868. ;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
  869. %macro WALSH4_1D 5
  870. SUMSUB_BADC w, %4, %3, %2, %1, %5
  871. SUMSUB_BADC w, %4, %2, %3, %1, %5
  872. SWAP %1, %4, %3
  873. %endmacro
  874. %macro DEQUANT_MMX 3
  875. mova m7, [pw_1]
  876. mova m4, %1
  877. punpcklwd %1, m7
  878. punpckhwd m4, m7
  879. mova m5, %2
  880. punpcklwd %2, m7
  881. punpckhwd m5, m7
  882. movd m7, t3d
  883. punpckldq m7, m7
  884. pmaddwd %1, m7
  885. pmaddwd %2, m7
  886. pmaddwd m4, m7
  887. pmaddwd m5, m7
  888. psrad %1, %3
  889. psrad %2, %3
  890. psrad m4, %3
  891. psrad m5, %3
  892. packssdw %1, m4
  893. packssdw %2, m5
  894. %endmacro
  895. %macro STORE_WORDS 5-9
  896. %if cpuflag(sse)
  897. movd t0d, %1
  898. psrldq %1, 4
  899. movd t1d, %1
  900. psrldq %1, 4
  901. mov [t2+%2*32], t0w
  902. mov [t2+%4*32], t1w
  903. shr t0d, 16
  904. shr t1d, 16
  905. mov [t2+%3*32], t0w
  906. mov [t2+%5*32], t1w
  907. movd t0d, %1
  908. psrldq %1, 4
  909. movd t1d, %1
  910. mov [t2+%6*32], t0w
  911. mov [t2+%8*32], t1w
  912. shr t0d, 16
  913. shr t1d, 16
  914. mov [t2+%7*32], t0w
  915. mov [t2+%9*32], t1w
  916. %else
  917. movd t0d, %1
  918. psrlq %1, 32
  919. movd t1d, %1
  920. mov [t2+%2*32], t0w
  921. mov [t2+%4*32], t1w
  922. shr t0d, 16
  923. shr t1d, 16
  924. mov [t2+%3*32], t0w
  925. mov [t2+%5*32], t1w
  926. %endif
  927. %endmacro
  928. %macro DEQUANT_STORE 1
  929. %if cpuflag(sse2)
  930. movd xmm4, t3d
  931. movq xmm5, [pw_1]
  932. pshufd xmm4, xmm4, 0
  933. movq2dq xmm0, m0
  934. movq2dq xmm1, m1
  935. movq2dq xmm2, m2
  936. movq2dq xmm3, m3
  937. punpcklwd xmm0, xmm5
  938. punpcklwd xmm1, xmm5
  939. punpcklwd xmm2, xmm5
  940. punpcklwd xmm3, xmm5
  941. pmaddwd xmm0, xmm4
  942. pmaddwd xmm1, xmm4
  943. pmaddwd xmm2, xmm4
  944. pmaddwd xmm3, xmm4
  945. psrad xmm0, %1
  946. psrad xmm1, %1
  947. psrad xmm2, %1
  948. psrad xmm3, %1
  949. packssdw xmm0, xmm1
  950. packssdw xmm2, xmm3
  951. STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
  952. STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
  953. %else
  954. DEQUANT_MMX m0, m1, %1
  955. STORE_WORDS m0, 0, 1, 4, 5
  956. STORE_WORDS m1, 2, 3, 6, 7
  957. DEQUANT_MMX m2, m3, %1
  958. STORE_WORDS m2, 8, 9, 12, 13
  959. STORE_WORDS m3, 10, 11, 14, 15
  960. %endif
  961. %endmacro
  962. %macro IDCT_DC_DEQUANT 1
  963. cglobal h264_luma_dc_dequant_idct, 3, 4, %1
  964. ; manually spill XMM registers for Win64 because
  965. ; the code here is initialized with INIT_MMX
  966. WIN64_SPILL_XMM %1
  967. movq m3, [r1+24]
  968. movq m2, [r1+16]
  969. movq m1, [r1+ 8]
  970. movq m0, [r1+ 0]
  971. WALSH4_1D 0,1,2,3,4
  972. TRANSPOSE4x4W 0,1,2,3,4
  973. WALSH4_1D 0,1,2,3,4
  974. ; shift, tmp, output, qmul
  975. %if WIN64
  976. DECLARE_REG_TMP 0,3,1,2
  977. ; we can't avoid this, because r0 is the shift register (ecx) on win64
  978. xchg r0, t2
  979. %elif ARCH_X86_64
  980. DECLARE_REG_TMP 3,1,0,2
  981. %else
  982. DECLARE_REG_TMP 1,3,0,2
  983. %endif
  984. cmp t3d, 32767
  985. jg .big_qmul
  986. add t3d, 128 << 16
  987. DEQUANT_STORE 8
  988. RET
  989. .big_qmul:
  990. bsr t0d, t3d
  991. add t3d, 128 << 16
  992. mov t1d, 7
  993. cmp t0d, t1d
  994. cmovg t0d, t1d
  995. inc t1d
  996. shr t3d, t0b
  997. sub t1d, t0d
  998. %if cpuflag(sse2)
  999. movd xmm6, t1d
  1000. DEQUANT_STORE xmm6
  1001. %else
  1002. movd m6, t1d
  1003. DEQUANT_STORE m6
  1004. %endif
  1005. RET
  1006. %endmacro
  1007. INIT_MMX mmx
  1008. IDCT_DC_DEQUANT 0
  1009. INIT_MMX sse2
  1010. IDCT_DC_DEQUANT 7