You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1084 lines
28KB

  1. ;*****************************************************************************
  2. ;* MMX/SSE2-optimized H.264 iDCT
  3. ;*****************************************************************************
  4. ;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
  5. ;* Copyright (C) 2003-2008 x264 project
  6. ;*
  7. ;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
  8. ;* Loren Merritt <lorenm@u.washington.edu>
  9. ;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001@163.com>
  11. ;*
  12. ;* This file is part of Libav.
  13. ;*
  14. ;* Libav is free software; you can redistribute it and/or
  15. ;* modify it under the terms of the GNU Lesser General Public
  16. ;* License as published by the Free Software Foundation; either
  17. ;* version 2.1 of the License, or (at your option) any later version.
  18. ;*
  19. ;* Libav is distributed in the hope that it will be useful,
  20. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  21. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  22. ;* Lesser General Public License for more details.
  23. ;*
  24. ;* You should have received a copy of the GNU Lesser General Public
  25. ;* License along with Libav; if not, write to the Free Software
  26. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  27. ;*****************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; scan8_mem: per-block byte offsets into the nnzc[6*8] table (encoded as
; x + y*8), in the same layout as the C-side scan8[] table.
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8

%ifdef PIC
; PIC builds must compute the table address at run time, so one extra GPR
; (picregq) is reserved by the cglobal declarations and scan8 is addressed
; through it.
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32                        ; packed words, all 32 (idct rounding)
cextern pw_1                         ; packed words, all 1 (pmaddwd helper)

SECTION .text
; IDCT4_ADD: 4x4 inverse transform of one block, result added to dst.
; Clears the 32 bytes of coefficients at %2 after reading them.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs (four rows of 4 words)
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5    ; first 1-D pass
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6                  ; +32: rounding for the final >>6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5    ; second 1-D pass

    pxor         m7, m7
    movq   [%2+ 0], m7                   ; clear coeffs for the next block
    movq   [%2+ 8], m7
    movq   [%2+16], m7
    movq   [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 ; add (x>>6) to two rows of dst
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
INIT_MMX mmx
; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
; Thin wrapper: single 4x4 idct+add.
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
; IDCT8_1D: one 8-point 1-D idct pass.
; in:  m1,m2,m3,m5,m6,m7 = rows 1,2,3,5,6,7
;      %1,%2 = memory operands holding rows 0 and 4 (kept in memory to
;      free two registers for intermediates)
; out: m0-m7 = transformed rows 0-7 (see the SWAP at the end)
%macro IDCT8_1D 2
    ; odd part: combine rows 1,3,5,7 (psraw by 1/2 implements the
    ; half-pel weighting factors of the H.264 8x8 transform)
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    ; even part: rows 2 and 6
    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    ; rows 0 and 4 from memory, then the final butterflies
    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA     w, 5, 2
    SUMSUB_BA     w, 6, 5
    SUMSUB_BA     w, 4, 2
    SUMSUB_BA     w, 7, 6
    SUMSUB_BA     w, 0, 4
    SUMSUB_BA     w, 3, 2
    SUMSUB_BA     w, 1, 5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
; IDCT8_1D_FULL: load six of the eight rows of an 8x8 coeff block
; (rows are 16 bytes apart) and run IDCT8_1D; rows 0 and 4 stay in
; memory and are passed to IDCT8_1D as memory operands.
; %1 = int16_t *block
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro
; IDCT8_ADD_MMX_START: first (row) pass of the MMX 8x8 idct — one 1-D
; transform, then transpose the result as two 4x4 quadrants into the
; scratch buffer (the MMX regs only hold 4 words, so the 8x8 is handled
; as two 4-wide halves).
; %1=int16_t *block, %2=int16_t *dstblock (scratch)
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7              ; spill m7: transpose needs a temp reg
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]            ; reload spilled row
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro
; IDCT8_ADD_MMX_END: second (column) pass of the MMX 8x8 idct, adding the
; result to dst.  With the optional 4th arg, the 128 coefficient bytes at
; %4 are zeroed for the next macroblock.
; %1=uint8_t *dst, %2=int16_t *block (transposed scratch), %3=int stride,
; [%4=int16_t *orig_block to clear]
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5              ; spill rows 5-7; reloaded after the
    mova    [%2+16], m6              ; first stores free up registers
    mova    [%2+32], m7
    pxor         m7, m7
%if %0 == 4
    ; clear the source coefficient block
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]         ; reload the spilled rows
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
INIT_MMX mmx
; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
; 8x8 idct+add, processed as two 4-column halves through a 128-byte
; stack scratch buffer (kept 8-byte aligned by the pad computation).
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32              ; fold the +32 rounding into the DC coeff
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]          ; dst pointer for the right half
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1 ; also clears the coeffs at r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
; IDCT8_ADD_SSE: full 8x8 idct + add in xmm registers (whole rows fit).
; Clears the 128 coefficient bytes at %2.
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=scratch gpr
; (receives stride*3)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8      ; m8 as transpose temp
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16] ; spill to memory
%endif
    paddw        m0, [pw_32]         ; rounding for the final >>6

%if ARCH_X86_64 == 0
    ; x86-32: no m8/m9, so rows 0 and 4 go through the coeff buffer
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6              ; park rows 6/7 for the later reload
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]

%if ARCH_X86_64 == 0
    mova         m0, [%2   ]         ; reload parked rows
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif

    ; clear the coefficients for the next macroblock
    mova    [%2+  0], m7
    mova    [%2+ 16], m7
    mova    [%2+ 32], m7
    mova    [%2+ 48], m7
    mova    [%2+ 64], m7
    mova    [%2+ 80], m7
    mova    [%2+ 96], m7
    mova    [%2+112], m7

    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro
INIT_XMM sse2
; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
; Thin wrapper: single 8x8 idct+add (r3 is scratch for stride*3).
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
; DC_ADD_MMXEXT_INIT: compute dc = (%1 + 32) >> 6 and broadcast it:
; m0 = packed saturated +dc bytes, m1 = packed saturated -dc bytes
; (add m0 then subtract m1 == clipped add of a signed dc).
; %1 = dc value reg (clobbered: becomes stride*3), %2 = stride
%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]          ; reuse the reg as stride*3 for ..._OP
    pshufw       m0, m0, 0           ; broadcast dc to 4 words
    pxor         m1, m1
    psubw        m1, m0              ; m1 = -dc
    packuswb     m0, m0              ; clamp: m0 = max(dc, 0) as bytes
    packuswb     m1, m1              ; clamp: m1 = max(-dc, 0) as bytes
%endmacro
; DC_ADD_MMXEXT_OP: add the broadcast dc (m0/m1 from ..._INIT) to four
; rows of pixels with unsigned saturation.
; %1 = load/store mnemonic (movh/mova/movq), %2 = dst, %3 = stride,
; %4 = stride*3
%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0              ; + max(dc, 0)
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1              ; - max(-dc, 0)
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
INIT_MMX mmxext
%if ARCH_X86_64
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
; dc-only 4x4: broadcast-add the single dc coefficient to a 4x4 area.
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]       ; dc coefficient
    mov  dword [r1], 0               ; clear it for the next block
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
; dc-only 8x8: same, over an 8x8 area (two 4-row passes).
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3 ; rows 0-3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3 ; rows 4-7
    RET
%else
; x86-32 variants: only two args are loaded into registers; stride is
; re-read from the stack into r1 once the block pointer is consumed.
; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m             ; r1 = stride
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif
INIT_MMX mmx
; void ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
; Loop over the 16 luma 4x4 blocks, idct+add each one that has non-zero
; coefficients according to nnzc.
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5              ; r5 = block counter 0..15
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5] ; scan8 position of block r5
    movzx        r6, byte [r4+r6]    ; its non-zero-coefficient count
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4] ; block_offset[r5]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32              ; 16 coeffs * 2 bytes per block
    cmp          r5, 16
    jl .nextblock
    REP_RET
; void ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
; Loop over the four 8x8 luma blocks; each coded block is idct'd in two
; 4-column halves via the stack scratch buffer.
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad             ; 128-byte transpose scratch
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32              ; fold rounding into the DC coeff
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2 ; left half; also clears coeffs
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]       ; right half of the 8x8 block
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4               ; an 8x8 spans four scan8 positions
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
INIT_MMX mmxext
; void ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
; Like the mmx version, but blocks with a single (dc-only) coefficient
; take a cheap broadcast-add fast path instead of a full idct.
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5              ; r5 = block counter 0..15
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]    ; nnzc for this 4x4 block
    test         r6, r6
    jz .skipblock
    cmp          r6, 1               ; exactly one coeff -> maybe dc-only
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    ; dc-only fast path
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m             ; restore block_offset (aliased as dst2q)
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
INIT_MMX mmx
; void ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                                    int16_t *block, int stride,
;                                    const uint8_t nnzc[6 * 8])
; Intra variant: a block is processed if it has non-zero ac coefficients
; OR a non-zero dc coefficient (the dc word is or'd into the test).
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]       ; nnzc or dc coeff non-zero?
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
INIT_MMX mmxext
; void ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                       int16_t *block, int stride,
;                                       const uint8_t nnzc[6 * 8])
; Intra variant with a dc-only fast path: blocks with no ac coefficients
; but a non-zero dc use the broadcast-add instead of a full idct.
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc                       ; no ac coeffs: maybe dc-only
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m             ; restore block_offset (aliased as dst2q)
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; void ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
; 8x8 variant with a dc-only fast path (broadcast-add over 8x8).
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad             ; transpose scratch for the full-idct path
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1               ; single coeff -> maybe dc-only 8x8
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    ; dc-only fast path
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 0-3
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 4-7
%if ARCH_X86_64 == 0
    mov          r1, r1m             ; restore block_offset (aliased as dst2q)
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32              ; fold rounding into the DC coeff
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]       ; right half of the 8x8
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
INIT_XMM sse2
; void ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
; sse2 8x8 loop; the dc-only fast path still runs in mmx registers,
; hence the INIT_MMX/INIT_XMM cpuname switches below.
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname                     ; dc path uses mmx regs (pshufw etc.)
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 0-3
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 ; rows 4-7
%if ARCH_X86_64 == 0
    mov          r1, r1m             ; restore block_offset (aliased as dst2q)
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname                     ; back to xmm for the full idct
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
INIT_MMX mmx
; Helper: idct+add the 4 blocks of one chroma plane whose nnzc or dc
; entry is non-zero.  Expects the caller's cglobal register layout:
; r1 = block_offset, r2 = block, r3 = stride, r4 = nnzc, r5 = starting
; scan8 index; the plane's dest pointer is read through dst2q (x86-64)
; or the caller's r0m (x86-32).  Clobbers r0 and r6.
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]       ; nnzc or dc coeff non-zero?
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]         ; dest[plane] + block_offset
%else
    mov          r0, r1m             ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3               ; stop after 4 blocks
    jnz .nextblock
    rep ret

; void ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                              int16_t *block, int stride,
;                              const uint8_t nnzc[6 * 8])
; Note: dest is uint8_t** (one pointer per chroma plane).
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16              ; chroma-u blocks start at scan8 index 16
    add          r2, 512             ; coeffs of block 16 (16 * 32 bytes)
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call h264_idct_add8_mmx_plane    ; u plane
    mov          r5, 32              ; chroma-v blocks start at index 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize         ; advance to dest[1]
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane    ; v plane
    RET
; Helper: like h264_idct_add8_mmx_plane, but blocks with no ac
; coefficients and a non-zero dc take the broadcast-add fast path.
; Same register contract as the mmx plane helper; clobbers r0 and r6.
h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]         ; dest[plane] + block_offset
%else
    mov          r0, r1m             ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m             ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3               ; stop after 4 blocks
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; void ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                                 int16_t *block, int stride,
;                                 const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16              ; chroma-u blocks start at scan8 index 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane ; u plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize         ; advance to dest[1]
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane ; v plane
    RET
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
; Handles the dc coefficients of two adjacent 4x4 blocks ([r2] and
; [r2+32]) in one pass over an 8x4 pixel area; clears both dc words.
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]         ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]         ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]         ; +32 rounding
    psraw        m0, 6
    punpcklwd    m0, m0              ;  d d D D
    pxor         m1, m1              ;  0 0 0 0
    psubw        m1, m0              ; -d-d-D-D
    packuswb     m0, m1              ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA        ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0              ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret
ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
; Two 4x4 idcts done in parallel: the first block in the xmm low halves,
; the second ([r2+32]) in the high halves, covering an 8x4 dst area.
; Clears the 64 coefficient bytes.
h264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]         ; block 0 rows -> low halves
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]         ; block 1 rows -> high halves
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4    ; transpose each half independently
    paddw        m0, [pw_32]         ; rounding for the final >>6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    mova    [r2+ 0], m7              ; clear coeffs for the next block pair
    mova    [r2+16], m7
    mova    [r2+32], m7
    mova    [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
; One unrolled iteration of the sse2 add16 loop: handles an 8x4 pair of
; 4x4 blocks.  %1 = cycle number, %2 = byte offset of the relevant nnzc
; word (covers both blocks of the pair).
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8] ; block_offset of the pair's first block
%if ARCH_X86_64
    add          r0, r5              ; r5 = saved dst base
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add          r2, 64              ; advance past the pair's coeffs
%endif
%endmacro

; void ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r5, r0              ; keep dst; r0 becomes scratch below
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET
; One unrolled iteration of the sse2 add16intra loop, with a dc-only
; fallback when the pair has no ac coefficients.
; %1 = cycle number, %2 = byte offset of the relevant nnzc word.
%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7              ; r7 = saved dst base
%else
    add          r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]    ; dc of either block non-zero?
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add          r0, r7
%else
    add          r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                     int16_t *block, int stride,
;                                     const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov          r7, r0              ; keep dst; r0 becomes scratch
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET
; One unrolled iteration of the sse2 chroma add8 loop.  dest is a
; uint8_t** (per-plane pointers), hence the extra indirection through
; [r7] / [r0m].  %1 = cycle number, %2 = offset of the nnzc word.
%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]            ; r7 = &dest[plane]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]    ; dc of either block non-zero?
    jz .cycle%1end
%if ARCH_X86_64
    mov         r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add          r0, [r7]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add          r2, 384+64          ; jump from u coeffs to v coeffs
%elif %1 < 3
    add          r2, 64
%endif
%endmacro

; void ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512             ; chroma coeffs start at block 16
%if ARCH_X86_64
    mov          r7, r0              ; keep dest; r0 becomes scratch
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize         ; advance to dest[1]
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET
; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
; WALSH4_1D: one 4-point Hadamard (Walsh) butterfly pass over four
; registers; %5 is a temp reg.
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro
; DEQUANT_MMX: multiply the 8 words in %1/%2 by the qmul held in the low
; word of t3d and arithmetic-shift the 32-bit products right by %3.
; Interleaving with pw_1 lets pmaddwd compute w*qmul + 1*hi(t3d), so the
; caller's "add t3d, 128 << 16" folds the rounding term in for free.
; Clobbers m4, m5, m7.
%macro DEQUANT_MMX 3
    mova         m7, [pw_1]
    mova         m4, %1
    punpcklwd    %1, m7              ; (w, 1) word pairs, low half
    punpckhwd    m4, m7              ; high half
    mova         m5, %2
    punpcklwd    %2, m7
    punpckhwd    m5, m7
    movd         m7, t3d
    punpckldq    m7, m7              ; broadcast the qmul dword
    pmaddwd      %1, m7              ; dword = w*qmul + rounding word
    pmaddwd      %2, m7
    pmaddwd      m4, m7
    pmaddwd      m5, m7
    psrad        %1, %3
    psrad        %2, %3
    psrad        m4, %3
    psrad        m5, %3
    packssdw     %1, m4              ; back to 8 words
    packssdw     %2, m5
%endmacro
; STORE_WORDS: scatter the packed words of %1 into the output at t2, one
; int16 every 32 bytes (i.e. the dc slot of each 16-coefficient block).
; %2.. are the destination block indices: the sse form stores 8 words
; (%2-%9), the mmx form 4 (%2-%5).  Clobbers t0/t1 and shifts %1.
%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    psrldq       %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd        t0d, %1
    psrldq       %1, 4
    movd        t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd        t0d, %1
    psrlq        %1, 32
    movd        t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr         t0d, 16
    shr         t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro
; DEQUANT_STORE: dequantize the 16 transformed dc values in m0-m3 by the
; qmul in t3d, shift right by %1, and scatter them into the output via
; STORE_WORDS.  The sse2 path widens the mmx rows into xmm registers;
; the mmx path goes through DEQUANT_MMX two rows at a time.
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd       xmm4, t3d
    movq       xmm5, [pw_1]
    pshufd     xmm4, xmm4, 0         ; broadcast the qmul dword
    movq2dq    xmm0, m0              ; move rows from mmx to xmm regs
    movq2dq    xmm1, m1
    movq2dq    xmm2, m2
    movq2dq    xmm3, m3
    punpcklwd  xmm0, xmm5            ; (w, 1) pairs for pmaddwd
    punpcklwd  xmm1, xmm5
    punpcklwd  xmm2, xmm5
    punpcklwd  xmm3, xmm5
    pmaddwd    xmm0, xmm4            ; dword = w*qmul + rounding word
    pmaddwd    xmm1, xmm4
    pmaddwd    xmm2, xmm4
    pmaddwd    xmm3, xmm4
    psrad      xmm0, %1
    psrad      xmm1, %1
    psrad      xmm2, %1
    psrad      xmm3, %1
    packssdw   xmm0, xmm1
    packssdw   xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX  m0, m1, %1
    STORE_WORDS  m0,  0,  1,  4,  5
    STORE_WORDS  m1,  2,  3,  6,  7
    DEQUANT_MMX  m2, m3, %1
    STORE_WORDS  m2,  8,  9, 12, 13
    STORE_WORDS  m3, 10, 11, 14, 15
%endif
%endmacro
; IDCT_DC_DEQUANT: emits ff_h264_luma_dc_dequant_idct_{mmx,sse2}:
; 4x4 Hadamard inverse transform of the luma dc coefficients, then
; dequantization by qmul and scatter into the per-block dc slots.
; %1 = number of xmm regs to declare to cglobal.
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq         m3, [r1+24]
    movq         m2, [r1+16]
    movq         m1, [r1+ 8]
    movq         m0, [r1+ 0]
    WALSH4_1D    0, 1, 2, 3, 4       ; rows
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D    0, 1, 2, 3, 4       ; columns

    ; t0-t3 roles: shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg         r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp         t3d, 32767           ; does qmul fit in pmaddwd's 16 bits?
    jg .big_qmul
    add         t3d, 128 << 16       ; fold +128 rounding into pmaddwd's high word
    DEQUANT_STORE 8                  ; (w*qmul + 128) >> 8
    RET
.big_qmul:
    ; large qmul: pre-shift it down to 16 bits and shift the products
    ; correspondingly less afterwards
    bsr         t0d, t3d             ; position of qmul's top bit
    add         t3d, 128 << 16
    mov         t1d, 7
    cmp         t0d, t1d
    cmovg       t0d, t1d             ; t0 = min(bsr, 7) = pre-shift amount
    inc         t1d                  ; t1 = 8
    shr         t3d, t0b
    sub         t1d, t0d             ; final shift = 8 - pre-shift
%if cpuflag(sse2)
    movd       xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd         m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
; NOTE: INIT_MMX (not INIT_XMM) with the sse2 suffix is deliberate — the
; Walsh transform runs in mmx registers; DEQUANT_STORE switches to
; explicit xmm ops internally via cpuflag(sse2).
INIT_MMX sse2
IDCT_DC_DEQUANT 7