;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8
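; scan8 maps a block index (the 16 luma 4x4 blocks first, then the chroma
; blocks) to its position in the caller's 8-entries-per-row non-zero-count
; cache (the nnzc[6*8] argument); the add16/add8 loops below use it to skip
; blocks whose coefficients are all zero.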
%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
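; IDCT4_ADD computes the H.264 4x4 inverse transform and adds the result to
; dst: a 1-D pass over the rows, a transpose, a +32 rounding bias folded into
; the first row, a 1-D pass over the columns, then a >>6 with clamping while
; adding to dst (done inside STORE_DIFFx2).  Roughly, each 1-D pass
; (IDCT4_1D) is the standard butterfly (s0..s3/z*/d* are illustrative names
; for one row or column, not registers):
;     z0 = s0 + s2              z1 = s0 - s2
;     z2 = (s1 >> 1) - s3       z3 = s1 + (s3 >> 1)
;     d0 = z0 + z3   d1 = z1 + z2   d2 = z1 - z2   d3 = z0 - z3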
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq  m0, [%2]
    movq  m1, [%2+8]
    movq  m2, [%2+16]
    movq  m3, [%2+24]

    IDCT4_1D 0, 1, 2, 3, 4, 5
    mova  m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, m6
    IDCT4_1D 0, 1, 2, 3, 4, 5
    pxor  m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea   %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD r0, r1, r2
    RET
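; The 8x8 transform uses the same row/column structure.  IDCT8_1D below is
; the 8-point inverse transform from the H.264 spec: an even half that is a
; 4-point transform on coefficients 0/2/4/6, and an odd half that combines
; 1/3/5/7 with >>1 and >>2 scalings, roughly (a*/b* are illustrative
; temporaries, s0..s7 one row or column):
;     a1 = -s3 + s5 - s7 - (s7 >> 1)      a3 = s1 + s7 - s3 - (s3 >> 1)
;     a5 = -s1 + s7 + s5 + (s5 >> 1)      a7 = s3 + s5 + s1 + (s1 >> 1)
;     b1 = a1 + (a7 >> 2)                 b7 = a7 - (a1 >> 2)
;     b3 = a3 + (a5 >> 2)                 b5 = (a3 >> 2) - a5
; followed by butterflies against the even half to produce the 8 outputs.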
%macro IDCT8_1D 2
    mova  m4, m5
    mova  m0, m1
    psraw m4, 1
    psraw m1, 1
    paddw m4, m5
    paddw m1, m0
    paddw m4, m7
    paddw m1, m5
    psubw m4, m0
    paddw m1, m3

    psubw m0, m3
    psubw m5, m3
    paddw m0, m7
    psubw m5, m7
    psraw m3, 1
    psraw m7, 1
    psubw m0, m3
    psubw m5, m7

    mova  m3, m4
    mova  m7, m1
    psraw m1, 2
    psraw m3, 2
    paddw m3, m0
    psraw m0, 2
    paddw m1, m5
    psraw m5, 2
    psubw m0, m4
    psubw m7, m5

    mova  m4, m2
    mova  m5, m6
    psraw m4, 1
    psraw m6, 1
    psubw m4, m5
    paddw m6, m2

    mova  m2, %1
    mova  m5, %2
    SUMSUB_BA m5, m2
    SUMSUB_BA m6, m5
    SUMSUB_BA m4, m2
    SUMSUB_BA m7, m6
    SUMSUB_BA m0, m4
    SUMSUB_BA m3, m2
    SUMSUB_BA m1, m5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova m7, [%1+112]
    mova m6, [%1+ 96]
    mova m5, [%1+ 80]
    mova m3, [%1+ 48]
    mova m2, [%1+ 32]
    mova m1, [%1+ 16]
    IDCT8_1D [%1], [%1+64]
%endmacro
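; With MMX only four 16-bit coefficients fit in a register, so the 8x8 block
; is handled as two 4-column halves through a 128-byte scratch buffer on the
; stack: IDCT8_ADD_MMX_START runs the row pass on one half and transposes it
; into the buffer, and IDCT8_ADD_MMX_END then runs the column pass on the
; buffer and adds the result to dst (a few rows are parked back in the buffer
; because only m0-m7 are available).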
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova m7, [%1]
    mova [%2   ], m0
    mova [%2+16], m1
    mova [%2+32], m2
    mova [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova [%2+ 8], m4
    mova [%2+24], m5
    mova [%2+40], m6
    mova [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova [%2   ], m5
    mova [%2+16], m6
    mova [%2+32], m7
    pxor m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea  %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova m0, [%2   ]
    mova m1, [%2+16]
    mova m2, [%2+32]
    lea  %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea  %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_mmx, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB  rsp, pad

    add  word [r1], 32
    IDCT8_ADD_MMX_START r1,   rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea  r3, [r0+4]
    IDCT8_ADD_MMX_END   r0,   rsp,   r2
    IDCT8_ADD_MMX_END   r3,   rsp+8, r2

    ADD  rsp, pad
    RET

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
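; The SSE2 version keeps a whole 8-coefficient row per xmm register, so both
; 1-D passes stay in registers.  On x86-64 the transpose uses xmm8 as scratch
; and xmm8/xmm9 hold the two rows that would otherwise have to be spilled; on
; x86-32 those two rows are spilled to the block buffer instead ([%2] and
; [%2+16]).  %4 is a scratch GPR used for the 3*stride offset.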
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw m0, [pw_32]

%ifndef ARCH_X86_64
    mova [%2   ], m0
    mova [%2+16], m4
    IDCT8_1D [%2], [%2+16]
    mova [%2   ], m6
    mova [%2+16], m7
%else
    SWAP 0, 8
    SWAP 4, 9
    IDCT8_1D m8, m9
    SWAP 6, 8
    SWAP 7, 9
%endif

    pxor  m7, m7
    lea   %4, [%3*3]
    STORE_DIFF m0, m6, m7, [%1     ]
    STORE_DIFF m1, m6, m7, [%1+%3  ]
    STORE_DIFF m2, m6, m7, [%1+%3*2]
    STORE_DIFF m3, m6, m7, [%1+%4  ]
%ifndef ARCH_X86_64
    mova m0, [%2   ]
    mova m1, [%2+16]
%else
    SWAP 0, 8
    SWAP 1, 9
%endif
    lea   %1, [%1+%3*4]
    STORE_DIFF m4, m6, m7, [%1     ]
    STORE_DIFF m5, m6, m7, [%1+%3  ]
    STORE_DIFF m0, m6, m7, [%1+%3*2]
    STORE_DIFF m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
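; DC-only blocks take a cheaper path: the single coefficient is rounded and
; scaled (dc = (block[0] + 32) >> 6), broadcast to all lanes, and applied to
; the pixels with a saturating unsigned add of max(dc, 0) followed by a
; saturating subtract of max(-dc, 0), which clamps to 0..255 without having
; to unpack the pixels to words.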
%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx %1, word [%1]
    add   %1, 32
    sar   %1, 6
    movd  m0, %1d
    lea   %1, [%2*3]
%else
    add   %3, 32
    sar   %3, 6
    movd  m0, %3d
    lea   %3, [%2*3]
%endif
    pshufw   m0, m0, 0
    pxor     m1, m1
    psubw    m1, m0
    packuswb m0, m0
    packuswb m1, m1
%endmacro

%macro DC_ADD_MMX2_OP 3-4
    %1      m2, [%2     ]
    %1      m3, [%2+%3  ]
    %1      m4, [%2+%3*2]
    %1      m5, [%2+%4  ]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    %1      [%2     ], m2
    %1      [%2+%3  ], m3
    %1      [%2+%3*2], m4
    %1      [%2+%4  ], m5
%endmacro

INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP   movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP   mova, r0, r2, r1
    lea  r0, [r0+r2*4]
    DC_ADD_MMX2_OP   mova, r0, r2, r1
    RET

; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
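; The add16 functions walk the 16 luma 4x4 blocks of a macroblock: for each
; block index r5 they look up scan8[r5], test the corresponding byte of the
; nnzc cache to skip all-zero blocks, fetch the per-block destination offset
; from block_offset, and run the 4x4 idct-add; r2 advances 32 bytes
; (16 coefficients) per block.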
cglobal h264_idct_add16_mmx, 5, 7, 0
    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .skipblock
    mov   r6d, dword [r1+r5*4]
    lea   r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc  r5
    add  r2, 32
    cmp  r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    SUB  rsp, pad

    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .skipblock
    mov   r6d, dword [r1+r5*4]
    lea   r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2,   rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6,   rsp, r3
    mov   r6d, dword [r1+r5*4]
    lea   r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6,   rsp+8, r3
.skipblock
    add  r5, 4
    add  r2, 128
    cmp  r5, 16
    jl .nextblock

    ADD  rsp, pad
    RET

; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx2, 5, 7, 0
    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .skipblock
    cmp  r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test  r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov  dst_regd, dword [r1+r5*4]
    lea  dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov  r1, r1m
%endif
    inc  r5
    add  r2, 32
    cmp  r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov  r6d, dword [r1+r5*4]
    lea  r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc  r5
    add  r2, 32
    cmp  r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    or    r6w, word [r2]
    test  r6, r6
    jz .skipblock
    mov   r6d, dword [r1+r5*4]
    lea   r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc  r5
    add  r2, 32
    cmp  r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .try_dc
    mov   r6d, dword [r1+r5*4]
    lea   r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
    inc  r5
    add  r2, 32
    cmp  r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx r6, word [r2]
    test  r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov  dst_regd, dword [r1+r5*4]
    lea  dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov  r1, r1m
%endif
.skipblock
    inc  r5
    add  r2, 32
    cmp  r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx2, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    SUB  rsp, pad

    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .skipblock
    cmp  r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test  r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov  dst_regd, dword [r1+r5*4]
    lea  dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea  dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov  r1, r1m
%endif
    add  r5, 4
    add  r2, 128
    cmp  r5, 16
    jl .nextblock

    ADD  rsp, pad
    RET
.no_dc
    mov  r6d, dword [r1+r5*4]
    lea  r6, [r0+r6]
    add  word [r2], 32
    IDCT8_ADD_MMX_START r2,   rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6,   rsp, r3
    mov  r6d, dword [r1+r5*4]
    lea  r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6,   rsp+8, r3
.skipblock
    add  r5, 4
    add  r2, 128
    cmp  r5, 16
    jl .nextblock

    ADD  rsp, pad
    RET

INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_sse2, 5, 7, 10
    xor  r5, r5
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .skipblock
    cmp  r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test  r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov  dst_regd, dword [r1+r5*4]
    lea  dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea  dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov  r1, r1m
%endif
    add  r5, 4
    add  r2, 128
    cmp  r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov  dst_regd, dword [r1+r5*4]
    lea  dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov  r1, r1m
%endif
.skipblock
    add  r5, 4
    add  r2, 128
    cmp  r5, 16
    jl .nextblock
    REP_RET

INIT_MMX
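; The add8 functions handle the chroma blocks.  dest here is an array of two
; plane pointers (U then V): r2 is first advanced past the 16 luma blocks
; (16*32 = 512 bytes), the per-plane helper processes 4 blocks (the
; "test r5, 3" loop), and between the two calls the plane pointer is stepped
; to the next entry of dest.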
h264_idct_add8_mmx_plane:
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    or    r6w, word [r2]
    test  r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov   r0d, dword [r1+r5*4]
    add   r0, [r10]
%else
    mov   r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov   r0, [r0]
    add   r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
.skipblock
    inc  r5
    add  r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov  r5, 16
    add  r2, 512
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov  r10, r0
%endif
    call h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add  r10, gprsize
%else
    add  r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmx2_plane:
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test  r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov   r0d, dword [r1+r5*4]
    add   r0, [r10]
%else
    mov   r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov   r0, [r0]
    add   r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
    inc  r5
    add  r2, 32
    test r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx r6, word [r2]
    test  r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov   r0d, dword [r1+r5*4]
    add   r0, [r10]
%else
    mov   r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov   r0, [r0]
    add   r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc  r5
    add  r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov  r5, 16
    add  r2, 512
%ifdef ARCH_X86_64
    mov  r10, r0
%endif
%ifdef PIC
    lea  r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add  r10, gprsize
%else
    add  r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET

INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
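; This helper adds the DC of two horizontally adjacent 4x4 blocks in one go:
; the two DC values are loaded from block[0] and from the next block 32 bytes
; later, rounded and scaled together, expanded into per-pixel bytes, and then
; applied to an 8x4 strip with DC_ADD_MMX2_OP.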
h264_idct_dc_add8_mmx2:
    movd      m0, [r2   ]   ;   0 0 X D
    punpcklwd m0, [r2+32]   ;   x X d D
    paddsw    m0, [pw_32]
    psraw     m0, 6
    punpcklwd m0, m0        ;  d d D D
    pxor      m1, m1        ;  0 0 0 0
    psubw     m1, m0        ; -d-d-D-D
    packuswb  m0, m1        ; -d-d-D-D d d D D
    pshufw    m1, m0, 0xFA  ; -d-d-d-d-D-D-D-D
    punpcklwd m0, m0        ;  d d d d D D D D
    lea       r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
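; Adds two adjacent 4x4 blocks (an 8x4 strip) at once: each xmm register
; holds the same row of both blocks side by side, so one pair of 1-D passes
; plus a transpose covers both blocks.  The SSE2 add16/add8 loops below call
; this once per pair of blocks.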
x264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw  m0, [pw_32]
    IDCT4_1D 0, 1, 2, 3, 4, 5
    pxor   m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea    r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx r0, word [r4+%2]
    test  r0, r0
    jz .cycle%1end
    mov   r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add   r0, r10
%else
    add   r0, r0m
%endif
    call  x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add   r2, 64
%endif
%endmacro
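; Each cycle handles one 8x4 pair of blocks: the word load from the nnzc
; cache at a scan8 position (0x0c corresponds to scan8[0], 0x14 to scan8[2],
; and so on) tests both blocks' non-zero counts with a single compare, and
; block_offset is read once per pair since the 8-wide idct-add covers both
; blocks from the same destination offset.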
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov  r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx r0, word [r4+%2]
    test  r0, r0
    jz .try%1dc
    mov   r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add   r0, r10
%else
    add   r0, r0m
%endif
    call  x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx r0, word [r2]
    or    r0w, word [r2+32]
    jz .cycle%1end
    mov   r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add   r0, r10
%else
    add   r0, r0m
%endif
    call  h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add   r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov  r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx r0, word [r4+%2]
    test  r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov   r0d, dword [r1+%1*8+64]
    add   r0, [r10]
%else
    mov   r0, r0m
    mov   r0, [r0]
    add   r0, dword [r1+%1*8+64]
%endif
    call  x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx r0, word [r2]
    or    r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov   r0d, dword [r1+%1*8+64]
    add   r0, [r10]
%else
    mov   r0, r0m
    mov   r0, [r0]
    add   r0, dword [r1+%1*8+64]
%endif
    call  h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 3
    add   r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    add  r2, 512
%ifdef ARCH_X86_64
    mov  r10, r0
%endif
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
    add  r10, gprsize
%else
    add  r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
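; The luma DC transform applies a 4x4 Hadamard (Walsh) transform to the 16 DC
; coefficients (rows, transpose, rows again), then dequantizes each value as
; roughly (dc * qmul + 128) >> 8 and scatters the results to the first
; coefficient of each 4x4 block in the output array (hence the *32-byte
; strides in STORE_WORDS).  The .big_qmul path pre-shifts qmul when it would
; not fit in a 16-bit multiply.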
%macro WALSH4_1D 5
    SUMSUB_BADC m%4, m%3, m%2, m%1, m%5
    SUMSUB_BADC m%4, m%2, m%3, m%1, m%5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
    mova      m7, [pw_1]
    mova      m4, %1
    punpcklwd %1, m7
    punpckhwd m4, m7
    mova      m5, %2
    punpcklwd %2, m7
    punpckhwd m5, m7
    movd      m7, t3d
    punpckldq m7, m7
    pmaddwd   %1, m7
    pmaddwd   %2, m7
    pmaddwd   m4, m7
    pmaddwd   m5, m7
    psrad     %1, %3
    psrad     %2, %3
    psrad     m4, %3
    psrad     m5, %3
    packssdw  %1, m4
    packssdw  %2, m5
%endmacro

%macro STORE_WORDS_MMX 5
    movd  t0d, %1
    psrlq %1, 32
    movd  t1d, %1
    mov   [t2+%2*32], t0w
    mov   [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov   [t2+%3*32], t0w
    mov   [t2+%5*32], t1w
%endmacro

%macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS_MMX m0, 0, 1, 4, 5
    STORE_WORDS_MMX m1, 2, 3, 6, 7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS_MMX m2,  8,  9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro

%macro STORE_WORDS_SSE 9
    movd   t0d, %1
    psrldq %1, 4
    movd   t1d, %1
    psrldq %1, 4
    mov    [t2+%2*32], t0w
    mov    [t2+%4*32], t1w
    shr    t0d, 16
    shr    t1d, 16
    mov    [t2+%3*32], t0w
    mov    [t2+%5*32], t1w
    movd   t0d, %1
    psrldq %1, 4
    movd   t1d, %1
    mov    [t2+%6*32], t0w
    mov    [t2+%8*32], t1w
    shr    t0d, 16
    shr    t1d, 16
    mov    [t2+%7*32], t0w
    mov    [t2+%9*32], t1w
%endmacro

%macro DEQUANT_STORE_SSE2 1
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS_SSE xmm0, 0, 1, 4, 5,  2,  3,  6,  7
    STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro

%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3, 4, %2
    movq m3, [r1+24]
    movq m2, [r1+16]
    movq m1, [r1+ 8]
    movq m0, [r1+ 0]
    WALSH4_1D     0, 1, 2, 3, 4
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    WALSH4_1D     0, 1, 2, 3, 4

; shift, tmp, output, qmul
%ifdef WIN64
    DECLARE_REG_TMP 0, 3, 1, 2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg r0, t2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 3, 1, 0, 2
%else
    DECLARE_REG_TMP 1, 3, 0, 2
%endif

    cmp  t3d, 32767
    jg .big_qmul
    add  t3d, 128 << 16
%ifidn %1, mmx
    DEQUANT_STORE_MMX 8
%else
    DEQUANT_STORE_SSE2 8
%endif
    RET
.big_qmul:
    bsr   t0d, t3d
    add   t3d, 128 << 16
    mov   t1d, 7
    cmp   t0d, t1d
    cmovg t0d, t1d
    inc   t1d
    shr   t3d, t0b
    sub   t1d, t0d
%ifidn %1, mmx
    movd m6, t1d
    DEQUANT_STORE_MMX m6
%else
    movd xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro

INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7