;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+1*8, 5+1*8, 4+2*8, 5+2*8
           db 6+1*8, 7+1*8, 6+2*8, 7+2*8
           db 4+3*8, 5+3*8, 4+4*8, 5+4*8
           db 6+3*8, 7+3*8, 6+4*8, 7+4*8
           db 1+1*8, 2+1*8
           db 1+2*8, 2+2*8
           db 1+4*8, 2+4*8
           db 1+5*8, 2+5*8

%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

cextern pw_32

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
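; The 4x4 iDCT is done as two 1-D passes with a transpose in between:
; rows first, then (after TRANSPOSE4x4W) columns. The +32 bias added to
; m0 before the second pass combines with the >>6 in STORE_DIFFx2 to
; round the final result.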
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      0, 1, 2, 3, 4, 5
    pxor         m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_mmx, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET
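; 8-point 1-D iDCT. On entry m1-m3 and m5-m7 hold rows 1-3 and 5-7 of the
; input; %1 and %2 are memory operands with rows 0 and 4 (see
; IDCT8_1D_FULL below). The odd half is built from the shifted (>>1, >>2)
; butterflies of the H.264 8x8 inverse transform, the even half from
; SUMSUB_BA butterflies; the final SWAP returns the outputs to m0-m7 in
; natural order.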
%macro IDCT8_1D 2
    mova         m4, m5
    mova         m0, m1
    psraw        m4, 1
    psraw        m1, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    paddw        m0, m7
    psubw        m5, m7
    psraw        m3, 1
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m3, m4
    mova         m7, m1
    psraw        m1, 2
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m4, m2
    mova         m5, m6
    psraw        m4, 1
    psraw        m6, 1
    psubw        m4, m5
    paddw        m6, m2

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    m5, m2
    SUMSUB_BA    m6, m5
    SUMSUB_BA    m4, m2
    SUMSUB_BA    m7, m6
    SUMSUB_BA    m0, m4
    SUMSUB_BA    m3, m2
    SUMSUB_BA    m1, m5
    SWAP          7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
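; With only eight 64-bit mmx registers, the 8x8 transform is processed as
; two 4x8 halves: IDCT8_ADD_MMX_START runs the first 1-D pass and
; transposes the result into a 128-byte stack buffer, then
; IDCT8_ADD_MMX_END runs the second pass on each half and adds it to dst.
; The +32 rounding bias is folded into the DC coefficient beforehand.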
INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_mmx, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=gpr scratch (receives stride*3)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%ifndef ARCH_X86_64
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%ifndef ARCH_X86_64
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro
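; The SSE2 version keeps the whole 8x8 block in xmm registers. On x86-64
; it uses m8/m9 as spill space for the rows that IDCT8_1D takes from
; memory; on x86-32 those rows are spilled to the coefficient buffer
; instead.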
INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx        %1, word [%1]
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
%else
    add          %3, 32
    sar          %3, 6
    movd         m0, %3d
    lea          %3, [%2*3]
%endif
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro
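; DC-only shortcut: the reconstructed value is just (dc+32)>>6 added to
; every pixel. The broadcast delta is split into a positive part (m0) and
; a negated part (m1), both clamped to bytes by packuswb, so the
; paddusb/psubusb pair in DC_ADD_MMX2_OP applies a signed delta using
; only unsigned saturating byte ops.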
%macro DC_ADD_MMX2_OP 3-4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea          r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
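; Loop pattern shared by all the add16/add8 variants: r5 indexes the 4x4
; blocks, scan8 translates that index into a position in the nnzc[] array,
; and blocks whose non-zero-coefficient count is 0 are skipped entirely.
; Each 4x4 block occupies 32 bytes (16 int16_t) of the coefficient
; buffer, hence the "add r2, 32" per iteration.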
; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET
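; The mmx2 variants add a fast path: if a block's nnzc entry is exactly 1
; and its DC coefficient is non-zero, the DC coefficient must be the only
; non-zero one, so the full transform is replaced by the much cheaper
; DC_ADD broadcast.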
; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_mmx2, 5, 7, 0
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_mmx2, 5, 7, 0
%assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp, r3
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_sse2, 5, 7, 10
    xor          r5, r5
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea     dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov    dst_regd, dword [r1+r5*4]
    lea     dst_reg, [r0+dst_reg]
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov          r1, r1m
%endif
.skipblock
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
INIT_MMX
h264_idct_add8_mmx_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
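; For the chroma functions, dest is a uint8_t** holding the two chroma
; plane pointers, so the plane helper is called twice and the pointer slot
; (r10 on x86-64, the stack argument on x86-32) is advanced by gprsize in
; between. Each plane has four 4x4 blocks, hence the "test r5, 3" loop
; condition.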
; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    call h264_idct_add8_mmx_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET
h264_idct_add8_mmx2_plane:
.nextblock
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [r10]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_mmx2, 5, 7, 0
    mov          r5, 16
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
%ifdef PIC
    lea         r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET
INIT_MMX
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6 = clobbered
h264_idct_dc_add8_mmx2:
    movd         m0, [r2   ]          ;  0 0 X D
    punpcklwd    m0, [r2+32]          ;  x X d D
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret
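; This handles the DC-only case for two horizontally adjacent 4x4 chroma
; blocks at once: the two DC coefficients (at [r2] and [r2+32]) are
; rounded, broadcast side by side into one 8-pixel-wide delta, and applied
; over four rows with a single DC_ADD_MMX2_OP.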
ALIGN 16
INIT_XMM
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
x264_add8x4_idct_sse2:
    movq         m0, [r2+ 0]
    movq         m1, [r2+ 8]
    movq         m2, [r2+16]
    movq         m3, [r2+24]
    movhps       m0, [r2+32]
    movhps       m1, [r2+40]
    movhps       m2, [r2+48]
    movhps       m3, [r2+56]
    IDCT4_1D      0, 1, 2, 3, 4, 5
    TRANSPOSE2x4x4W 0, 1, 2, 3, 4
    paddw        m0, [pw_32]
    IDCT4_1D      0, 1, 2, 3, 4, 5
    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea          r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
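; Two 4x4 blocks are transformed at once: movq/movhps interleave the rows
; of the two blocks into the low and high halves of each xmm register, and
; a single pair of IDCT4_1D passes (with TRANSPOSE2x4x4W transposing each
; half separately) then covers an 8x4 region of the destination.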
%macro add16_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add          r0, r10
%else
    add          r0, r0m
%endif
    call x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add          r2, 64
%endif
%endmacro
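; The word-sized load from nnzc[] tests the two horizontally adjacent 4x4
; blocks of each 8x4 pair in one comparison: their scan8 positions are
; consecutive bytes, so the immediate offsets below (0xc, 0x14, ...) are
; the scan8 indices of the left block of each pair.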
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET
%macro add16intra_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
    mov         r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add          r0, r10
%else
    add          r0, r0m
%endif
    call x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
    mov         r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add          r0, r10
%else
    add          r0, r0m
%endif
    call h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add          r2, 64
%endif
%endmacro
; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET
%macro add8_sse2_cycle 2
    movzx        r0, word [r4+%2]
    test         r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+%1*8+64]
    add          r0, [r10]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+%1*8+64]
%endif
    call x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx        r0, word [r2   ]
    or          r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov         r0d, dword [r1+%1*8+64]
    add          r0, [r10]
%else
    mov          r0, r0m
    mov          r0, [r0]
    add          r0, dword [r1+%1*8+64]
%endif
    call h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 3
    add          r2, 64
%endif
%endmacro
; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_sse2, 5, 7, 8
    add          r2, 512
%ifdef ARCH_X86_64
    mov         r10, r0
%endif
    add8_sse2_cycle 0, 0x09
    add8_sse2_cycle 1, 0x11
%ifdef ARCH_X86_64
    add         r10, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x21
    add8_sse2_cycle 3, 0x29
    RET