;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;*          Loren Merritt <lorenm@u.washington.edu>
;*          Holger Lubitz <hal@duncan.ol.sub.de>
;*          Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
; FIXME this table is a duplicate from h264data.h, and will be removed once the tables from h264 have been split
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db 4+11*8, 5+11*8, 4+12*8, 5+12*8
           db 6+11*8, 7+11*8, 6+12*8, 7+12*8
           db 4+13*8, 5+13*8, 4+14*8, 5+14*8
           db 6+13*8, 7+13*8, 6+14*8, 7+14*8

%ifdef PIC
%define scan8 r11
%else
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text
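
; The 4x4 add functions below do the IDCT in two 1-D passes with a transpose
; in between; the +32 rounding bias is added before the second pass, and the
; final >>6 and clipped add to dst happen in STORE_DIFFx2.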
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq m0, [%2]
    movq m1, [%2+8]
    movq m2, [%2+16]
    movq m3, [%2+24]

    IDCT4_1D w, 0, 1, 2, 3, 4, 5
    mova m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, m6
    IDCT4_1D w, 0, 1, 2, 3, 4, 5
    pxor m7, m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX
; ff_h264_idct_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8_mmx, 3, 3, 0
    IDCT4_ADD r0, r1, r2
    RET
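
; One pass of the 8-point IDCT. Rows 1-3 and 5-7 are expected in m1-m3 and
; m5-m7; rows 0 and 4 are taken from the memory operands %1 and %2. After
; the final SWAP the eight results sit in m0-m7 in natural order.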
%macro IDCT8_1D 2
    mova m0, m1
    psraw m1, 1
    mova m4, m5
    psraw m4, 1
    paddw m4, m5
    paddw m1, m0
    paddw m4, m7
    paddw m1, m5
    psubw m4, m0
    paddw m1, m3

    psubw m0, m3
    psubw m5, m3
    psraw m3, 1
    paddw m0, m7
    psubw m5, m7
    psraw m7, 1
    psubw m0, m3
    psubw m5, m7

    mova m7, m1
    psraw m1, 2
    mova m3, m4
    psraw m3, 2
    paddw m3, m0
    psraw m0, 2
    paddw m1, m5
    psraw m5, 2
    psubw m0, m4
    psubw m7, m5

    mova m5, m6
    psraw m6, 1
    mova m4, m2
    psraw m4, 1
    paddw m6, m2
    psubw m4, m5

    mova m2, %1
    mova m5, %2
    SUMSUB_BA w, 5, 2
    SUMSUB_BA w, 6, 5
    SUMSUB_BA w, 4, 2
    SUMSUB_BA w, 7, 6
    SUMSUB_BA w, 0, 4
    SUMSUB_BA w, 3, 2
    SUMSUB_BA w, 1, 5
    SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova m7, [%1+112]
    mova m6, [%1+ 96]
    mova m5, [%1+ 80]
    mova m3, [%1+ 48]
    mova m2, [%1+ 32]
    mova m1, [%1+ 16]
    IDCT8_1D [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova m7, [%1]
    mova [%2 ], m0
    mova [%2+16], m1
    mova [%2+32], m2
    mova [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova [%2+ 8], m4
    mova [%2+24], m5
    mova [%2+40], m6
    mova [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3
    IDCT8_1D_FULL %2
    mova [%2 ], m5
    mova [%2+16], m6
    mova [%2+32], m7
    pxor m7, m7
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova m0, [%2 ]
    mova m1, [%2+16]
    mova m2, [%2+32]
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
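
; With only eight 64-bit MMX registers the 8x8 transform is done as two 8x4
; halves: IDCT8_ADD_MMX_START runs the first 1-D pass and transposes the
; result into a 128-byte scratch buffer on the stack; IDCT8_ADD_MMX_END then
; runs the second pass on each half and adds it to dst.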
INIT_MMX
; ff_h264_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8_mmx, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    add word [r1], 32
    IDCT8_ADD_MMX_START r1 , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea r3, [r0+4]
    IDCT8_ADD_MMX_END r0 , rsp, r2
    IDCT8_ADD_MMX_END r3 , rsp+8, r2

    ADD rsp, pad
    RET
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride, %4=tmp register (holds stride*3)
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%ifdef ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw m0, [pw_32]

%ifndef ARCH_X86_64
    mova [%2 ], m0
    mova [%2+16], m4
    IDCT8_1D [%2], [%2+ 16]
    mova [%2 ], m6
    mova [%2+16], m7
%else
    SWAP 0, 8
    SWAP 4, 9
    IDCT8_1D m8, m9
    SWAP 6, 8
    SWAP 7, 9
%endif

    pxor m7, m7
    lea %4, [%3*3]
    STORE_DIFF m0, m6, m7, [%1 ]
    STORE_DIFF m1, m6, m7, [%1+%3 ]
    STORE_DIFF m2, m6, m7, [%1+%3*2]
    STORE_DIFF m3, m6, m7, [%1+%4 ]

%ifndef ARCH_X86_64
    mova m0, [%2 ]
    mova m1, [%2+16]
%else
    SWAP 0, 8
    SWAP 1, 9
%endif
    lea %1, [%1+%3*4]
    STORE_DIFF m4, m6, m7, [%1 ]
    STORE_DIFF m5, m6, m7, [%1+%3 ]
    STORE_DIFF m0, m6, m7, [%1+%3*2]
    STORE_DIFF m1, m6, m7, [%1+%4 ]
%endmacro

INIT_XMM
; ff_h264_idct8_add_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8_sse2, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET
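
; DC-only shortcut: if a block has just a DC coefficient, every pixel gets
; the same delta (dc+32)>>6. DC_ADD_MMX2_INIT broadcasts the positive delta
; into m0 and its negation into m1 (both packed to bytes); DC_ADD_MMX2_OP
; applies them to four rows of dst with saturating paddusb/psubusb.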
%macro DC_ADD_MMX2_INIT 2-3
%if %0 == 2
    movsx %1, word [%1]
    add %1, 32
    sar %1, 6
    movd m0, %1d
    lea %1, [%2*3]
%else
    add %3, 32
    sar %3, 6
    movd m0, %3d
    lea %3, [%2*3]
%endif
    pshufw m0, m0, 0
    pxor m1, m1
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
%endmacro

%macro DC_ADD_MMX2_OP 4
    %1 m2, [%2 ]
    %1 m3, [%2+%3 ]
    %1 m4, [%2+%3*2]
    %1 m5, [%2+%4 ]
    paddusb m2, m0
    paddusb m3, m0
    paddusb m4, m0
    paddusb m5, m0
    psubusb m2, m1
    psubusb m3, m1
    psubusb m4, m1
    psubusb m5, m1
    %1 [%2 ], m2
    %1 [%2+%3 ], m3
    %1 [%2+%3*2], m4
    %1 [%2+%4 ], m5
%endmacro

INIT_MMX
; ff_h264_idct_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP movh, r0, r2, r1
    RET

; ff_h264_idct8_dc_add_mmx2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8_mmx2, 3, 3, 0
    DC_ADD_MMX2_INIT r1, r2
    DC_ADD_MMX2_OP mova, r0, r2, r1
    lea r0, [r0+r2*4]
    DC_ADD_MMX2_OP mova, r0, r2, r1
    RET
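
; The add16/add4/add8 loops below walk the blocks of a macroblock: the block
; index is mapped through scan8[] into the nnzc cache and blocks with a zero
; non-zero count are skipped; block_offset[] holds each block's byte offset
; inside dst.
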
; ff_h264_idct_add16_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_mmx, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx(uint8_t *dst, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_mmx, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    add r6, r0
    add word [r2], 32
    IDCT8_ADD_MMX_START r2 , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END r6 , rsp, r3
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6+4]
    IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    ADD rsp, pad
    RET

; ff_h264_idct_add16_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_mmx2, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
.no_dc
    mov r6d, dword [r1+r5*4]
    add r6, r0
    IDCT4_ADD r6, r2, r3
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx(uint8_t *dst, const int *block_offset,
;                             DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_mmx, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    or r6w, word [r2]
    test r6, r6
    jz .skipblock
    mov r6d, dword [r1+r5*4]
    add r6, r0
    IDCT4_ADD r6, r2, r3
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct_add16intra_mmx2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_mmx2, 5, 7, 0
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .try_dc
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET
.try_dc
    movsx r6, word [r2]
    test r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    add dst_reg, r0
    DC_ADD_MMX2_OP movh, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
.skipblock
    inc r5
    add r2, 32
    cmp r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_mmx2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_mmx2, 5, 7, 0
    %assign pad 128+4-(stack_offset&7)
    SUB rsp, pad

    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    lea dst_reg, [r0+dst_reg]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    ADD rsp, pad
    RET
.no_dc
    mov r6d, dword [r1+r5*4]
    add r6, r0
    add word [r2], 32
    IDCT8_ADD_MMX_START r2 , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END r6 , rsp, r3
    mov r6d, dword [r1+r5*4]
    lea r6, [r0+r6+4]
    IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    ADD rsp, pad
    RET

INIT_XMM
; ff_h264_idct8_add4_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct8_add4_8_sse2, 5, 7, 10
    xor r5, r5
%ifdef PIC
    lea r11, [scan8_mem]
%endif
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .skipblock
    cmp r6, 1
    jnz .no_dc
    movsx r6, word [r2]
    test r6, r6
    jz .no_dc
INIT_MMX
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
%define dst_reg  r10
%define dst_regd r10d
%else
%define dst_reg  r1
%define dst_regd r1d
%endif
    mov dst_regd, dword [r1+r5*4]
    add dst_reg, r0
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
    lea dst_reg, [dst_reg+r3*4]
    DC_ADD_MMX2_OP mova, dst_reg, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    REP_RET
.no_dc
INIT_XMM
    mov dst_regd, dword [r1+r5*4]
    add dst_reg, r0
    IDCT8_ADD_SSE dst_reg, r2, r3, r6
%ifndef ARCH_X86_64
    mov r1, r1m
%endif
.skipblock
    add r5, 4
    add r2, 128
    cmp r5, 16
    jl .nextblock
    REP_RET

INIT_MMX
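; For the chroma add8 functions dst is an array of plane pointers
; (uint8_t **dest); the per-plane helpers fetch the current pointer from r10
; on x86-64 or reload it from the stack on x86-32 (see the XXX notes below).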
h264_idct_add8_mmx_plane:
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    or r6w, word [r2]
    test r6, r6
    jz .skipblock
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
.skipblock
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx(uint8_t **dest, const int *block_offset,
;                       DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_mmx, 5, 7, 0
    mov r5, 16
    add r2, 512
%ifdef PIC
    lea r11, [scan8_mem]
%endif
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    call h264_idct_add8_mmx_plane
    mov r5, 32
    add r2, 384
%ifdef ARCH_X86_64
    add r10, gprsize
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmx2_plane:
.nextblock
    movzx r6, byte [scan8+r5]
    movzx r6, byte [r4+r6]
    test r6, r6
    jz .try_dc
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret
.try_dc
    movsx r6, word [r2]
    test r6, r6
    jz .skipblock
    DC_ADD_MMX2_INIT r2, r3, r6
%ifdef ARCH_X86_64
    mov r0d, dword [r1+r5*4]
    add r0, [r10]
%else
    mov r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov r0, [r0]
    add r0, dword [r1+r5*4]
%endif
    DC_ADD_MMX2_OP movh, r0, r3, r6
.skipblock
    inc r5
    add r2, 32
    test r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_mmx2, 5, 7, 0
    mov r5, 16
    add r2, 512
%ifdef ARCH_X86_64
    mov r10, r0
%endif
%ifdef PIC
    lea r11, [scan8_mem]
%endif
    call h264_idct_add8_mmx2_plane
    mov r5, 32
    add r2, 384
%ifdef ARCH_X86_64
    add r10, gprsize
%else
    add r0mp, gprsize
%endif
    call h264_idct_add8_mmx2_plane
    RET

INIT_MMX
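; Adds the DCs of two horizontally adjacent 4x4 blocks (at r2 and r2+32) to
; an 8x4 region of dst in a single pass.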
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmx2:
    movd m0, [r2 ]        ;  0 0 X D
    punpcklwd m0, [r2+32] ;  x X d D
    paddsw m0, [pw_32]
    psraw m0, 6
    punpcklwd m0, m0      ;  d d D D
    pxor m1, m1           ;  0 0 0 0
    psubw m1, m0          ; -d-d-D-D
    packuswb m0, m1       ; -d-d-D-D d d D D
    pshufw m1, m0, 0xFA   ; -d-d-d-d-D-D-D-D
    punpcklwd m0, m0      ;  d d d d D D D D
    lea r6, [r3*3]
    DC_ADD_MMX2_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM
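; Transforms two adjacent 4x4 blocks at once: the rows of the block at r2 go
; into the low halves of the xmm registers and the rows of the block at
; r2+32 into the high halves, so each pass stores an 8x4 strip.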
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
x264_add8x4_idct_sse2:
    movq m0, [r2+ 0]
    movq m1, [r2+ 8]
    movq m2, [r2+16]
    movq m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor m7, m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

%macro add16_sse2_cycle 2
    movzx r0, word [r4+%2]
    test r0, r0
    jz .cycle%1end
    mov r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add r0, r10
%else
    add r0, r0m
%endif
    call x264_add8x4_idct_sse2
.cycle%1end
%if %1 < 7
    add r2, 64
%endif
%endmacro
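
; Each add16_sse2_cycle handles a pair of horizontally adjacent 4x4 blocks
; with one 8x4 call; the word load from nnzc tests both blocks' non-zero
; flags at once, and the immediate offsets are scan8[] positions.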
; ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset,
;                         DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16_8_sse2, 5, 5, 8
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    ; unrolling of the loop leads to an average performance gain of 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx r0, word [r4+%2]
    test r0, r0
    jz .try%1dc
    mov r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add r0, r10
%else
    add r0, r0m
%endif
    call x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx r0, word [r2 ]
    or r0w, word [r2+32]
    jz .cycle%1end
    mov r0d, dword [r1+%1*8]
%ifdef ARCH_X86_64
    add r0, r10
%else
    add r0, r0m
%endif
    call h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 < 7
    add r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset,
;                              DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add16intra_8_sse2, 5, 7, 8
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx r0, word [r4+%2]
    test r0, r0
    jz .try%1dc
%ifdef ARCH_X86_64
    mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add r0, [r10]
%else
    mov r0, r0m
    mov r0, [r0]
    add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call x264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc
    movsx r0, word [r2 ]
    or r0w, word [r2+32]
    jz .cycle%1end
%ifdef ARCH_X86_64
    mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add r0, [r10]
%else
    mov r0, r0m
    mov r0, [r0]
    add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmx2
.cycle%1end
%if %1 == 1
    add r2, 384+64
%elif %1 < 3
    add r2, 64
%endif
%endmacro

; ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset,
;                        DCTELEM *block, int stride, const uint8_t nnzc[6*8])
cglobal h264_idct_add8_8_sse2, 5, 7, 8
    add r2, 512
%ifdef ARCH_X86_64
    mov r10, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%ifdef ARCH_X86_64
    add r10, gprsize
%else
    add r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(DCTELEM *output, DCTELEM *input, int qmul)
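
; The 4x4 luma DC block gets an inverse Walsh-Hadamard transform (two
; WALSH4_1D passes around a transpose); each coefficient is then scaled by
; qmul with rounding and written back to the DC slot of its 4x4 block (one
; int16 every 32 bytes). qmul values above 32767 take the .big_qmul path,
; which pre-shifts qmul so it still fits the signed 16-bit pmaddwd
; multiplier and lowers the final shift to match.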
%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
    mova m7, [pw_1]
    mova m4, %1
    punpcklwd %1, m7
    punpckhwd m4, m7
    mova m5, %2
    punpcklwd %2, m7
    punpckhwd m5, m7
    movd m7, t3d
    punpckldq m7, m7
    pmaddwd %1, m7
    pmaddwd %2, m7
    pmaddwd m4, m7
    pmaddwd m5, m7
    psrad %1, %3
    psrad %2, %3
    psrad m4, %3
    psrad m5, %3
    packssdw %1, m4
    packssdw %2, m5
%endmacro

%macro STORE_WORDS_MMX 5
    movd t0d, %1
    psrlq %1, 32
    movd t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr t0d, 16
    shr t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endmacro

%macro DEQUANT_STORE_MMX 1
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS_MMX m0, 0, 1, 4, 5
    STORE_WORDS_MMX m1, 2, 3, 6, 7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS_MMX m2, 8, 9, 12, 13
    STORE_WORDS_MMX m3, 10, 11, 14, 15
%endmacro

%macro STORE_WORDS_SSE 9
    movd t0d, %1
    psrldq %1, 4
    movd t1d, %1
    psrldq %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr t0d, 16
    shr t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd t0d, %1
    psrldq %1, 4
    movd t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr t0d, 16
    shr t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%endmacro

%macro DEQUANT_STORE_SSE2 1
    movd xmm4, t3d
    movq xmm5, [pw_1]
    pshufd xmm4, xmm4, 0
    movq2dq xmm0, m0
    movq2dq xmm1, m1
    movq2dq xmm2, m2
    movq2dq xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd xmm0, xmm4
    pmaddwd xmm1, xmm4
    pmaddwd xmm2, xmm4
    pmaddwd xmm3, xmm4
    psrad xmm0, %1
    psrad xmm1, %1
    psrad xmm2, %1
    psrad xmm3, %1
    packssdw xmm0, xmm1
    packssdw xmm2, xmm3
    STORE_WORDS_SSE xmm0, 0, 1, 4, 5, 2, 3, 6, 7
    STORE_WORDS_SSE xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%endmacro

%macro IDCT_DC_DEQUANT 2
cglobal h264_luma_dc_dequant_idct_%1, 3, 4, %2
    movq m3, [r1+24]
    movq m2, [r1+16]
    movq m1, [r1+ 8]
    movq m0, [r1+ 0]
    WALSH4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D 0,1,2,3,4

; shift, tmp, output, qmul
%ifdef WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg r0, t2
%elifdef ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp t3d, 32767
    jg .big_qmul
    add t3d, 128 << 16
%ifidn %1,mmx
    DEQUANT_STORE_MMX 8
%else
    DEQUANT_STORE_SSE2 8
%endif
    RET
.big_qmul:
    bsr t0d, t3d
    add t3d, 128 << 16
    mov t1d, 7
    cmp t0d, t1d
    cmovg t0d, t1d
    inc t1d
    shr t3d, t0b
    sub t1d, t0d
%ifidn %1,mmx
    movd m6, t1d
    DEQUANT_STORE_MMX m6
%else
    movd xmm6, t1d
    DEQUANT_STORE_SSE2 xmm6
%endif
    RET
%endmacro

INIT_MMX
IDCT_DC_DEQUANT mmx, 0
IDCT_DC_DEQUANT sse2, 7