;******************************************************************************
;* MMX/SSSE3-optimized functions for H.264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
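
; Rounding-bias tables for the RV40 chroma MC variants. Each group of four
; identical words is one bias value; the entry is selected by the same
; ((my & ~1) * 4 + mx) >> 1 index that the rnd_bias computation below derives
; from the mx/my arguments.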
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text
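
; Copy (or, in the avg variants, average via CHROMAMC_AVG) an 8-pixel-wide
; block when mx == my == 0, i.e. no interpolation is needed. Four rows are
; handled per iteration; r0 = dst, r1 = src, r2 = stride, r3d = height.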
%macro mv0_pixels_mc8 0
    lea r4, [r2*2]
.next4rows:
    movq mm0, [r1]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0], mm0
    movq [r0+r2], mm1
    add r0, r4
    movq mm0, [r1]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0], mm0
    movq [r0+r2], mm1
    add r0, r4
    sub r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40

; void ff_put/avg_h264_chroma_mc8_*(uint8_t *dst /* align 8 */,
;                                   uint8_t *src /* align 1 */,
;                                   ptrdiff_t stride, int h, int mx, int my)
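;
; Rough scalar sketch of the bilinear filter the 2D path below implements,
; for illustration only (rnd is 32 for H.264, 28 for VC-1, and table-driven
; for RV40; the avg variants additionally average with the existing dst).
; A/B/C/D match the register comments in the code:
;
;     A = (8 - mx) * (8 - my);  B = mx * (8 - my);
;     C = (8 - mx) * my;        D = mx * my;
;     for (i = 0; i < h; i++, dst += stride, src += stride)
;         for (j = 0; j < 8; j++)
;             dst[j] = (A * src[j]          + B * src[j + 1] +
;                       C * src[j + stride] + D * src[j + stride + 1] + rnd) >> 6;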
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov r7, r5
    and r7, 6 ; &~1 for mx/my=[0,7]
    lea r7, [r7*4+r4]
    sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov r0, r5
    and r0, 6 ; &~1 for mx/my=[0,7]
    lea r0, [r0*4+r4]
    sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif

    test r5d, r5d
    mov r6, 1
    je .my_is_zero
    test r4d, r4d
    mov r6, r2 ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or r4d, r5d ; x + y
%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif
    movd m5, r4d
    movq m4, [pw_8]
    movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd m5, m5
    punpckldq m5, m5 ; mm5 = B = x
    pxor m7, m7
    psubw m4, m5 ; mm4 = A = 8-x
.next1drow:
    movq m0, [r1] ; mm0 = src[0..7]
    movq m2, [r1+r6] ; mm2 = src[1..8]
    movq m1, m0
    movq m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
    pmullw m1, m4
    pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
    pmullw m3, m5
    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 3
    psrlw m1, 3
    packuswb m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
    add dest_reg, r2
    add r1, r2
    dec r3d
    jne .next1drow
    REP_RET

.both_non_zero: ; general case, bilinear
    movd m4, r4d ; x
    movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif
    mov r6, rsp ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
    sub rsp, 16 ; AA and DD

    punpcklwd m4, m4
    punpcklwd m6, m6
    punpckldq m4, m4 ; mm4 = x words
    punpckldq m6, m6 ; mm6 = y words
    movq m5, m4
    pmullw m4, m6 ; mm4 = x * y
    psllw m5, 3
    psllw m6, 3
    movq m7, m5
    paddw m7, m6
    movq [rsp+8], m4 ; DD = x * y
    psubw m5, m4 ; mm5 = B = 8x - xy
    psubw m6, m4 ; mm6 = C = 8y - xy
    paddw m4, [pw_64]
    psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
    pxor m7, m7
    movq [rsp], m4

    movq m0, [r1] ; mm0 = src[0..7]
    movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
    add r1, r2
    movq m2, m0
    movq m3, m1
    punpckhbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, [rsp]
    pmullw m2, [rsp]
    pmullw m1, m5
    pmullw m3, m5
    paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
    paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]

    movq m0, [r1]
    movq m1, m0
    punpcklbw m0, m7
    punpckhbw m1, m7
    pmullw m0, m6
    pmullw m1, m6
    paddw m2, m0
    paddw m3, m1 ; [mm2,mm3] += C * src[0..7]

    movq m1, [r1+1]
    movq m0, m1
    movq m4, m1
    punpcklbw m0, m7
    punpckhbw m4, m7
    pmullw m0, [rsp+8]
    pmullw m4, [rsp+8]
    paddw m2, m0
    paddw m3, m4 ; [mm2,mm3] += D * src[1..8]

    movq m0, [r1]
    paddw m2, [rnd_2d_%2+rnd_bias*8]
    paddw m3, [rnd_2d_%2+rnd_bias*8]
    psrlw m2, 6
    psrlw m3, 6
    packuswb m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
    add dest_reg, r2
    dec r3d
    jne .next2drow
    mov rsp, r6 ; restore stack pointer
    RET
%endmacro

%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40

cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
    pxor m7, m7
    movd m2, r4d ; x
    movd m3, r5d ; y
    movq m4, [pw_8]
    movq m5, [pw_8]
    punpcklwd m2, m2
    punpcklwd m3, m3
    punpcklwd m2, m2
    punpcklwd m3, m3
    psubw m4, m2
    psubw m5, m3
%ifidn %2, rv40
%ifdef PIC
    lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and r5, 6 ; &~1 for mx/my=[0,7]
    lea r5, [r5*4+r4]
    sar r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd m0, [r1]
    movd m6, [r1+1]
    add r1, r2
    punpcklbw m0, m7
    punpcklbw m6, m7
    pmullw m0, m4
    pmullw m6, m2
    paddw m6, m0
.next2rows:
    movd m0, [r1]
    movd m1, [r1+1]
    add r1, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw m0, m4
    pmullw m1, m2
    paddw m1, m0
    movq m0, m1
    pmullw m6, m5
    pmullw m1, m3
    paddw m6, [rnd_2d_%2+rnd_bias*8]
    paddw m1, m6
    psrlw m1, 6
    packuswb m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd [r0], m1
    add r0, r2

    movd m6, [r1]
    movd m1, [r1+1]
    add r1, r2
    punpcklbw m6, m7
    punpcklbw m1, m7
    pmullw m6, m4
    pmullw m1, m2
    paddw m1, m6
    movq m6, m1
    pmullw m0, m5
    pmullw m1, m3
    paddw m0, [rnd_2d_%2+rnd_bias*8]
    paddw m1, m0
    psrlw m1, 6
    packuswb m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd [r0], m1
    add r0, r2
    sub r3d, 2
    jnz .next2rows
    REP_RET
%endmacro
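
; The 2-pixel-wide variant below keeps the weights packed as words,
; {A,B,A,B} in mm5 and {C,D,C,D} in mm6, so a single pmaddwd yields
; A*src[i] + B*src[i+1] for each of the two output pixels of a row.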
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
    mov r6d, r4d
    shl r4d, 16
    sub r4d, r6d
    add r4d, 8
    imul r5d, r4d ; x*y<<16 | y*(8-x)
    shl r4d, 3
    sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)

    movd m5, r4d
    movd m6, r5d
    punpckldq m5, m5 ; mm5 = {A,B,A,B}
    punpckldq m6, m6 ; mm6 = {C,D,C,D}
    pxor m7, m7
    movd m2, [r1]
    punpcklbw m2, m7
    pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]
.nextrow:
    add r1, r2
    movq m1, m2
    pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
    movd m0, [r1]
    punpcklbw m0, m7
    pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
    movq m2, m0
    pmaddwd m0, m6
    paddw m1, [rnd_2d_%2]
    paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw m1, 6
    packssdw m1, m7
    packuswb m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd r5d, m1
    mov [r0], r5w
    add r0, r2
    sub r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28
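
; H.264 rounds with +4 (>>3) in the 1D case and +32 (>>6) in the 2D case;
; the VC-1 "_nornd" variants use the slightly smaller biases 3 and 28.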

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB %1, %2
%endmacro
%macro COPY_AVG 3
    movd %2, %3
    PAVGB %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
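
; The SSSE3 mc8 variant below interleaves src[i] with src[i+1] bytes and packs
; the filter weights as byte pairs (A|B for the current row in m7, C|D for the
; next row in m6), so a single pmaddubsw per input row does the horizontal
; multiply-accumulate; each loop iteration produces two output rows.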
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test r5d, r5d
    je .my_is_zero
    test r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov r6d, r4d
    shl r4d, 8
    sub r4, r6
    mov r6, 8
    add r4, 8 ; x*255+8 = x<<8 | (8-x)
    sub r6d, r5d
    imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd m7, r6d
    movd m6, r4d
    movdqa m5, [rnd_2d_%2]
    movq m0, [r1]
    movq m1, [r1+1]
    pshuflw m7, m7, 0
    pshuflw m6, m6, 0
    punpcklbw m0, m1
    movlhps m7, m7
    movlhps m6, m6
.next2rows:
    movq m1, [r1+r2*1]
    movq m2, [r1+r2*1+1]
    movq m3, [r1+r2*2]
    movq m4, [r1+r2*2+1]
    lea r1, [r1+r2*2]
    punpcklbw m1, m2
    movdqa m2, m1
    punpcklbw m3, m4
    movdqa m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw m0, m5
    paddw m2, m5
    paddw m1, m0
    paddw m3, m2
    psrlw m1, 6
    movdqa m0, m4
    psrlw m3, 6
%ifidn %1, avg
    movq m2, [r0]
    movhps m2, [r0+r2]
%endif
    packuswb m1, m3
    CHROMAMC_AVG m1, m2
    movq [r0], m1
    movhps [r0+r2], m1
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov r5d, r4d
    shl r4d, 8
    add r4, 8
    sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
    movd m7, r4d
    movdqa m6, [rnd_1d_%2]
    pshuflw m7, m7, 0
    movlhps m7, m7
.next2xrows:
    movq m0, [r1]
    movq m1, [r1+1]
    movq m2, [r1+r2]
    movq m3, [r1+r2+1]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq m4, [r0]
    movhps m4, [r0+r2]
%endif
    paddw m0, m6
    paddw m2, m6
    psrlw m0, 3
    psrlw m2, 3
    packuswb m0, m2
    CHROMAMC_AVG m0, m4
    movq [r0], m0
    movhps [r0+r2], m0
    sub r3d, 2
    lea r0, [r0+r2*2]
    lea r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov r4d, r5d
    shl r5d, 8
    add r5, 8
    sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
    movd m7, r5d
    movdqa m6, [rnd_1d_%2]
    pshuflw m7, m7, 0
    movlhps m7, m7
.next2yrows:
    movq m0, [r1]
    movq m1, [r1+r2]
    movdqa m2, m1
    movq m3, [r1+r2*2]
    lea r1, [r1+r2*2]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq m4, [r0]
    movhps m4, [r0+r2]
%endif
    paddw m0, m6
    paddw m2, m6
    psrlw m0, 3
    psrlw m2, 3
    packuswb m0, m2
    CHROMAMC_AVG m0, m4
    movq [r0], m0
    movhps [r0+r2], m0
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
    mov r6, r4
    shl r4d, 8
    sub r4d, r6d
    mov r6, 8
    add r4d, 8 ; x*255+8
    sub r6d, r5d
    imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)

    movd m7, r6d
    movd m6, r4d
    movq m5, [pw_32]
    movd m0, [r1]
    pshufw m7, m7, 0
    punpcklbw m0, [r1+1]
    pshufw m6, m6, 0
.next2rows:
    movd m1, [r1+r2*1]
    movd m3, [r1+r2*2]
    punpcklbw m1, [r1+r2*1+1]
    punpcklbw m3, [r1+r2*2+1]
    lea r1, [r1+r2*2]
    movq m2, m1
    movq m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw m0, m5
    paddw m2, m5
    paddw m1, m0
    paddw m3, m2
    psrlw m1, 6
    movq m0, m4
    psrlw m3, 6
    packuswb m1, m1
    packuswb m3, m3
    CHROMAMC_AVG m1, [r0]
    CHROMAMC_AVG m3, [r0+r2]
    movd [r0], m1
    movd [r0+r2], m3
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264