;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;*               2005-2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
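
; Rounding constants for RV40 chroma MC. Each entry is one value replicated
; into 4 words so it can be loaded directly as a packed-word operand; the
; entry is selected from (mx, my) via the rnd_bias computation further down
; (((my & ~1) * 4 + mx) >> 1).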
rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
cextern pw_28
cextern pw_32
cextern pw_64

SECTION .text
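
; Copy (or average, when CHROMAMC_AVG is defined to do so) an 8-pixel-wide
; block for the mx == my == 0 case, where no interpolation is needed.
; Processes four rows per loop iteration.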
%macro mv0_pixels_mc8 0
    lea r4, [r2*2]
.next4rows:
    movq mm0, [r1]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0], mm0
    movq [r0+r2], mm1
    add r0, r4
    movq mm0, [r1]
    movq mm1, [r1+r2]
    add r1, r4
    CHROMAMC_AVG mm0, [r0]
    CHROMAMC_AVG mm1, [r0+r2]
    movq [r0], mm0
    movq [r0+r2], mm1
    add r0, r4
    sub r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                           int stride, int h, int mx, int my)
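;
; The bilinear interpolation computed by the 2-D path below is, per pixel:
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + rnd) >> 6
; with A = (8-mx)*(8-my), B = mx*(8-my), C = (8-mx)*my, D = mx*my, matching
; the A/B/C/D register comments below; rnd is the per-codec rounding constant
; (rnd_2d_%2), biased per-position for RV40.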
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET
.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov r7, r5
    and r7, 6 ; &~1 for mx/my=[0,7]
    lea r7, [r7*4+r4]
    sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov r0, r5
    and r0, 6 ; &~1 for mx/my=[0,7]
    lea r0, [r0*4+r4]
    sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif
    test r5d, r5d
    mov r6, 1
    je .my_is_zero
    test r4d, r4d
    mov r6, r2 ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or r4d, r5d ; x + y
%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif
    movd m5, r4d
    movq m4, [pw_8]
    movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd m5, m5
    punpckldq m5, m5 ; mm5 = B = x
    pxor m7, m7
    psubw m4, m5 ; mm4 = A = 8-x
.next1drow:
    movq m0, [r1] ; mm0 = src[0..7]
    movq m2, [r1+r6] ; mm2 = src[1..8]
    movq m1, m0
    movq m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
    pmullw m1, m4
    pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
    pmullw m3, m5
    paddw m0, m6
    paddw m1, m6
    paddw m0, m2
    paddw m1, m3
    psrlw m0, 3
    psrlw m1, 3
    packuswb m0, m1
    CHROMAMC_AVG m0, [dest_reg]
    movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
    add dest_reg, r2
    add r1, r2
    dec r3d
    jne .next1drow
    REP_RET
.both_non_zero: ; general case, bilinear
    movd m4, r4d ; x
    movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
    lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov r5, r0m
%endif
%endif
    mov r6, rsp ; backup stack pointer
    and rsp, ~(mmsize-1) ; align stack
    sub rsp, 16 ; AA and DD
    punpcklwd m4, m4
    punpcklwd m6, m6
    punpckldq m4, m4 ; mm4 = x words
    punpckldq m6, m6 ; mm6 = y words
    movq m5, m4
    pmullw m4, m6 ; mm4 = x * y
    psllw m5, 3
    psllw m6, 3
    movq m7, m5
    paddw m7, m6
    movq [rsp+8], m4 ; DD = x * y
    psubw m5, m4 ; mm5 = B = 8x - xy
    psubw m6, m4 ; mm6 = C = 8y - xy
    paddw m4, [pw_64]
    psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
    pxor m7, m7
    movq [rsp], m4
    movq m0, [r1] ; mm0 = src[0..7]
    movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
    add r1, r2
    movq m2, m0
    movq m3, m1
    punpckhbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    pmullw m0, [rsp]
    pmullw m2, [rsp]
    pmullw m1, m5
    pmullw m3, m5
    paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
    paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
    movq m0, [r1]
    movq m1, m0
    punpcklbw m0, m7
    punpckhbw m1, m7
    pmullw m0, m6
    pmullw m1, m6
    paddw m2, m0
    paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
    movq m1, [r1+1]
    movq m0, m1
    movq m4, m1
    punpcklbw m0, m7
    punpckhbw m4, m7
    pmullw m0, [rsp+8]
    pmullw m4, [rsp+8]
    paddw m2, m0
    paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
    movq m0, [r1]
    paddw m2, [rnd_2d_%2+rnd_bias*8]
    paddw m3, [rnd_2d_%2+rnd_bias*8]
    psrlw m2, 6
    psrlw m3, 6
    packuswb m2, m3
    CHROMAMC_AVG m2, [dest_reg]
    movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
    add dest_reg, r2
    dec r3d
    jne .next2drow
    mov rsp, r6 ; restore stack pointer
    RET
%endmacro
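
; 4-pixel-wide variant of the above. The horizontal (A/B) pass for the first
; source row is done once before the loop; each .next2rows iteration then
; produces two output rows, reusing the previously filtered source row for
; the vertical pass.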
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    pxor m7, m7
    movd m2, r4d ; x
    movd m3, r5d ; y
    movq m4, [pw_8]
    movq m5, [pw_8]
    punpcklwd m2, m2
    punpcklwd m3, m3
    punpcklwd m2, m2
    punpcklwd m3, m3
    psubw m4, m2
    psubw m5, m3
%ifidn %2, rv40
%ifdef PIC
    lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and r5, 6 ; &~1 for mx/my=[0,7]
    lea r5, [r5*4+r4]
    sar r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif
    movd m0, [r1]
    movd m6, [r1+1]
    add r1, r2
    punpcklbw m0, m7
    punpcklbw m6, m7
    pmullw m0, m4
    pmullw m6, m2
    paddw m6, m0
.next2rows:
    movd m0, [r1]
    movd m1, [r1+1]
    add r1, r2
    punpcklbw m0, m7
    punpcklbw m1, m7
    pmullw m0, m4
    pmullw m1, m2
    paddw m1, m0
    movq m0, m1
    pmullw m6, m5
    pmullw m1, m3
    paddw m6, [rnd_2d_%2+rnd_bias*8]
    paddw m1, m6
    psrlw m1, 6
    packuswb m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd [r0], m1
    add r0, r2
    movd m6, [r1]
    movd m1, [r1+1]
    add r1, r2
    punpcklbw m6, m7
    punpcklbw m1, m7
    pmullw m6, m4
    pmullw m1, m2
    paddw m1, m6
    movq m6, m1
    pmullw m0, m5
    pmullw m1, m3
    paddw m0, [rnd_2d_%2+rnd_bias*8]
    paddw m1, m0
    psrlw m1, 6
    packuswb m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd [r0], m1
    add r0, r2
    sub r3d, 2
    jnz .next2rows
    REP_RET
%endmacro
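
; 2-pixel-wide variant. The four weights are packed as word pairs
; ({A,B} in mm5, {C,D} in mm6), so each output row needs only two pmaddwd
; multiply-accumulates; pshufw requires MMXEXT, hence the INIT_MMX mmxext
; instantiations below.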
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6d, r4d
    shl r4d, 16
    sub r4d, r6d
    add r4d, 8
    imul r5d, r4d ; x*y<<16 | y*(8-x)
    shl r4d, 3
    sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
    movd m5, r4d
    movd m6, r5d
    punpckldq m5, m5 ; mm5 = {A,B,A,B}
    punpckldq m6, m6 ; mm6 = {C,D,C,D}
    pxor m7, m7
    movd m2, [r1]
    punpcklbw m2, m7
    pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]
.nextrow:
    add r1, r2
    movq m1, m2
    pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
    movd m0, [r1]
    punpcklbw m0, m7
    pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
    movq m2, m0
    pmaddwd m0, m6
    paddw m1, [rnd_2d_%2]
    paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw m1, 6
    packssdw m1, m7
    packuswb m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd r5d, m1
    mov [r0], r5w
    add r0, r2
    sub r3d, 1
    jnz .nextrow
    REP_RET
%endmacro
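
; Per-codec rounding constants: H.264 uses 4 (1-D) and 32 (2-D), VC-1 uses
; the "no rounding" values 3 and 28, and RV40 uses the position-dependent
; tables defined in the data section above.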
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB %1, %2
%endmacro
%macro COPY_AVG 3
    movd %2, %3
    PAVGB %1, %2
%endmacro

INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
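
; SSSE3 versions: horizontally adjacent source bytes are interleaved with
; punpcklbw so that a single pmaddubsw applies both packed byte weights for
; a source row ({A,B} for the current row, {C,D} for the row below); two
; output rows are produced per loop iteration.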
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6d, r5d
    or r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET
.at_least_one_non_zero:
    test r5d, r5d
    je .my_is_zero
    test r4d, r4d
    je .mx_is_zero
    ; general case, bilinear
    mov r6d, r4d
    shl r4d, 8
    sub r4, r6
    mov r6, 8
    add r4, 8 ; x*255+8 = x<<8 | (8-x)
    sub r6d, r5d
    imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
    movd m7, r6d
    movd m6, r4d
    movdqa m5, [rnd_2d_%2]
    movq m0, [r1]
    movq m1, [r1+1]
    pshuflw m7, m7, 0
    pshuflw m6, m6, 0
    punpcklbw m0, m1
    movlhps m7, m7
    movlhps m6, m6
.next2rows:
    movq m1, [r1+r2*1]
    movq m2, [r1+r2*1+1]
    movq m3, [r1+r2*2]
    movq m4, [r1+r2*2+1]
    lea r1, [r1+r2*2]
    punpcklbw m1, m2
    movdqa m2, m1
    punpcklbw m3, m4
    movdqa m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw m0, m5
    paddw m2, m5
    paddw m1, m0
    paddw m3, m2
    psrlw m1, 6
    movdqa m0, m4
    psrlw m3, 6
%ifidn %1, avg
    movq m2, [r0]
    movhps m2, [r0+r2]
%endif
    packuswb m1, m3
    CHROMAMC_AVG m1, m2
    movq [r0], m1
    movhps [r0+r2], m1
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2rows
    REP_RET
.my_is_zero:
    mov r5d, r4d
    shl r4d, 8
    add r4, 8
    sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
    movd m7, r4d
    movdqa m6, [rnd_1d_%2]
    pshuflw m7, m7, 0
    movlhps m7, m7
.next2xrows:
    movq m0, [r1]
    movq m1, [r1+1]
    movq m2, [r1+r2]
    movq m3, [r1+r2+1]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq m4, [r0]
    movhps m4, [r0+r2]
%endif
    paddw m0, m6
    paddw m2, m6
    psrlw m0, 3
    psrlw m2, 3
    packuswb m0, m2
    CHROMAMC_AVG m0, m4
    movq [r0], m0
    movhps [r0+r2], m0
    sub r3d, 2
    lea r0, [r0+r2*2]
    lea r1, [r1+r2*2]
    jg .next2xrows
    REP_RET
.mx_is_zero:
    mov r4d, r5d
    shl r5d, 8
    add r5, 8
    sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
    movd m7, r5d
    movdqa m6, [rnd_1d_%2]
    pshuflw m7, m7, 0
    movlhps m7, m7
.next2yrows:
    movq m0, [r1]
    movq m1, [r1+r2]
    movdqa m2, m1
    movq m3, [r1+r2*2]
    lea r1, [r1+r2*2]
    punpcklbw m0, m1
    punpcklbw m2, m3
    pmaddubsw m0, m7
    pmaddubsw m2, m7
%ifidn %1, avg
    movq m4, [r0]
    movhps m4, [r0+r2]
%endif
    paddw m0, m6
    paddw m2, m6
    psrlw m0, 3
    psrlw m2, 3
    packuswb m0, m2
    CHROMAMC_AVG m0, m4
    movq [r0], m0
    movhps [r0+r2], m0
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd r2, r2d
%endif
    mov r6, r4
    shl r4d, 8
    sub r4d, r6d
    mov r6, 8
    add r4d, 8 ; x*255+8 = x<<8 | (8-x)
    sub r6d, r5d
    imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
    movd m7, r6d
    movd m6, r4d
    movq m5, [pw_32]
    movd m0, [r1]
    pshufw m7, m7, 0
    punpcklbw m0, [r1+1]
    pshufw m6, m6, 0
.next2rows:
    movd m1, [r1+r2*1]
    movd m3, [r1+r2*2]
    punpcklbw m1, [r1+r2*1+1]
    punpcklbw m3, [r1+r2*2+1]
    lea r1, [r1+r2*2]
    movq m2, m1
    movq m4, m3
    pmaddubsw m0, m7
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    pmaddubsw m3, m6
    paddw m0, m5
    paddw m2, m5
    paddw m1, m0
    paddw m3, m2
    psrlw m1, 6
    movq m0, m4
    psrlw m3, 6
    packuswb m1, m1
    packuswb m3, m3
    CHROMAMC_AVG m1, [r0]
    CHROMAMC_AVG m3, [r0+r2]
    movd [r0], m1
    movd [r0+r2], m3
    sub r3d, 2
    lea r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264