;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"

SECTION_RODATA

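; pshufb control vector: every even output byte takes byte 3 of the
; source and every odd byte is zeroed (index 0x80), so the byte at src-1
; from a 4-byte load at src-4 is splatted into each 16-bit lane.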
tm_shuf: times 8 db 0x03, 0x80

SECTION .text

cextern pb_1
cextern pb_3

;-----------------------------------------------------------------------------
; void pred16x16_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
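; Replicates the 16-pixel row directly above the block into all 16 rows:
; two MMX stores (or one SSE store) per row, two rows per loop iteration.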
cglobal pred16x16_vertical_mmx, 2,3
    sub   r0, r1
    mov   r2, 8
    movq  mm0, [r0+0]
    movq  mm1, [r0+8]
.loop:
    movq  [r0+r1*1+0], mm0
    movq  [r0+r1*1+8], mm1
    movq  [r0+r1*2+0], mm0
    movq  [r0+r1*2+8], mm1
    lea   r0, [r0+r1*2]
    dec   r2
    jg    .loop
    REP_RET

cglobal pred16x16_vertical_sse, 2,3
    sub    r0, r1
    mov    r2, 4
    movaps xmm0, [r0]
.loop:
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea    r0, [r0+r1*2]
    movaps [r0+r1*1], xmm0
    movaps [r0+r1*2], xmm0
    lea    r0, [r0+r1*2]
    dec    r2
    jg     .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred16x16_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
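; Fills each row with the pixel immediately to its left. The left pixel
; is byte 3 of a 4-byte load at src-4, broadcast across the row with
; pshufb (ssse3), pshufw (mmxext), or a punpcklbw/punpckhwd/punpckhdq
; chain (mmx); the MMX versions store the broadcast twice per 16-byte row.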
%macro PRED16x16_H 1
cglobal pred16x16_horizontal_%1, 2,3
    mov       r2, 8
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
    mova      [r0+r1*0+8], m0
    mova      [r0+r1*1+8], m1
%endif
    mova      [r0+r1*0], m0
    mova      [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg        .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_H mmx
PRED16x16_H mmxext
INIT_XMM
PRED16x16_H ssse3

;-----------------------------------------------------------------------------
; void pred16x16_dc(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
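; DC = (sum of the 16 top + 16 left neighbours + 16) >> 5. psadbw against
; a zeroed register sums 8 bytes at a time for the top row; the left
; column is accumulated with scalar movzx/add in two interleaved chains.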
%macro PRED16x16_DC 1
cglobal pred16x16_dc_%1, 2,7
    mov        r4, r0
    sub        r0, r1
    pxor       mm0, mm0
    pxor       mm1, mm1
    psadbw     mm0, [r0+0]
    psadbw     mm1, [r0+8]
    dec        r0
    movzx      r5d, byte [r0+r1*1]
    paddw      mm0, mm1
    movd       r6d, mm0
    lea        r0, [r0+r1*2]
%rep 7
    movzx      r2d, byte [r0+r1*0]
    movzx      r3d, byte [r0+r1*1]
    add        r5d, r2d
    add        r6d, r3d
    lea        r0, [r0+r1*2]
%endrep
    movzx      r2d, byte [r0+r1*0]
    add        r5d, r6d
    lea        r2d, [r2+r5+16]
    shr        r2d, 5
%ifidn %1, mmxext
    movd       m0, r2d
    punpcklbw  m0, m0
    pshufw     m0, m0, 0
%elifidn %1, sse2
    movd       m0, r2d
    punpcklbw  m0, m0
    pshuflw    m0, m0, 0
    punpcklqdq m0, m0
%elifidn %1, ssse3
    pxor       m1, m1
    movd       m0, r2d
    pshufb     m0, m1
%endif

%if mmsize==8
    mov        r3d, 8
.loop:
    mova       [r4+r1*0+0], m0
    mova       [r4+r1*0+8], m0
    mova       [r4+r1*1+0], m0
    mova       [r4+r1*1+8], m0
%else
    mov        r3d, 4
.loop:
    mova       [r4+r1*0], m0
    mova       [r4+r1*1], m0
    lea        r4, [r4+r1*2]
    mova       [r4+r1*0], m0
    mova       [r4+r1*1], m0
%endif
    lea        r4, [r4+r1*2]
    dec        r3d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX
PRED16x16_DC mmxext
INIT_XMM
PRED16x16_DC sse2
PRED16x16_DC ssse3

;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
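; VP8 TrueMotion prediction: pred(x,y) = clip(top(x) + left(y) - topleft).
; The top row is unpacked to 16-bit words once outside the loop; per row,
; the scalar difference left(y) - topleft is broadcast and added, and
; packuswb performs the clip back to [0,255].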
%macro PRED16x16_TM_MMX 1
cglobal pred16x16_tm_vp8_%1, 2,5
    sub       r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0+0]
    movq      mm2, [r0+8]
    movq      mm1, mm0
    movq      mm3, mm2
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    punpcklbw mm2, mm7
    punpckhbw mm3, mm7
    movzx     r3d, byte [r0-1]
    mov       r4d, 16
.loop:
    movzx     r2d, byte [r0+r1-1]
    sub       r2d, r3d
    movd      mm4, r2d
%ifidn %1, mmx
    punpcklwd mm4, mm4
    punpckldq mm4, mm4
%else
    pshufw    mm4, mm4, 0
%endif
    movq      mm5, mm4
    movq      mm6, mm4
    movq      mm7, mm4
    paddw     mm4, mm0
    paddw     mm5, mm1
    paddw     mm6, mm2
    paddw     mm7, mm3
    packuswb  mm4, mm5
    packuswb  mm6, mm7
    movq      [r0+r1+0], mm4
    movq      [r0+r1+8], mm6
    add       r0, r1
    dec       r4d
    jg        .loop
    REP_RET
%endmacro

PRED16x16_TM_MMX mmx
PRED16x16_TM_MMX mmxext

cglobal pred16x16_tm_vp8_sse2, 2,6,6
    sub        r0, r1
    pxor       xmm2, xmm2
    movdqa     xmm0, [r0]
    movdqa     xmm1, xmm0
    punpcklbw  xmm0, xmm2
    punpckhbw  xmm1, xmm2
    movzx      r4d, byte [r0-1]
    mov        r5d, 8
.loop:
    movzx      r2d, byte [r0+r1*1-1]
    movzx      r3d, byte [r0+r1*2-1]
    sub        r2d, r4d
    sub        r3d, r4d
    movd       xmm2, r2d
    movd       xmm4, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm4, xmm4, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm4, xmm4
    movdqa     xmm3, xmm2
    movdqa     xmm5, xmm4
    paddw      xmm2, xmm0
    paddw      xmm3, xmm1
    paddw      xmm4, xmm0
    paddw      xmm5, xmm1
    packuswb   xmm2, xmm3
    packuswb   xmm4, xmm5
    movdqa     [r0+r1*1], xmm2
    movdqa     [r0+r1*2], xmm4
    lea        r0, [r0+r1*2]
    dec        r5d
    jg         .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_vertical(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
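; 8x8 version of vertical prediction: one 8-byte row, copied to 8 rows.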
cglobal pred8x8_vertical_mmx, 2,2
    sub   r0, r1
    movq  mm0, [r0]
%rep 3
    movq  [r0+r1*1], mm0
    movq  [r0+r1*2], mm0
    lea   r0, [r0+r1*2]
%endrep
    movq  [r0+r1*1], mm0
    movq  [r0+r1*2], mm0
    RET

;-----------------------------------------------------------------------------
; void pred8x8_horizontal(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
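; 8x8 version of horizontal prediction: same left-pixel broadcast as the
; 16x16 routine, but a single mova covers the whole row.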
%macro PRED8x8_H 1
cglobal pred8x8_horizontal_%1, 2,3
    mov       r2, 4
%ifidn %1, ssse3
    mova      m2, [pb_3]
%endif
.loop:
    movd      m0, [r0+r1*0-4]
    movd      m1, [r0+r1*1-4]
%ifidn %1, ssse3
    pshufb    m0, m2
    pshufb    m1, m2
%else
    punpcklbw m0, m0
    punpcklbw m1, m1
%ifidn %1, mmxext
    pshufw    m0, m0, 0xff
    pshufw    m1, m1, 0xff
%else
    punpckhwd m0, m0
    punpckhwd m1, m1
    punpckhdq m0, m0
    punpckhdq m1, m1
%endif
%endif
    mova      [r0+r1*0], m0
    mova      [r0+r1*1], m1
    lea       r0, [r0+r1*2]
    dec       r2
    jg        .loop
    REP_RET
%endmacro

INIT_MMX
PRED8x8_H mmx
PRED8x8_H mmxext
PRED8x8_H ssse3

;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
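; RV40-style DC prediction: one DC value for the whole 8x8 block,
; (sum of 8 top + 8 left neighbours + 8) >> 4, broadcast to all rows.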
cglobal pred8x8_dc_rv40_mmxext, 2,7
    mov       r4, r0
    sub       r0, r1
    pxor      mm0, mm0
    psadbw    mm0, [r0]
    dec       r0
    movzx     r5d, byte [r0+r1*1]
    movd      r6d, mm0
    lea       r0, [r0+r1*2]
%rep 3
    movzx     r2d, byte [r0+r1*0]
    movzx     r3d, byte [r0+r1*1]
    add       r5d, r2d
    add       r6d, r3d
    lea       r0, [r0+r1*2]
%endrep
    movzx     r2d, byte [r0+r1*0]
    add       r5d, r6d
    lea       r2d, [r2+r5+8]
    shr       r2d, 4
    movd      mm0, r2d
    punpcklbw mm0, mm0
    pshufw    mm0, mm0, 0
    mov       r3d, 4
.loop:
    movq      [r4+r1*0], mm0
    movq      [r4+r1*1], mm0
    lea       r4, [r4+r1*2]
    dec       r3d
    jg        .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
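; 8x8 TrueMotion, same scheme as the 16x16 version. The sse2 variant
; produces two rows per iteration in one register pair; the ssse3 variant
; uses tm_shuf to broadcast the left-edge bytes straight into words, so
; the left-topleft difference is formed with psubw instead of scalar code.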
%macro PRED8x8_TM_MMX 1
cglobal pred8x8_tm_vp8_%1, 2,6
    sub       r0, r1
    pxor      mm7, mm7
    movq      mm0, [r0]
    movq      mm1, mm0
    punpcklbw mm0, mm7
    punpckhbw mm1, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 4
.loop:
    movzx     r2d, byte [r0+r1*1-1]
    movzx     r3d, byte [r0+r1*2-1]
    sub       r2d, r4d
    sub       r3d, r4d
    movd      mm2, r2d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    movq      mm3, mm2
    movq      mm5, mm4
    paddw     mm2, mm0
    paddw     mm3, mm1
    paddw     mm4, mm0
    paddw     mm5, mm1
    packuswb  mm2, mm3
    packuswb  mm4, mm5
    movq      [r0+r1*1], mm2
    movq      [r0+r1*2], mm4
    lea       r0, [r0+r1*2]
    dec       r5d
    jg        .loop
    REP_RET
%endmacro

PRED8x8_TM_MMX mmx
PRED8x8_TM_MMX mmxext

cglobal pred8x8_tm_vp8_sse2, 2,6,4
    sub        r0, r1
    pxor       xmm1, xmm1
    movq       xmm0, [r0]
    punpcklbw  xmm0, xmm1
    movzx      r4d, byte [r0-1]
    mov        r5d, 4
.loop:
    movzx      r2d, byte [r0+r1*1-1]
    movzx      r3d, byte [r0+r1*2-1]
    sub        r2d, r4d
    sub        r3d, r4d
    movd       xmm2, r2d
    movd       xmm3, r3d
    pshuflw    xmm2, xmm2, 0
    pshuflw    xmm3, xmm3, 0
    punpcklqdq xmm2, xmm2
    punpcklqdq xmm3, xmm3
    paddw      xmm2, xmm0
    paddw      xmm3, xmm0
    packuswb   xmm2, xmm3
    movq       [r0+r1*1], xmm2
    movhps     [r0+r1*2], xmm2
    lea        r0, [r0+r1*2]
    dec        r5d
    jg         .loop
    REP_RET

cglobal pred8x8_tm_vp8_ssse3, 2,3,6
    sub       r0, r1
    movdqa    xmm4, [tm_shuf]
    pxor      xmm1, xmm1
    movq      xmm0, [r0]
    punpcklbw xmm0, xmm1
    movd      xmm5, [r0-4]
    pshufb    xmm5, xmm4
    mov       r2d, 4
.loop:
    movd      xmm2, [r0+r1*1-4]
    movd      xmm3, [r0+r1*2-4]
    pshufb    xmm2, xmm4
    pshufb    xmm3, xmm4
    psubw     xmm2, xmm5
    psubw     xmm3, xmm5
    paddw     xmm2, xmm0
    paddw     xmm3, xmm0
    packuswb  xmm2, xmm3
    movq      [r0+r1*1], xmm2
    movhps    [r0+r1*2], xmm2
    lea       r0, [r0+r1*2]
    dec       r2d
    jg        .loop
    REP_RET

;-----------------------------------------------------------------------------
; void pred4x4_dc_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
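; 4x4 DC: (4 top + 4 left neighbours + 4) >> 3; psadbw sums the 4 top
; bytes, imul by 0x01010101 splats the DC byte into a dword that is then
; written to each of the 4 rows with plain 32-bit stores.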
cglobal pred4x4_dc_mmxext, 3,5
    pxor   mm7, mm7
    mov    r4, r0
    sub    r0, r2
    movd   mm0, [r0]
    psadbw mm0, mm7
    movzx  r1d, byte [r0+r2*1-1]
    movd   r3d, mm0
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    lea    r0, [r0+r2*2]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*1-1]
    add    r3d, r1d
    movzx  r1d, byte [r0+r2*2-1]
    add    r3d, r1d
    add    r3d, 4
    shr    r3d, 3
    imul   r3d, 0x01010101
    mov    [r4+r2*0], r3d
    mov    [r0+r2*0], r3d
    mov    [r0+r2*1], r3d
    mov    [r0+r2*2], r3d
    RET

;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
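; 4x4 TrueMotion, same formula as the larger block sizes. The mmx/mmxext
; versions do two rows per loop iteration; the ssse3 version is fully
; unrolled and uses tm_shuf for the left-edge broadcasts.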
%macro PRED4x4_TM_MMX 1
cglobal pred4x4_tm_vp8_%1, 3,6
    sub       r0, r2
    pxor      mm7, mm7
    movd      mm0, [r0]
    punpcklbw mm0, mm7
    movzx     r4d, byte [r0-1]
    mov       r5d, 2
.loop:
    movzx     r1d, byte [r0+r2*1-1]
    movzx     r3d, byte [r0+r2*2-1]
    sub       r1d, r4d
    sub       r3d, r4d
    movd      mm2, r1d
    movd      mm4, r3d
%ifidn %1, mmx
    punpcklwd mm2, mm2
    punpcklwd mm4, mm4
    punpckldq mm2, mm2
    punpckldq mm4, mm4
%else
    pshufw    mm2, mm2, 0
    pshufw    mm4, mm4, 0
%endif
    paddw     mm2, mm0
    paddw     mm4, mm0
    packuswb  mm2, mm2
    packuswb  mm4, mm4
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm4
    lea       r0, [r0+r2*2]
    dec       r5d
    jg        .loop
    REP_RET
%endmacro

PRED4x4_TM_MMX mmx
PRED4x4_TM_MMX mmxext

cglobal pred4x4_tm_vp8_ssse3, 3,3
    sub       r0, r2
    movq      mm6, [tm_shuf]
    pxor      mm1, mm1
    movd      mm0, [r0]
    punpcklbw mm0, mm1
    movd      mm7, [r0-4]
    pshufb    mm7, mm6
    lea       r1, [r0+r2*2]
    movd      mm2, [r0+r2*1-4]
    movd      mm3, [r0+r2*2-4]
    movd      mm4, [r1+r2*1-4]
    movd      mm5, [r1+r2*2-4]
    pshufb    mm2, mm6
    pshufb    mm3, mm6
    pshufb    mm4, mm6
    pshufb    mm5, mm6
    psubw     mm2, mm7
    psubw     mm3, mm7
    psubw     mm4, mm7
    psubw     mm5, mm7
    paddw     mm2, mm0
    paddw     mm3, mm0
    paddw     mm4, mm0
    paddw     mm5, mm0
    packuswb  mm2, mm2
    packuswb  mm3, mm3
    packuswb  mm4, mm4
    packuswb  mm5, mm5
    movd      [r0+r2*1], mm2
    movd      [r0+r2*2], mm3
    movd      [r1+r2*1], mm4
    movd      [r1+r2*2], mm5
    RET

; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
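; Computed without widening to words: pavgb gives (l+r+1)>>1, subtracting
; ((l^r)&1) corrects that to the truncated (l+r)>>1, so the final pavgb
; with the centre tap yields exactly (l + 2*c + r + 2) >> 2.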
%macro PRED4x4_LOWPASS 5
    mova    %5, %2
    pavgb   %2, %3
    pxor    %3, %5
    mova    %1, %4
    pand    %3, [pb_1]
    psubusb %2, %3
    pavgb   %1, %2
%endmacro

;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
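; Lowpass-filters the row above the block (topleft and topright supply
; the outer taps) and replicates the filtered 4 pixels into all 4 rows.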
INIT_MMX
cglobal pred4x4_vertical_vp8_mmxext, 3,3
    sub       r0, r2
    movd      m1, [r0-1]
    movd      m0, [r0]
    mova      m2, m0          ;t0 t1 t2 t3
    punpckldq m0, [r1]        ;t0 t1 t2 t3 t4 t5 t6 t7
    lea       r1, [r0+r2*2]
    psrlq     m0, 8           ;t1 t2 t3 t4
    PRED4x4_LOWPASS m3, m1, m0, m2, m4
    movd      [r0+r2*1], m3
    movd      [r0+r2*2], m3
    movd      [r1+r2*1], m3
    movd      [r1+r2*2], m3
    RET