You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1032 lines
28KB

  1. ;******************************************************************************
  2. ;* VP8 MMXEXT optimizations
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "x86inc.asm"
  23. %include "x86util.asm"
  SECTION_RODATA

  ; ---------------------------------------------------------------------------
  ; Coefficient tables. Every row below is 16 bytes, so the code indexes them
  ; with multiples of 16 derived from mx/my.
  ; ---------------------------------------------------------------------------

  ; 4-tap filters as signed word pairs (multiplied with pmaddwd by the
  ; horizontal MMXEXT/SSE2 code); two rows per filter.
  fourtap_filter_hw_m: times 4 dw  -6, 123
                       times 4 dw  12,  -1
                       times 4 dw  -9,  93
                       times 4 dw  50,  -6
                       times 4 dw  -6,  50
                       times 4 dw  93,  -9
                       times 4 dw  -1,  12
                       times 4 dw 123,  -6

  ; 6-tap filters as signed word pairs; three rows per filter.
  sixtap_filter_hw_m:  times 4 dw   2, -11
                       times 4 dw 108,  36
                       times 4 dw  -8,   1
                       times 4 dw   3, -16
                       times 4 dw  77,  77
                       times 4 dw -16,   3
                       times 4 dw   1,  -8
                       times 4 dw  36, 108
                       times 4 dw -11,   2

  ; 4-tap filters as signed byte pairs (multiplied with pmaddubsw by the
  ; SSSE3 code); two rows per filter.
  fourtap_filter_hb_m: times 8 db  -6,  -1
                       times 8 db 123,  12
                       times 8 db  -9,  -6
                       times 8 db  93,  50
                       times 8 db  -6,  -9
                       times 8 db  50,  93
                       times 8 db  -1,  -6
                       times 8 db  12, 123

  ; 6-tap filters as signed byte pairs; three rows per filter.
  sixtap_filter_hb_m:  times 8 db   2,   1
                       times 8 db -11, 108
                       times 8 db  36,  -8
                       times 8 db   3,   3
                       times 8 db -16,  77
                       times 8 db  77, -16
                       times 8 db   1,   2
                       times 8 db  -8,  36
                       times 8 db 108, -11

  ; 4-tap filters with one tap broadcast per row (multiplied with pmullw by
  ; the vertical MMXEXT/SSE2 code); four rows per filter.
  fourtap_filter_v_m:  times 8 dw  -6
                       times 8 dw 123
                       times 8 dw  12
                       times 8 dw  -1
                       times 8 dw  -9
                       times 8 dw  93
                       times 8 dw  50
                       times 8 dw  -6
                       times 8 dw  -6
                       times 8 dw  50
                       times 8 dw  93
                       times 8 dw  -9
                       times 8 dw  -1
                       times 8 dw  12
                       times 8 dw 123
                       times 8 dw  -6

  ; 6-tap filters with one tap broadcast per row; six rows per filter.
  sixtap_filter_v_m:   times 8 dw   2
                       times 8 dw -11
                       times 8 dw 108
                       times 8 dw  36
                       times 8 dw  -8
                       times 8 dw   1
                       times 8 dw   3
                       times 8 dw -16
                       times 8 dw  77
                       times 8 dw  77
                       times 8 dw -16
                       times 8 dw   3
                       times 8 dw   1
                       times 8 dw  -8
                       times 8 dw  36
                       times 8 dw 108
                       times 8 dw -11
                       times 8 dw   2

  ; Bilinear weights 1..7 broadcast as words, one row per weight.
  bilinear_filter_vw_m: times 8 dw 1
                        times 8 dw 2
                        times 8 dw 3
                        times 8 dw 4
                        times 8 dw 5
                        times 8 dw 6
                        times 8 dw 7

  ; Bilinear weights as (8-k, k) byte pairs for k = 1..7, one row per k.
  bilinear_filter_vb_m: times 8 db 7, 1
                        times 8 db 6, 2
                        times 8 db 5, 3
                        times 8 db 4, 4
                        times 8 db 3, 5
                        times 8 db 2, 6
                        times 8 db 1, 7

  ; With PIC the tables are addressed through r11, which each function loads
  ; with lea before use; otherwise the names resolve straight to the symbols.
  %ifdef PIC
  %define fourtap_filter_hw  r11
  %define sixtap_filter_hw   r11
  %define fourtap_filter_hb  r11
  %define sixtap_filter_hb   r11
  %define fourtap_filter_v   r11
  %define sixtap_filter_v    r11
  %define bilinear_filter_vw r11
  %define bilinear_filter_vb r11
  %else
  %define fourtap_filter_hw  fourtap_filter_hw_m
  %define sixtap_filter_hw   sixtap_filter_hw_m
  %define fourtap_filter_hb  fourtap_filter_hb_m
  %define sixtap_filter_hb   sixtap_filter_hb_m
  %define fourtap_filter_v   fourtap_filter_v_m
  %define sixtap_filter_v    sixtap_filter_v_m
  %define bilinear_filter_vw bilinear_filter_vw_m
  %define bilinear_filter_vb bilinear_filter_vb_m
  %endif

  ; pshufb masks that pair up the source bytes fed to pmaddubsw in the
  ; horizontal SSSE3 functions.
  filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
  filter_h4_shuf:  db 0, 3, 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10
  filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
  filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
  filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

  ; IDCT multipliers, loaded into m6/m7 by vp8_idct_add_mmx.
  pw_20091: times 4 dw 20091
  pw_17734: times 4 dw 17734

  ; Word constants defined outside this file.
  cextern pw_3
  cextern pw_4
  cextern pw_64

  SECTION .text
  137. ;-----------------------------------------------------------------------------
  138. ; subpel MC functions:
  139. ;
  140. ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
  141. ; uint8_t *src, int srcstride,
  142. ; int height, int mx, int my);
  143. ;-----------------------------------------------------------------------------
  ; 4-pixel-wide block, H-only 4-tap filter
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ; Loop invariants: mm4 = F0/F1 word pairs, mm5 = F2/F3 word pairs,
  ;                  mm6 = 0, mm7 = pw_64 (rounding term).
  ; NOTE(review): the strides are declared int but are added as full-width
  ; registers; presumably callers pass them already extended - confirm.
  cglobal put_vp8_epel4_h4_mmxext, 6, 6
      shl       r5d, 4                          ; mx * 16 = byte offset into the table
  %ifdef PIC
      lea       r11, [fourtap_filter_hw_m]
  %endif
      movq      mm4, [fourtap_filter_hw+r5-16]  ; set up 4tap filter in words
      movq      mm5, [fourtap_filter_hw+r5]
      movq      mm7, [pw_64]
      pxor      mm6, mm6
  .nextrow:
      movq      mm1, [r2-1]                     ; (ABCDEFGH) load 8 horizontal pixels

      ; first set of 2 pixels
      movq      mm2, mm1                        ; byte ABCD..
      punpcklbw mm1, mm6                        ; byte->word ABCD
      pshufw    mm0, mm2, 9                     ; byte CDEF..
      punpcklbw mm0, mm6                        ; byte->word CDEF
      pshufw    mm3, mm1, 0x94                  ; word ABBC
      pshufw    mm1, mm0, 0x94                  ; word CDDE
      pmaddwd   mm3, mm4                        ; multiply 2px with F0/F1
      movq      mm0, mm1                        ; backup for second set of pixels
      pmaddwd   mm1, mm5                        ; multiply 2px with F2/F3
      paddd     mm3, mm1                        ; finish 1st 2px

      ; second set of 2 pixels, use backup of above
      punpckhbw mm2, mm6                        ; byte->word EFGH
      pmaddwd   mm0, mm4                        ; multiply backed up 2px with F0/F1
      pshufw    mm1, mm2, 0x94                  ; word EFFG
      pmaddwd   mm1, mm5                        ; multiply 2px with F2/F3
      paddd     mm0, mm1                        ; finish 2nd 2px

      ; merge two sets of 2 pixels into one set of 4, round/clip/store
      packssdw  mm3, mm0                        ; merge dword->word (4px)
      paddsw    mm3, mm7                        ; rounding
      psraw     mm3, 7
      packuswb  mm3, mm6                        ; clip and word->bytes
      movd      [r0], mm3                       ; store

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 4-pixel-wide block, H-only 6-tap filter
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ; Loop invariants: mm4 = F0/F1, mm5 = F2/F3, mm6 = F4/F5 word pairs,
  ;                  mm7 = pw_64. mm3 is zero at the top of every iteration.
  cglobal put_vp8_epel4_h6_mmxext, 6, 6
      lea       r5d, [r5*3]                     ; mx * 3; scaled by 8 below (24 bytes per mx step)
  %ifdef PIC
      lea       r11, [sixtap_filter_hw_m]
  %endif
      movq      mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
      movq      mm5, [sixtap_filter_hw+r5*8-32]
      movq      mm6, [sixtap_filter_hw+r5*8-16]
      movq      mm7, [pw_64]
      pxor      mm3, mm3
  .nextrow:
      movq      mm1, [r2-2]                     ; (ABCDEFGH) load 8 horizontal pixels

      ; first set of 2 pixels
      movq      mm2, mm1                        ; byte ABCD..
      punpcklbw mm1, mm3                        ; byte->word ABCD
      pshufw    mm0, mm2, 0x9                   ; byte CDEF..
      punpckhbw mm2, mm3                        ; byte->word EFGH
      punpcklbw mm0, mm3                        ; byte->word CDEF
      pshufw    mm1, mm1, 0x94                  ; word ABBC
      pshufw    mm2, mm2, 0x94                  ; word EFFG
      pmaddwd   mm1, mm4                        ; multiply 2px with F0/F1
      pshufw    mm3, mm0, 0x94                  ; word CDDE (mm3 stops being zero here)
      movq      mm0, mm3                        ; backup for second set of pixels
      pmaddwd   mm3, mm5                        ; multiply 2px with F2/F3
      paddd     mm1, mm3                        ; add to 1st 2px cache
      movq      mm3, mm2                        ; backup for second set of pixels
      pmaddwd   mm2, mm6                        ; multiply 2px with F4/F5
      paddd     mm1, mm2                        ; finish 1st 2px

      ; second set of 2 pixels, use backup of above
      movd      mm2, [r2+3]                     ; byte FGHI (prevent overreads)
      pmaddwd   mm0, mm4                        ; multiply 1st backed up 2px with F0/F1
      pmaddwd   mm3, mm5                        ; multiply 2nd backed up 2px with F2/F3
      paddd     mm0, mm3                        ; add to 2nd 2px cache
      pxor      mm3, mm3                        ; restore the zero register
      punpcklbw mm2, mm3                        ; byte->word FGHI
      pshufw    mm2, mm2, 0xE9                  ; word GHHI
      pmaddwd   mm2, mm6                        ; multiply 2px with F4/F5
      paddd     mm0, mm2                        ; finish 2nd 2px

      ; merge two sets of 2 pixels into one set of 4, round/clip/store
      packssdw  mm1, mm0                        ; merge dword->word (4px)
      paddsw    mm1, mm7                        ; rounding
      psraw     mm1, 7
      packuswb  mm1, mm3                        ; clip and word->bytes
      movd      [r0], mm1                       ; store

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 8-pixel-wide block, H-only 4-tap filter
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ; Loop invariants: m5 = F0/F1 word pairs, m6 = F2/F3 word pairs, m7 = 0.
  ; Each row is produced as two halves of four output pixels.
  INIT_XMM
  cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
      shl       r5d, 4                          ; mx * 16 = byte offset into the table
  %ifdef PIC
      lea       r11, [fourtap_filter_hw_m]
  %endif
      mova      m5, [fourtap_filter_hw+r5-16]   ; set up 4tap filter in words
      mova      m6, [fourtap_filter_hw+r5]
      pxor      m7, m7
  .nextrow:
      ; output pixels 0-3, from the 8 source bytes starting at src-1
      movh      m0, [r2-1]
      punpcklbw m0, m7                          ; ABCDEFGH
      mova      m1, m0
      mova      m2, m0
      mova      m3, m0
      psrldq    m1, 2                           ; BCDEFGH
      psrldq    m2, 4                           ; CDEFGH
      psrldq    m3, 6                           ; DEFGH
      punpcklwd m0, m1                          ; ABBCCDDE
      punpcklwd m2, m3                          ; CDDEEFFG
      pmaddwd   m0, m5
      pmaddwd   m2, m6
      paddd     m0, m2

      ; output pixels 4-7, same scheme on the 8 source bytes starting at src+3
      movh      m1, [r2+3]
      punpcklbw m1, m7                          ; ABCDEFGH
      mova      m2, m1
      mova      m3, m1
      mova      m4, m1
      psrldq    m2, 2                           ; BCDEFGH
      psrldq    m3, 4                           ; CDEFGH
      psrldq    m4, 6                           ; DEFGH
      punpcklwd m1, m2                          ; ABBCCDDE
      punpcklwd m3, m4                          ; CDDEEFFG
      pmaddwd   m1, m5
      pmaddwd   m3, m6
      paddd     m1, m3

      ; merge halves, round/clip/store
      packssdw  m0, m1
      paddsw    m0, [pw_64]
      psraw     m0, 7
      packuswb  m0, m7
      movh      [r0], m0                        ; store

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 8-pixel-wide block, H-only 6-tap filter
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ; After setup r5 points just past the selected filter, so its three rows of
  ; word pairs sit at [r5-48] (F0/F1), [r5-32] (F2/F3) and [r5-16] (F4/F5).
  ; m7 = 0 throughout. Each row is produced as two halves of four pixels.
  cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
      lea       r5d, [r5*3]                     ; mx * 3; scaled by 8 below (24 bytes per mx step)
  %ifdef PIC
      lea       r11, [sixtap_filter_hw_m]
  %endif
      lea       r5, [sixtap_filter_hw+r5*8]
      pxor      m7, m7
  .nextrow:
      movu      m0, [r2-2]
      mova      m6, m0                          ; copy kept for the second half
      mova      m4, m0

      ; output pixels 0-3
      punpcklbw m0, m7                          ; ABCDEFGHI
      mova      m1, m0
      mova      m2, m0
      mova      m3, m0
      psrldq    m1, 2                           ; BCDEFGH
      psrldq    m2, 4                           ; CDEFGH
      psrldq    m3, 6                           ; DEFGH
      psrldq    m4, 4
      punpcklbw m4, m7                          ; EFGH
      mova      m5, m4
      psrldq    m5, 2                           ; FGH
      punpcklwd m0, m1                          ; ABBCCDDE
      punpcklwd m2, m3                          ; CDDEEFFG
      punpcklwd m4, m5                          ; EFFGGHHI
      pmaddwd   m0, [r5-48]
      pmaddwd   m2, [r5-32]
      pmaddwd   m4, [r5-16]
      paddd     m0, m2
      paddd     m0, m4

      ; output pixels 4-7: drop four source bytes and repeat
      psrldq    m6, 4
      mova      m4, m6
      punpcklbw m6, m7                          ; ABCDEFGHI
      mova      m1, m6
      mova      m2, m6
      mova      m3, m6
      psrldq    m1, 2                           ; BCDEFGH
      psrldq    m2, 4                           ; CDEFGH
      psrldq    m3, 6                           ; DEFGH
      psrldq    m4, 4
      punpcklbw m4, m7                          ; EFGH
      mova      m5, m4
      psrldq    m5, 2                           ; FGH
      punpcklwd m6, m1                          ; ABBCCDDE
      punpcklwd m2, m3                          ; CDDEEFFG
      punpcklwd m4, m5                          ; EFFGGHHI
      pmaddwd   m6, [r5-48]
      pmaddwd   m2, [r5-32]
      pmaddwd   m4, [r5-16]
      paddd     m6, m2
      paddd     m6, m4

      ; merge halves, round/clip/store
      packssdw  m0, m6
      paddsw    m0, [pw_64]
      psraw     m0, 7
      packuswb  m0, m7
      movh      [r0], m0                        ; store

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 8-pixel-wide block, H-only 4-tap filter (pshufb/pmaddubsw version)
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ; Loop invariants:
  ;   m2 = pw_64 (rounding term)
  ;   m3 = shuffle pairing source bytes (i, i+3)   -> multiplied with m5
  ;   m4 = shuffle pairing source bytes (i+1, i+2) -> multiplied with m6
  ;   m5/m6 = the two byte-pair coefficient rows of the selected filter
  cglobal put_vp8_epel8_h4_ssse3, 6, 6, 7
      shl       r5d, 4                          ; mx * 16 = byte offset into the table
      mova      m2, [pw_64]
      mova      m3, [filter_h4_shuf]
      mova      m4, [filter_h6_shuf2]
  %ifdef PIC
      lea       r11, [fourtap_filter_hb_m]
  %endif
      mova      m5, [fourtap_filter_hb+r5-16]   ; set up 4tap filter in bytes
      mova      m6, [fourtap_filter_hb+r5]
  .nextrow:
      movu      m0, [r2-1]
      mova      m1, m0
      pshufb    m0, m3
      pshufb    m1, m4
      pmaddubsw m0, m5
      pmaddubsw m1, m6
      paddsw    m0, m2                          ; rounding
      paddsw    m0, m1
      psraw     m0, 7
      packuswb  m0, m0                          ; clip and word->bytes
      movh      [r0], m0                        ; store

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 8-pixel-wide block, H-only 6-tap filter (pshufb/pmaddubsw version)
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ; Loop invariants:
  ;   m3 = shuffle pairing source bytes (i, i+5)   -> multiplied with m5
  ;   m4 = shuffle pairing source bytes (i+1, i+2) -> multiplied with m6
  ;   filter_h6_shuf3 pairs bytes (i+3, i+4)       -> multiplied with m7
  ;   m5/m6/m7 = the three byte-pair coefficient rows of the selected filter
  cglobal put_vp8_epel8_h6_ssse3, 6, 6, 8
      lea       r5d, [r5*3]                     ; mx * 3; scaled by 8 below (24 bytes per mx step)
      mova      m3, [filter_h6_shuf1]
      mova      m4, [filter_h6_shuf2]
  %ifdef PIC
      lea       r11, [sixtap_filter_hb_m]
  %endif
      mova      m5, [sixtap_filter_hb+r5*8-48]  ; set up 6tap filter in bytes
      mova      m6, [sixtap_filter_hb+r5*8-32]
      mova      m7, [sixtap_filter_hb+r5*8-16]
  .nextrow:
      movu      m0, [r2-2]
      mova      m1, m0
      mova      m2, m0
      pshufb    m0, m3
      pshufb    m1, m4
      pshufb    m2, [filter_h6_shuf3]
      pmaddubsw m0, m5
      pmaddubsw m1, m6
      pmaddubsw m2, m7
      paddsw    m0, m1
      paddsw    m0, m2
      paddsw    m0, [pw_64]                     ; rounding
      psraw     m0, 7
      packuswb  m0, m0                          ; clip and word->bytes
      movh      [r0], m0                        ; store

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; FILTER_V opt, width, xmm_regs
  ;
  ; Emits put_vp8_epel<width>_v4_<opt> and put_vp8_epel<width>_v6_<opt>.
  ;   %1 = name suffix, %2 = block width used in the name, %3 = third cglobal
  ;   argument. The register width comes from the INIT_MMX/INIT_XMM in effect
  ;   where the macro is invoked.
  ; Arguments of the generated functions, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
  %macro FILTER_V 3

  ; %2-pixel-wide block, V-only 4-tap filter
  ; m0/m1/m2 carry the three previous source rows (as words) between
  ; iterations; m5 = last tap, m6 = pw_64, m7 = 0.
  cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
      shl       r6d, 5                          ; my * 32 (64 bytes per filter, my steps by 2)
  %ifdef PIC
      lea       r11, [fourtap_filter_v_m]
  %endif
      lea       r6, [fourtap_filter_v+r6-32]    ; r6 -> first tap row of the filter
      mova      m6, [pw_64]
      pxor      m7, m7
      mova      m5, [r6+48]

      ; read 3 lines
      sub       r2, r3
      movh      m0, [r2]
      movh      m1, [r2+  r3]
      movh      m2, [r2+2*r3]
      add       r2, r3
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7

  .nextrow:
      ; first calculate negative taps (to prevent losing positive overflows)
      movh      m4, [r2+2*r3]                   ; read new row
      punpcklbw m4, m7
      mova      m3, m4                          ; keep the unscaled new row
      pmullw    m0, [r6+0]
      pmullw    m4, m5
      paddsw    m4, m0

      ; then calculate positive taps
      mova      m0, m1                          ; shift the row window while consuming it
      pmullw    m1, [r6+16]
      paddsw    m4, m1
      mova      m1, m2
      pmullw    m2, [r6+32]
      paddsw    m4, m2
      mova      m2, m3

      ; round/clip/store
      paddsw    m4, m6
      psraw     m4, 7
      packuswb  m4, m7
      movh      [r0], m4

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET

  ; %2-pixel-wide block, V-only 6-tap filter
  ; m0..m4 carry the five previous source rows (as words) between iterations;
  ; m7 = 0. Tap rows are read from [r6+0] .. [r6+80].
  cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
      shl       r6d, 4
      lea       r6, [r6*3]                      ; my * 48 (96 bytes per filter, my steps by 2)
  %ifdef PIC
      lea       r11, [sixtap_filter_v_m]
  %endif
      lea       r6, [sixtap_filter_v+r6-96]     ; r6 -> first tap row of the filter
      pxor      m7, m7

      ; read 5 lines
      sub       r2, r3
      sub       r2, r3
      movh      m0, [r2]
      movh      m1, [r2+r3]
      movh      m2, [r2+r3*2]
      lea       r2, [r2+r3*2]
      add       r2, r3
      movh      m3, [r2]
      movh      m4, [r2+r3]
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7
      punpcklbw m3, m7
      punpcklbw m4, m7

  .nextrow:
      ; first calculate negative taps (to prevent losing positive overflows)
      mova      m5, m1
      pmullw    m5, [r6+16]
      mova      m6, m4
      pmullw    m6, [r6+64]
      paddsw    m6, m5

      ; then calculate positive taps
      movh      m5, [r2+2*r3]                   ; read new row
      punpcklbw m5, m7
      pmullw    m0, [r6+0]
      paddsw    m6, m0
      mova      m0, m1                          ; shift the row window while consuming it
      mova      m1, m2
      pmullw    m2, [r6+32]
      paddsw    m6, m2
      mova      m2, m3
      pmullw    m3, [r6+48]
      paddsw    m6, m3
      mova      m3, m4
      mova      m4, m5
      pmullw    m5, [r6+80]
      paddsw    m6, m5

      ; round/clip/store
      paddsw    m6, [pw_64]
      psraw     m6, 7
      packuswb  m6, m7
      movh      [r0], m6

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  %endmacro

  INIT_MMX
  FILTER_V mmxext, 4, 0
  INIT_XMM
  FILTER_V sse2,   8, 8
  ; 8-pixel-wide block, V-only 4-tap filter (pmaddubsw version)
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
  ; m0/m1/m2 carry the three previous source rows (as bytes) between
  ; iterations. m5 multiplies the interleaved oldest/newest rows, m6 the
  ; interleaved two middle rows; m7 = pw_64.
  cglobal put_vp8_epel8_v4_ssse3, 7, 7, 8
      shl       r6d, 4                          ; my * 16 = byte offset into the table
  %ifdef PIC
      lea       r11, [fourtap_filter_hb_m]
  %endif
      mova      m5, [fourtap_filter_hb+r6-16]
      mova      m6, [fourtap_filter_hb+r6]
      mova      m7, [pw_64]

      ; read 3 lines
      sub       r2, r3
      movh      m0, [r2]
      movh      m1, [r2+  r3]
      movh      m2, [r2+2*r3]
      add       r2, r3

  .nextrow:
      movh      m3, [r2+2*r3]                   ; read new row
      mova      m4, m0
      mova      m0, m1                          ; shift the row window
      punpcklbw m4, m3                          ; oldest row interleaved with new row
      punpcklbw m1, m2                          ; the two middle rows interleaved
      pmaddubsw m4, m5
      pmaddubsw m1, m6
      paddsw    m4, m1
      mova      m1, m2
      paddsw    m4, m7                          ; rounding
      mova      m2, m3
      psraw     m4, 7
      packuswb  m4, m4                          ; clip and word->bytes
      movh      [r0], m4

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 8-pixel-wide block, V-only 6-tap filter (pmaddubsw version)
  ;
  ; Arguments, per the prototype in the section header:
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
  ; m0..m4 carry the five previous source rows (as bytes) between iterations.
  ; After setup r6 points just past the selected filter; its three byte-pair
  ; rows are read from [r6-48], [r6-32] and [r6-16].
  cglobal put_vp8_epel8_v6_ssse3, 7, 7, 8
      lea       r6d, [r6*3]                     ; my * 3; scaled by 8 below (24 bytes per my step)
  %ifdef PIC
      lea       r11, [sixtap_filter_hb_m]
  %endif
      lea       r6, [sixtap_filter_hb+r6*8]

      ; read 5 lines
      sub       r2, r3
      sub       r2, r3
      movh      m0, [r2]
      movh      m1, [r2+r3]
      movh      m2, [r2+r3*2]
      lea       r2, [r2+r3*2]
      add       r2, r3
      movh      m3, [r2]
      movh      m4, [r2+r3]

  .nextrow:
      movh      m5, [r2+2*r3]                   ; read new row
      mova      m6, m0
      punpcklbw m6, m5                          ; oldest row interleaved with new row
      mova      m0, m1                          ; shift the row window
      punpcklbw m1, m2                          ; second and third rows interleaved
      mova      m7, m3
      punpcklbw m7, m4                          ; fourth and fifth rows interleaved
      pmaddubsw m6, [r6-48]
      pmaddubsw m1, [r6-32]
      pmaddubsw m7, [r6-16]
      paddsw    m6, m1
      paddsw    m6, m7
      mova      m1, m2
      paddsw    m6, [pw_64]                     ; rounding
      mova      m2, m3
      psraw     m6, 7
      mova      m3, m4
      packuswb  m6, m6                          ; clip and word->bytes
      mova      m4, m5
      movh      [r0], m6

      ; go to next line
      add       r0, r1
      add       r2, r3
      dec       r4d                             ; next row; height is an int, so only
                                                ; the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; FILTER_BILINEAR opt, width, xmm_regs
  ;
  ; Emits put_vp8_bilinear<width>_v_<opt> and put_vp8_bilinear<width>_h_<opt>.
  ;   %1 = name suffix (also selects the store sequence), %2 = block width
  ;   used in the name, %3 = third cglobal argument.
  ; Arguments of the generated functions (r0 = dst, r1 = deststride, r2 = src,
  ; r3 = srcstride, r4 = height, r5 = mx, r6 = my) - presumably the same
  ; prototype as the epel functions; confirm against the C declarations.
  ; Both functions write two rows per iteration. m4 holds the weight (8 - frac)
  ; and m5 the weight frac, both as words; m6 = 0.
  ; The weighted sum is shifted right by 2 and then averaged with zero by
  ; pavgw, i.e. ((sum >> 2) + 1) >> 1.
  %macro FILTER_BILINEAR 3
  cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
      mov       r5d, 8*16
      shl       r6d, 4
      sub       r5d, r6d                        ; r5 = 16 * (8 - my), r6 = 16 * my
  %ifdef PIC
      lea       r11, [bilinear_filter_vw_m]
  %endif
      pxor      m6, m6
      mova      m4, [bilinear_filter_vw+r5-16]
      mova      m5, [bilinear_filter_vw+r6-16]
  .nextrow:
      movh      m0, [r2+r3*0]
      movh      m1, [r2+r3*1]
      movh      m3, [r2+r3*2]
      punpcklbw m0, m6
      punpcklbw m1, m6
      punpcklbw m3, m6
      mova      m2, m1                          ; middle row is shared by both outputs
      pmullw    m0, m4
      pmullw    m1, m5
      pmullw    m2, m4
      pmullw    m3, m5
      paddsw    m0, m1
      paddsw    m2, m3
      psraw     m0, 2
      psraw     m2, 2
      pavgw     m0, m6
      pavgw     m2, m6
  %ifidn %1, mmxext
      packuswb  m0, m0
      packuswb  m2, m2
      movh      [r0+r1*0], m0
      movh      [r0+r1*1], m2
  %else
      packuswb  m0, m2
      movh      [r0+r1*0], m0
      movhps    [r0+r1*1], m0
  %endif

      lea       r0, [r0+r1*2]
      lea       r2, [r2+r3*2]
      sub       r4d, 2                          ; two rows done; height is an int, so
                                                ; only the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET

  cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
      mov       r6d, 8*16
      shl       r5d, 4
      sub       r6d, r5d                        ; r6 = 16 * (8 - mx), r5 = 16 * mx
  %ifdef PIC
      lea       r11, [bilinear_filter_vw_m]
  %endif
      pxor      m6, m6
      mova      m4, [bilinear_filter_vw+r6-16]
      mova      m5, [bilinear_filter_vw+r5-16]
  .nextrow:
      movh      m0, [r2+r3*0+0]
      movh      m1, [r2+r3*0+1]
      movh      m2, [r2+r3*1+0]
      movh      m3, [r2+r3*1+1]
      punpcklbw m0, m6
      punpcklbw m1, m6
      punpcklbw m2, m6
      punpcklbw m3, m6
      pmullw    m0, m4
      pmullw    m1, m5
      pmullw    m2, m4
      pmullw    m3, m5
      paddsw    m0, m1
      paddsw    m2, m3
      psraw     m0, 2
      psraw     m2, 2
      pavgw     m0, m6
      pavgw     m2, m6
  %ifidn %1, mmxext
      packuswb  m0, m0
      packuswb  m2, m2
      movh      [r0+r1*0], m0
      movh      [r0+r1*1], m2
  %else
      packuswb  m0, m2
      movh      [r0+r1*0], m0
      movhps    [r0+r1*1], m0
  %endif

      lea       r0, [r0+r1*2]
      lea       r2, [r2+r3*2]
      sub       r4d, 2                          ; two rows done; height is an int, so
                                                ; only the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  %endmacro

  INIT_MMX
  FILTER_BILINEAR mmxext, 4, 0
  INIT_XMM
  FILTER_BILINEAR   sse2, 8, 7
  ; 8-pixel-wide vertical bilinear filter (pmaddubsw version), two rows per
  ; iteration.
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r6 = my
  ;   (presumably the same prototype as the epel functions - confirm).
  ; m3 = (8 - my, my) byte pairs, m4 = 0.
  cglobal put_vp8_bilinear8_v_ssse3, 7,7,5
      shl       r6d, 4                          ; my * 16 = byte offset into the table
  %ifdef PIC
      lea       r11, [bilinear_filter_vb_m]
  %endif
      pxor      m4, m4
      mova      m3, [bilinear_filter_vb+r6-16]
  .nextrow:
      movh      m0, [r2+r3*0]
      movh      m1, [r2+r3*1]
      movh      m2, [r2+r3*2]
      punpcklbw m0, m1                          ; rows 0/1 interleaved -> first output row
      punpcklbw m1, m2                          ; rows 1/2 interleaved -> second output row
      pmaddubsw m0, m3
      pmaddubsw m1, m3
      psraw     m0, 2
      psraw     m1, 2
      pavgw     m0, m4                          ; average with zero: (x + 1) >> 1
      pavgw     m1, m4
      packuswb  m0, m1
      movh      [r0+r1*0], m0
      movhps    [r0+r1*1], m0

      lea       r0, [r0+r1*2]
      lea       r2, [r2+r3*2]
      sub       r4d, 2                          ; two rows done; height is an int, so
                                                ; only the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; 8-pixel-wide horizontal bilinear filter (pshufb/pmaddubsw version), two
  ; rows per iteration.
  ;   r0 = dst, r1 = deststride, r2 = src, r3 = srcstride, r4 = height, r5 = mx
  ;   (presumably the same prototype as the epel functions - confirm).
  ; m2 = shuffle pairing source bytes (i, i+1), m3 = (8 - mx, mx) byte pairs,
  ; m4 = 0.
  cglobal put_vp8_bilinear8_h_ssse3, 7,7,5
      shl       r5d, 4                          ; mx * 16 = byte offset into the table
  %ifdef PIC
      lea       r11, [bilinear_filter_vb_m]
  %endif
      pxor      m4, m4
      mova      m2, [filter_h2_shuf]
      mova      m3, [bilinear_filter_vb+r5-16]
  .nextrow:
      movu      m0, [r2+r3*0]
      movu      m1, [r2+r3*1]
      pshufb    m0, m2
      pshufb    m1, m2
      pmaddubsw m0, m3
      pmaddubsw m1, m3
      psraw     m0, 2
      psraw     m1, 2
      pavgw     m0, m4                          ; average with zero: (x + 1) >> 1
      pavgw     m1, m4
      packuswb  m0, m1
      movh      [r0+r1*0], m0
      movhps    [r0+r1*1], m0

      lea       r0, [r0+r1*2]
      lea       r2, [r2+r3*2]
      sub       r4d, 2                          ; two rows done; height is an int, so
                                                ; only the low 32 bits of r4 are meaningful
      jg .nextrow
      REP_RET
  ; Plain copy of an 8-byte-wide block, two rows per iteration.
  ;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
  cglobal put_vp8_pixels8_mmx, 5,5
  .nextrow:
      ; fetch both source rows before writing anything
      movq      mm0, [r2+r3*0]
      movq      mm1, [r2+r3*1]
      movq      [r0+r1*0], mm0
      movq      [r0+r1*1], mm1
      ; step both pointers down two rows (lea leaves the flags alone)
      lea       r2, [r2+r3*2]
      lea       r0, [r0+r1*2]
      sub       r4d, 2
      jg .nextrow
      REP_RET
  ; Plain copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
  ; iteration.
  ;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
  cglobal put_vp8_pixels16_mmx, 5,5
  .nextrow:
      ; fetch both source rows (two halves each) before writing anything
      movq      mm0, [r2+r3*0+0]
      movq      mm1, [r2+r3*0+8]
      movq      mm2, [r2+r3*1+0]
      movq      mm3, [r2+r3*1+8]
      movq      [r0+r1*0+0], mm0
      movq      [r0+r1*0+8], mm1
      movq      [r0+r1*1+0], mm2
      movq      [r0+r1*1+8], mm3
      ; step both pointers down two rows (lea leaves the flags alone)
      lea       r2, [r2+r3*2]
      lea       r0, [r0+r1*2]
      sub       r4d, 2
      jg .nextrow
      REP_RET
  ; Plain copy of a 16-byte-wide block, two rows per iteration.
  ;   r0 = dst, r1 = dst stride, r2 = src, r3 = src stride, r4d = row count
  ; Source rows are read with unaligned loads; destination rows are written
  ; with movaps, so every dst row must be 16-byte aligned.
  cglobal put_vp8_pixels16_sse, 5,5,2
  .nextrow:
      ; fetch both source rows before writing anything
      movups    xmm0, [r2+r3*0]
      movups    xmm1, [r2+r3*1]
      movaps    [r0+r1*0], xmm0
      movaps    [r0+r1*1], xmm1
      ; step both pointers down two rows (lea leaves the flags alone)
      lea       r2, [r2+r3*2]
      lea       r0, [r0+r1*2]
      sub       r4d, 2
      jg .nextrow
      REP_RET
  778. ;-----------------------------------------------------------------------------
  779. ; IDCT functions:
  780. ;
  781. ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
  782. ;-----------------------------------------------------------------------------
  ; Adds the rounded DC value (block[0] + 4) >> 3 to four rows of four pixels.
  ;   r0 = dst, r1 = block, r2 = stride (r1 is reused as a row pointer)
  ; The signed DC is split into two unsigned byte vectors so that saturating
  ; byte arithmetic can apply it: mm0 = DC clamped to 0..255, mm1 = -DC
  ; clamped to 0..255. At most one of them is non-zero.
  cglobal vp8_idct_dc_add_mmx, 3, 3
      ; load data and calculate DC
      movd      mm0, [r1]
      paddw     mm0, [pw_4]
      psraw     mm0, 3
      pxor      mm1, mm1
      psubw     mm1, mm0                        ; mm1 = -DC

      ; clamp each to an unsigned byte and replicate it over the low 4 bytes
      packuswb  mm0, mm0
      punpcklbw mm0, mm0
      punpcklwd mm0, mm0
      packuswb  mm1, mm1
      punpcklbw mm1, mm1
      punpcklwd mm1, mm1

      ; load all four rows before any of them is written back
      lea       r1, [r0+r2*2]
      movd      mm2, [r0]
      movd      mm3, [r0+r2]
      movd      mm4, [r1]
      movd      mm5, [r1+r2]

      ; add DC: saturating add of the positive part, then saturating
      ; subtract of the negative part
      paddusb   mm2, mm0
      psubusb   mm2, mm1
      paddusb   mm3, mm0
      psubusb   mm3, mm1
      paddusb   mm4, mm0
      psubusb   mm4, mm1
      paddusb   mm5, mm0
      psubusb   mm5, mm1

      movd      [r0], mm2
      movd      [r0+r2], mm3
      movd      [r1], mm4
      movd      [r1+r2], mm5
      RET
  ; Adds the rounded DC value (block[0] + 4) >> 3 to four rows of four pixels.
  ;   r0 = dst, r1 = block, r2 = stride (r1 is reused as a row pointer)
  ; All 16 pixels are widened to words, the DC is added with paddw, and the
  ; result is clamped back to bytes by packuswb.
  cglobal vp8_idct_dc_add_sse4, 3, 3, 6
      ; load data and calculate DC, then broadcast it to all 8 words
      movd       xmm0, [r1]
      movq       xmm2, [pw_4]
      paddw      xmm0, xmm2
      psraw      xmm0, 3
      pshuflw    xmm0, xmm0, 0
      punpcklqdq xmm0, xmm0

      ; load the four destination rows
      lea        r1, [r0+r2*2]
      pxor       xmm1, xmm1
      movd       xmm2, [r0]
      movd       xmm3, [r0+r2]
      movd       xmm4, [r1]
      movd       xmm5, [r1+r2]

      ; rows 0/1 -> xmm2, rows 2/3 -> xmm4, both widened to words
      punpckldq  xmm2, xmm3
      punpcklbw  xmm2, xmm1
      punpckldq  xmm4, xmm5
      punpcklbw  xmm4, xmm1

      ; add DC, clamp to bytes (one row per dword of xmm2)
      paddw      xmm2, xmm0
      paddw      xmm4, xmm0
      packuswb   xmm2, xmm4

      movd       [r0], xmm2
      pextrd     [r0+r2], xmm2, 1
      pextrd     [r1], xmm2, 2
      pextrd     [r1+r2], xmm2, 3
      RET
  843. ;-----------------------------------------------------------------------------
  844. ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
  845. ;-----------------------------------------------------------------------------
  ; VP8_MULTIPLY_SUMSUB a, b, tmp1, tmp2
  ;
  ; On exit: a = mul_35468(a) - mul_20091(b)
  ;          b = mul_20091(a) + mul_35468(b)      (a, b = values on entry)
  ; Expects m6 = words of 20091 and m7 = words of 17734. tmp1/tmp2 are
  ; clobbered.
  ;   mul_20091(x) is formed as x + pmulhw(x, 20091)
  ;   mul_35468(x) is formed as pmulhw(2*x, 17734)
  %macro VP8_MULTIPLY_SUMSUB 4
      ; tmp1 = mul_20091(a)
      mova      %3, %1
      pmulhw    %3, m6                          ;20091(1)
      paddw     %3, %1
      ; tmp2 = mul_20091(b)
      mova      %4, %2
      pmulhw    %4, m6                          ;20091(2)
      paddw     %4, %2
      ; a = mul_35468(a)
      paddw     %1, %1
      pmulhw    %1, m7                          ;35468(1)
      ; b = mul_35468(b)
      paddw     %2, %2
      pmulhw    %2, m7                          ;35468(2)
      ; combine
      psubw     %1, %4
      paddw     %2, %3
  %endmacro
  ; VP8_IDCT_TRANSFORM4x4_1D r0, r1, r2, r3, tmp1, tmp2   (register numbers)
  ;
  ; One 1-D pass of the 4x4 inverse transform over four row registers:
  ;   x0 = %1 + %3                              x1 = %1 - %3
  ;   x2 = mul_35468(%2) - mul_20091(%4)        x3 = mul_20091(%2) + mul_35468(%4)
  ;   %1 = x0 + x3 (tmp0)    %2 = x1 + x2 (tmp1)
  ;   %3 = x1 - x2 (tmp2)    %4 = x0 - x3 (tmp3)
  ; %5/%6 name temporary registers. m6/m7 must hold the constant words
  ; 20091/17734. The results are put in place by renaming registers with
  ; SWAP instead of moving data.
  %macro VP8_IDCT_TRANSFORM4x4_1D 6
      SUMSUB_BA           m%3, m%1, m%5         ;t0, t1
      VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6     ;t2, t3
      SUMSUB_BA           m%4, m%3, m%5         ;tmp0, tmp3
      SUMSUB_BA           m%2, m%1, m%5         ;tmp1, tmp2
      SWAP                 %4,  %1
      SWAP                 %4,  %3
  %endmacro
  ; Full 4x4 inverse transform of block, result added to dst.
  ;   r0 = dst, r1 = block (reused as a row pointer for the store), r2 = stride
  ; Two 1-D passes, each followed by a transpose; pw_4 is added to the first
  ; row between the passes as the rounding term for the final >> 3 in
  ; STORE_DIFFx2.
  INIT_MMX
  cglobal vp8_idct_add_mmx, 3, 3
      ; load block data (four rows of four words)
      movq      m0, [r1]
      movq      m1, [r1+8]
      movq      m2, [r1+16]
      movq      m3, [r1+24]

      ; constants expected by VP8_IDCT_TRANSFORM4x4_1D
      movq      m6, [pw_20091]
      movq      m7, [pw_17734]

      ; actual IDCT
      VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
      TRANSPOSE4x4W            0, 1, 2, 3, 4
      paddw     m0, [pw_4]
      VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
      TRANSPOSE4x4W            0, 1, 2, 3, 4

      ; store: rows 0/1 through r0, rows 2/3 through r1; m4 is the zero
      ; register, m6/m7 are handed over as temporaries
      pxor      m4, m4
      lea       r1, [r0+2*r2]
      STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
      STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
      RET
  896. ;-----------------------------------------------------------------------------
  897. ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
  898. ;-----------------------------------------------------------------------------
  ; SCATTER_WHT idx
  ;
  ; Writes word <idx> of m0, m1, m2 and m3 to r0 + 0, 32, 64 and 96 bytes
  ; (2*16 bytes apart) respectively. Clobbers r1 and r2.
  %macro SCATTER_WHT 1
      ; words from m0/m1
      pextrw    r1d, m0, %1
      pextrw    r2d, m1, %1
      mov       [r0+2*16*0], r1w
      mov       [r0+2*16*1], r2w
      ; words from m2/m3
      pextrw    r1d, m2, %1
      pextrw    r2d, m3, %1
      mov       [r0+2*16*2], r1w
      mov       [r0+2*16*3], r2w
  %endmacro
  ; HADAMARD4_1D r0, r1, r2, r3   (register numbers)
  ;
  ; One 1-D pass over four registers: two rounds of paired sum/difference
  ; (SUMSUB_BADC), then a register rename so the results end up under the
  ; numbers the caller passed in.
  %macro HADAMARD4_1D 4
      SUMSUB_BADC m%2, m%1, m%4, m%3
      SUMSUB_BADC m%4, m%2, m%3, m%1
      SWAP         %1,  %4,  %3
  %endmacro
  ; Inverse 4x4 Walsh-Hadamard transform of the 16 luma DC values; each result
  ; is written into the first coefficient of one 16-word block.
  ;   r0 = block (output), r1 = dc (input). r1 and r2 are used as scratch by
  ;   SCATTER_WHT once the input has been loaded.
  INIT_MMX
  cglobal vp8_luma_dc_wht_mmxext, 2,3
      ; load the 4x4 DC matrix, one row of four words per register
      movq      m0, [r1]
      movq      m1, [r1+8]
      movq      m2, [r1+16]
      movq      m3, [r1+24]

      ; first pass, transpose, add the rounding term, second pass
      HADAMARD4_1D  0, 1, 2, 3
      TRANSPOSE4x4W 0, 1, 2, 3, 4
      paddw     m0, [pw_3]
      HADAMARD4_1D  0, 1, 2, 3

      ; scale down
      psraw     m0, 3
      psraw     m1, 3
      psraw     m2, 3
      psraw     m3, 3

      ; scatter: one word index per call, advancing r0 by four blocks
      ; (2*16*4 bytes) between calls
      SCATTER_WHT 0
      add       r0, 2*16*4
      SCATTER_WHT 1
      add       r0, 2*16*4
      SCATTER_WHT 2
      add       r0, 2*16*4
      SCATTER_WHT 3
      RET