;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

fourtap_filter_hw_m: times 4 dw -6, 123
                     times 4 dw 12, -1
                     times 4 dw -9, 93
                     times 4 dw 50, -6
                     times 4 dw -6, 50
                     times 4 dw 93, -9
                     times 4 dw -1, 12
                     times 4 dw 123, -6

sixtap_filter_hw_m:  times 4 dw 2, -11
                     times 4 dw 108, 36
                     times 4 dw -8, 1
                     times 4 dw 3, -16
                     times 4 dw 77, 77
                     times 4 dw -16, 3
                     times 4 dw 1, -8
                     times 4 dw 36, 108
                     times 4 dw -11, 2

fourtap_filter_hb_m: times 8 db -6, 123
                     times 8 db 12, -1
                     times 8 db -9, 93
                     times 8 db 50, -6
                     times 8 db -6, 50
                     times 8 db 93, -9
                     times 8 db -1, 12
                     times 8 db 123, -6

sixtap_filter_hb_m:  times 8 db 2, 1
                     times 8 db -11, 108
                     times 8 db 36, -8
                     times 8 db 3, 3
                     times 8 db -16, 77
                     times 8 db 77, -16
                     times 8 db 1, 2
                     times 8 db -8, 36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw -6
                     times 8 dw 123
                     times 8 dw 12
                     times 8 dw -1
                     times 8 dw -9
                     times 8 dw 93
                     times 8 dw 50
                     times 8 dw -6
                     times 8 dw -6
                     times 8 dw 50
                     times 8 dw 93
                     times 8 dw -9
                     times 8 dw -1
                     times 8 dw 12
                     times 8 dw 123
                     times 8 dw -6

sixtap_filter_v_m:   times 8 dw 2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw 36
                     times 8 dw -8
                     times 8 dw 1
                     times 8 dw 3
                     times 8 dw -16
                     times 8 dw 77
                     times 8 dw 77
                     times 8 dw -16
                     times 8 dw 3
                     times 8 dw 1
                     times 8 dw -8
                     times 8 dw 36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw 2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
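
; Table layout, roughly: the *_hw tables hold interleaved word coefficient
; pairs for pmaddwd, the *_hb tables hold interleaved byte pairs for the
; SSSE3 pmaddubsw path, and the *_v tables hold one row of words per tap for
; pmullw. The bilinear byte pairs are (8-a, a) for eighth-pel position a, so
; every pair of weights sums to 8.
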
%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

cextern pw_3
cextern pw_4
cextern pw_64

SECTION .text

;-------------------------------------------------------------------------------
; subpel MC functions:
;
; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                                 uint8_t *src, int srcstride,
;                                                 int height, int mx, int my);
;-------------------------------------------------------------------------------
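; mx/my are the fractional (eighth-pel) positions passed down by the C-side
; dispatch: odd positions index the 4-tap tables (four filter entries, hence
; the shl by 4 or 5 scaling below), and even nonzero positions index the
; 6-tap tables (three entries, hence the lea [...*3] scaling).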
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea mxd, [mxq*3]
    mova m3, [filter_h6_shuf2]
    mova m4, [filter_h6_shuf3]
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif
    mova m5, [sixtap_filter_hb+mxq*8-48]  ; set up 6tap filter in bytes
    mova m6, [sixtap_filter_hb+mxq*8-32]
    mova m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu m0, [srcq-2]
    mova m1, m0
    mova m2, m0
%if mmsize == 8
    ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
    ; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb m0, [filter_h6_shuf1]
%endif
    pshufb m1, m3
    pshufb m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw m0, m1
    paddsw m0, m2
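    ; the taps sum to 128, so pmulhrsw with 256 is (x*256 + 0x4000) >> 15,
    ; i.e. the usual (x + 64) >> 7 rounding and normalization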
    pmulhrsw m0, [pw_256]
    packuswb m0, m0
    movh [dstq], m0              ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                  ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
    mova m2, [pw_256]
    mova m3, [filter_h2_shuf]
    mova m4, [filter_h4_shuf]
%ifdef PIC
    lea picregq, [fourtap_filter_hb_m]
%endif
    mova m5, [fourtap_filter_hb+mxq-16]  ; set up 4tap filter in bytes
    mova m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu m0, [srcq-1]
    mova m1, m0
    pshufb m0, m3
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw m0, m1
    pmulhrsw m0, m2
    packuswb m0, m0
    movh [dstq], m0              ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                  ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
%ifdef PIC
    lea picregq, [fourtap_filter_hb_m]
%endif
    mova m5, [fourtap_filter_hb+myq-16]
    mova m6, [fourtap_filter_hb+myq]
    mova m7, [pw_256]

    ; read 3 lines
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+2*srcstrideq]
    add srcq, srcstrideq

.nextrow:
    movh m3, [srcq+2*srcstrideq] ; read new row
    mova m4, m0
    mova m0, m1
    punpcklbw m4, m1
    mova m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw m4, m2
    mova m2, m3
    pmulhrsw m4, m7
    packuswb m4, m4
    movh [dstq], m4

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                  ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea myd, [myq*3]
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif
    lea myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub srcq, srcstrideq
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+srcstrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    movh m3, [srcq]
    movh m4, [srcq+srcstrideq]

.nextrow:
    movh m5, [srcq+2*srcstrideq] ; read new row
    mova m6, m0
    punpcklbw m6, m5
    mova m0, m1
    punpcklbw m1, m2
    mova m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw m6, m1
    paddsw m6, m7
    mova m1, m2
    mova m2, m3
    pmulhrsw m6, [pw_256]
    mova m3, m4
    packuswb m6, m6
    mova m4, m5
    movh [dstq], m6

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd                  ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
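; Without SSSE3 byte multiplies, the MMX code below widens bytes to words and
; runs pmaddwd on two pixels at a time; a pshufw immediate such as 0x94
; (binary 10 01 01 00) selects the word pattern 0,1,1,2, i.e. "ABBC" in the
; comments.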
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
%ifdef PIC
    lea picregq, [fourtap_filter_hw_m]
%endif
    movq mm4, [fourtap_filter_hw+mxq-16]  ; set up 4tap filter in words
    movq mm5, [fourtap_filter_hw+mxq]
    movq mm7, [pw_64]
    pxor mm6, mm6

.nextrow:
    movq mm1, [srcq-1]        ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq mm2, mm1             ; byte ABCD..
    punpcklbw mm1, mm6        ; byte->word ABCD
    pshufw mm0, mm2, 9        ; byte CDEF..
    punpcklbw mm0, mm6        ; byte->word CDEF
    pshufw mm3, mm1, 0x94     ; word ABBC
    pshufw mm1, mm0, 0x94     ; word CDDE
    pmaddwd mm3, mm4          ; multiply 2px with F0/F1
    movq mm0, mm1             ; backup for second set of pixels
    pmaddwd mm1, mm5          ; multiply 2px with F2/F3
    paddd mm3, mm1            ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6        ; byte->word EFGH
    pmaddwd mm0, mm4          ; multiply backed up 2px with F0/F1
    pshufw mm1, mm2, 0x94     ; word EFFG
    pmaddwd mm1, mm5          ; multiply 2px with F2/F3
    paddd mm0, mm1            ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw mm3, mm0         ; merge dword->word (4px)
    paddsw mm3, mm7           ; rounding
    psraw mm3, 7
    packuswb mm3, mm6         ; clip and word->bytes
    movd [dstq], mm3          ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd               ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea mxd, [mxq*3]
%ifdef PIC
    lea picregq, [sixtap_filter_hw_m]
%endif
    movq mm4, [sixtap_filter_hw+mxq*8-48]  ; set up 6tap filter in words
    movq mm5, [sixtap_filter_hw+mxq*8-32]
    movq mm6, [sixtap_filter_hw+mxq*8-16]
    movq mm7, [pw_64]
    pxor mm3, mm3

.nextrow:
    movq mm1, [srcq-2]        ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq mm2, mm1             ; byte ABCD..
    punpcklbw mm1, mm3        ; byte->word ABCD
    pshufw mm0, mm2, 0x9      ; byte CDEF..
    punpckhbw mm2, mm3        ; byte->word EFGH
    punpcklbw mm0, mm3        ; byte->word CDEF
    pshufw mm1, mm1, 0x94     ; word ABBC
    pshufw mm2, mm2, 0x94     ; word EFFG
    pmaddwd mm1, mm4          ; multiply 2px with F0/F1
    pshufw mm3, mm0, 0x94     ; word CDDE
    movq mm0, mm3             ; backup for second set of pixels
    pmaddwd mm3, mm5          ; multiply 2px with F2/F3
    paddd mm1, mm3            ; add to 1st 2px cache
    movq mm3, mm2             ; backup for second set of pixels
    pmaddwd mm2, mm6          ; multiply 2px with F4/F5
    paddd mm1, mm2            ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd mm2, [srcq+3]        ; byte FGHI (prevent overreads)
    pmaddwd mm0, mm4          ; multiply 1st backed up 2px with F0/F1
    pmaddwd mm3, mm5          ; multiply 2nd backed up 2px with F2/F3
    paddd mm0, mm3            ; add to 2nd 2px cache
    pxor mm3, mm3
    punpcklbw mm2, mm3        ; byte->word FGHI
    pshufw mm2, mm2, 0xE9     ; word GHHI
    pmaddwd mm2, mm6          ; multiply 2px with F4/F5
    paddd mm0, mm2            ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw mm1, mm0         ; merge dword->word (4px)
    paddsw mm1, mm7           ; rounding
    psraw mm1, 7
    packuswb mm1, mm3         ; clip and word->bytes
    movd [dstq], mm1          ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd               ; next row
    jg .nextrow
    REP_RET
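
; The SSE2 8-wide horizontal filters keep all tap coefficients in registers on
; x86-64, where xmm8 and up exist; the %ifdef m8 fallback below reloads them
; from memory on x86-32.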
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 5
%ifdef PIC
    lea picregq, [fourtap_filter_v_m]
%endif
    lea mxq, [fourtap_filter_v+mxq-32]
    pxor m7, m7
    mova m4, [pw_64]
    mova m5, [mxq+ 0]
    mova m6, [mxq+16]
%ifdef m8
    mova m8, [mxq+32]
    mova m9, [mxq+48]
%endif

.nextrow:
    movq m0, [srcq-1]
    movq m1, [srcq-0]
    movq m2, [srcq+1]
    movq m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw m0, m5
    pmullw m1, m6
%ifdef m8
    pmullw m2, m8
    pmullw m3, m9
%else
    pmullw m2, [mxq+32]
    pmullw m3, [mxq+48]
%endif
    paddsw m0, m1
    paddsw m2, m3
    paddsw m0, m2
    paddsw m0, m4
    psraw m0, 7
    packuswb m0, m7
    movh [dstq], m0    ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd        ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea mxd, [mxq*3]
    shl mxd, 4
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    lea mxq, [sixtap_filter_v+mxq-96]
    pxor m7, m7
    mova m6, [pw_64]
%ifdef m8
    mova m8,  [mxq+ 0]
    mova m9,  [mxq+16]
    mova m10, [mxq+32]
    mova m11, [mxq+48]
    mova m12, [mxq+64]
    mova m13, [mxq+80]
%endif

.nextrow:
    movq m0, [srcq-2]
    movq m1, [srcq-1]
    movq m2, [srcq-0]
    movq m3, [srcq+1]
    movq m4, [srcq+2]
    movq m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw m0, m8
    pmullw m1, m9
    pmullw m2, m10
    pmullw m3, m11
    pmullw m4, m12
    pmullw m5, m13
%else
    pmullw m0, [mxq+ 0]
    pmullw m1, [mxq+16]
    pmullw m2, [mxq+32]
    pmullw m3, [mxq+48]
    pmullw m4, [mxq+64]
    pmullw m5, [mxq+80]
%endif
    paddsw m1, m4
    paddsw m0, m5
    paddsw m1, m2
    paddsw m0, m3
    paddsw m0, m1
    paddsw m0, m6
    psraw m0, 7
    packuswb m0, m7
    movh [dstq], m0    ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd        ; next row
    jg .nextrow
    REP_RET
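
; The vertical filters below keep a sliding window of source rows, widened to
; words, in registers and rotate it down by one row per iteration, so each
; output row needs only a single new load.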
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 5
%ifdef PIC
    lea picregq, [fourtap_filter_v_m]
%endif
    lea myq, [fourtap_filter_v+myq-32]
    mova m6, [pw_64]
    pxor m7, m7
    mova m5, [myq+48]

    ; read 3 lines
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+2*srcstrideq]
    add srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh m4, [srcq+2*srcstrideq] ; read new row
    punpcklbw m4, m7
    mova m3, m4
    pmullw m0, [myq+0]
    pmullw m4, m5
    paddsw m4, m0

    ; then calculate positive taps
    mova m0, m1
    pmullw m1, [myq+16]
    paddsw m4, m1
    mova m1, m2
    pmullw m2, [myq+32]
    paddsw m4, m2
    mova m2, m3

    ; round/clip/store
    paddsw m4, m6
    psraw m4, 7
    packuswb m4, m7
    movh [dstq], m4

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd        ; next row
    jg .nextrow
    REP_RET

; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
    lea myq, [myq*3]
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    lea myq, [sixtap_filter_v+myq-96]
    pxor m7, m7

    ; read 5 lines
    sub srcq, srcstrideq
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+srcstrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    movh m3, [srcq]
    movh m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova m5, m1
    pmullw m5, [myq+16]
    mova m6, m4
    pmullw m6, [myq+64]
    paddsw m6, m5

    ; then calculate positive taps
    movh m5, [srcq+2*srcstrideq] ; read new row
    punpcklbw m5, m7
    pmullw m0, [myq+0]
    paddsw m6, m0
    mova m0, m1
    mova m1, m2
    pmullw m2, [myq+32]
    paddsw m6, m2
    mova m2, m3
    pmullw m3, [myq+48]
    paddsw m6, m3
    mova m3, m4
    mova m4, m5
    pmullw m5, [myq+80]
    paddsw m6, m5

    ; round/clip/store
    paddsw m6, [pw_64]
    psraw m6, 7
    packuswb m6, m7
    movh [dstq], m6

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd        ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
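
; Bilinear MC: the weights are a and 8-a for eighth-pel position a. After the
; pmullw/paddsw sum, psraw by 2 followed by pavgw against zero works out to
; the (x + 4) >> 3 normalization for the non-negative sums produced here.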
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vw_m]
%endif
    pxor m6, m6
    mova m5, [bilinear_filter_vw+myq-1*16]
    neg myq
    mova m4, [bilinear_filter_vw+myq+7*16]

.nextrow:
    movh m0, [srcq+srcstrideq*0]
    movh m1, [srcq+srcstrideq*1]
    movh m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova m2, m1
    pmullw m0, m4
    pmullw m1, m5
    pmullw m2, m4
    pmullw m3, m5
    paddsw m0, m1
    paddsw m2, m3
    psraw m0, 2
    psraw m2, 2
    pavgw m0, m6
    pavgw m2, m6
%if mmsize == 8
    packuswb m0, m0
    packuswb m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vw_m]
%endif
    pxor m6, m6
    mova m5, [bilinear_filter_vw+mxq-1*16]
    neg mxq
    mova m4, [bilinear_filter_vw+mxq+7*16]

.nextrow:
    movh m0, [srcq+srcstrideq*0+0]
    movh m1, [srcq+srcstrideq*0+1]
    movh m2, [srcq+srcstrideq*1+0]
    movh m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw m0, m4
    pmullw m1, m5
    pmullw m2, m4
    pmullw m3, m5
    paddsw m0, m1
    paddsw m2, m3
    psraw m0, 2
    psraw m2, 2
    pavgw m0, m6
    pavgw m2, m6
%if mmsize == 8
    packuswb m0, m0
    packuswb m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
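
; SSSE3 variant: the (8-a, a) weight pairs live as bytes in bilinear_filter_vb,
; so a single pmaddubsw replaces the widen/pmullw/paddsw sequence above.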
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vb_m]
%endif
    pxor m4, m4
    mova m3, [bilinear_filter_vb+myq-16]

.nextrow:
    movh m0, [srcq+srcstrideq*0]
    movh m1, [srcq+srcstrideq*1]
    movh m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw m0, 2
    psraw m1, 2
    pavgw m0, m4
    pavgw m1, m4
%if mmsize == 8
    packuswb m0, m0
    packuswb m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vb_m]
%endif
    pxor m4, m4
    mova m2, [filter_h2_shuf]
    mova m3, [bilinear_filter_vb+mxq-16]

.nextrow:
    movu m0, [srcq+srcstrideq*0]
    movu m1, [srcq+srcstrideq*1]
    pshufb m0, m2
    pshufb m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw m0, 2
    psraw m1, 2
    pavgw m0, m4
    pavgw m1, m4
%if mmsize == 8
    packuswb m0, m0
    packuswb m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
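
; Plain copies for full-pel positions. Note that the SSE version loads with
; movups but stores with movaps, so dst is expected to be 16-byte aligned.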
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq mm0, [srcq+srcstrideq*0]
    movq mm1, [srcq+srcstrideq*1]
    lea srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq mm0, [srcq+srcstrideq*0+0]
    movq mm1, [srcq+srcstrideq*0+8]
    movq mm2, [srcq+srcstrideq*1+0]
    movq mm3, [srcq+srcstrideq*1+8]
    lea srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------
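; The DC term is (block[0] + 4) >> 3, splatted to bytes. Pixels are unsigned,
; so the signed DC is split into a positive part (added with paddusb) and a
; negated part (subtracted with psubusb); ADD_DC applies both to a 4-row tile.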
%macro ADD_DC 4
    %4 m2, [dst1q+%3]
    %4 m3, [dst1q+strideq+%3]
    %4 m4, [dst2q+%3]
    %4 m5, [dst2q+strideq+%3]
    paddusb m2, %1
    paddusb m3, %1
    paddusb m4, %1
    paddusb m5, %1
    psubusb m2, %2
    psubusb m3, %2
    psubusb m4, %2
    psubusb m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd m0, [blockq]

    ; calculate DC
    paddw m0, [pw_4]
    pxor m1, m1
    psraw m0, 3
    movd [blockq], m1
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklwd m0, m0
    punpcklwd m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, movh
    RET

INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd m0, [blockq]
    pxor m1, m1

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    movd m2, [dst1q]
    movd m3, [dst1q+strideq]
    movd m4, [dst2q]
    movd m5, [dst2q+strideq]
    psraw m0, 3
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
    punpckldq m2, m3
    punpckldq m4, m5
    punpcklbw m2, m1
    punpcklbw m4, m1
    paddw m2, m0
    paddw m4, m0
    packuswb m2, m4
    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd m0, [blockq+32*0]       ; A
    movd m1, [blockq+32*2]       ; C
    punpcklwd m0, [blockq+32*1]  ; A B
    punpcklwd m1, [blockq+32*3]  ; C D
    punpckldq m0, m1             ; A B C D
    pxor m6, m6

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw m0, 3
    psubw m6, m0
    packuswb m0, m0
    packuswb m6, m6
    punpcklbw m0, m0             ; AABBCCDD
    punpcklbw m6, m6             ; AABBCCDD
    movq m1, m0
    movq m7, m6
    punpcklbw m0, m0             ; AAAABBBB
    punpckhbw m1, m1             ; CCCCDDDD
    punpcklbw m6, m6             ; AAAABBBB
    punpckhbw m7, m7             ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    ADD_DC m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd m0, [blockq+32*0]       ; A
    movd m1, [blockq+32*2]       ; C
    punpcklwd m0, [blockq+32*1]  ; A B
    punpcklwd m1, [blockq+32*3]  ; C D
    punpckldq m0, m1             ; A B C D
    pxor m1, m1

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw m0, 3
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd m0, [blockq+32*0]       ; A
    movd m1, [blockq+32*2]       ; C
    punpcklwd m0, [blockq+32*1]  ; A B
    punpcklwd m1, [blockq+32*3]  ; C D
    punpckldq m0, m1             ; A B C D
    pxor m6, m6

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw m0, 3
    psubw m6, m0
    packuswb m0, m0
    packuswb m6, m6
    punpcklbw m0, m0             ; AABBCCDD
    punpcklbw m6, m6             ; AABBCCDD
    movq m1, m0
    movq m7, m6
    punpcklbw m0, m0             ; AAAABBBB
    punpckhbw m1, m1             ; CCCCDDDD
    punpcklbw m6, m6             ; AAAABBBB
    punpckhbw m7, m7             ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    lea dst1q, [dst1q+strideq*4]
    lea dst2q, [dst2q+strideq*4]
    ADD_DC m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
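; (20091 = round(65536 * (sqrt(2)*cos(pi/8) - 1)) and 35468 = round(65536 *
;  sqrt(2)*sin(pi/8)); 35468 does not fit in a signed word, so the input is
;  doubled and then multiplied by 17734 = 35468/2 instead)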
%macro VP8_MULTIPLY_SUMSUB 4
    mova %3, %1
    mova %4, %2
    pmulhw %3, m6    ; 20091(1)
    pmulhw %4, m6    ; 20091(2)
    paddw %3, %1
    paddw %4, %2
    paddw %1, %1
    paddw %2, %2
    pmulhw %1, m7    ; 35468(1)
    pmulhw %2, m7    ; 35468(2)
    psubw %1, %4
    paddw %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA w, %3, %1, %5                 ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6  ; t2, t3
    SUMSUB_BA w, %4, %3, %5                 ; tmp0, tmp3
    SUMSUB_BA w, %2, %1, %5                 ; tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq m0, [blockq+ 0]
    movq m1, [blockq+ 8]
    movq m2, [blockq+16]
    movq m3, [blockq+24]
    movq m6, [pw_20091]
    movq m7, [pw_17734]
%if cpuflag(sse)
    xorps xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    ; store
    pxor m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------
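; Inverse WHT of the 16 luma DC values: two passes of the 4-point Hadamard
; butterfly with a transpose in between, then SCATTER_WHT writes one result
; word into slot 0 of each of the 16 luma blocks (which sit 32 bytes apart).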
%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr dc1d, 16
    shr dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr dc1d, 16
    shr dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq m0, [dc1q]
    movq m1, [dc1q+8]
    movq m2, [dc1q+16]
    movq m3, [dc1q+24]
%if cpuflag(sse)
    xorps xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor m4, m4
    movq [dc1q+ 0], m4
    movq [dc1q+ 8], m4
    movq [dc1q+16], m4
    movq [dc1q+24], m4
%endif
    HADAMARD4_1D 0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, [pw_3]
    HADAMARD4_1D 0, 1, 2, 3
    psraw m0, 3
    psraw m1, 3
    psraw m2, 3
    psraw m3, 3
    SCATTER_WHT 0, 1, 0
    SCATTER_WHT 2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT