You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1215 lines
34KB

  ;******************************************************************************
  ;* VP8 MMXEXT optimizations
  ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  ;* Copyright (c) 2010 Fiona Glaser <fiona@x264.com>
  ;*
  ;* This file is part of Libav.
  ;*
  ;* Libav is free software; you can redistribute it and/or
  ;* modify it under the terms of the GNU Lesser General Public
  ;* License as published by the Free Software Foundation; either
  ;* version 2.1 of the License, or (at your option) any later version.
  ;*
  ;* Libav is distributed in the hope that it will be useful,
  ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  ;* Lesser General Public License for more details.
  ;*
  ;* You should have received a copy of the GNU Lesser General Public
  ;* License along with Libav; if not, write to the Free Software
  ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  ;******************************************************************************
  %include "libavutil/x86/x86util.asm"

  SECTION_RODATA

  ; 4-tap filter coefficients as interleaved signed 16-bit pairs (pmaddwd
  ; operands). Each row is 16 bytes; a filter occupies two consecutive rows,
  ; (F0,F1) then (F2,F3). put_vp8_epel4_h4 (mmxext) reads the rows at byte
  ; offsets mx*16-16 and mx*16, i.e. row indices mx-1 and mx, so the filter
  ; stored in rows 2k/2k+1 is the one selected by mx = 2k+1.
  fourtap_filter_hw_m: times 4 dw  -6, 123
                       times 4 dw  12,  -1
                       times 4 dw  -9,  93
                       times 4 dw  50,  -6
                       times 4 dw  -6,  50
                       times 4 dw  93,  -9
                       times 4 dw  -1,  12
                       times 4 dw 123,  -6

  ; 6-tap filter coefficients as interleaved signed 16-bit pairs. A filter
  ; occupies three consecutive rows: (F0,F1), (F2,F3), (F4,F5).
  ; put_vp8_epel4_h6 (mmxext) reads rows at byte offsets mx*24-48, -32 and -16,
  ; so the filters start at offset 0 for mx = 2, 48 for mx = 4, 96 for mx = 6.
  sixtap_filter_hw_m:  times 4 dw   2, -11
                       times 4 dw 108,  36
                       times 4 dw  -8,   1
                       times 4 dw   3, -16
                       times 4 dw  77,  77
                       times 4 dw -16,   3
                       times 4 dw   1,  -8
                       times 4 dw  36, 108
                       times 4 dw -11,   2
  ; 4-tap filter coefficients as interleaved signed byte pairs (the signed
  ; operand of pmaddubsw). Same row layout and indexing as fourtap_filter_hw_m:
  ; two 16-byte rows per filter, (F0,F1) then (F2,F3), read by the SSSE3 code
  ; at offsets m*16-16 and m*16 where m is mx or my.
  fourtap_filter_hb_m: times 8 db  -6, 123
                       times 8 db  12,  -1
                       times 8 db  -9,  93
                       times 8 db  50,  -6
                       times 8 db  -6,  50
                       times 8 db  93,  -9
                       times 8 db  -1,  12
                       times 8 db 123,  -6

  ; 6-tap filter coefficients as interleaved signed byte pairs, three 16-byte
  ; rows per filter. The taps are paired the way FILTER_SSSE3 interleaves the
  ; pixels: (F0,F5), (F1,F2), (F3,F4) -- compare with sixtap_filter_v_m, which
  ; lists the same coefficients in natural F0..F5 order.
  sixtap_filter_hb_m:  times 8 db   2,   1
                       times 8 db -11, 108
                       times 8 db  36,  -8
                       times 8 db   3,   3
                       times 8 db -16,  77
                       times 8 db  77, -16
                       times 8 db   1,   2
                       times 8 db  -8,  36
                       times 8 db 108, -11
  ; 4-tap filter coefficients, one coefficient per 16-byte row, replicated into
  ; all eight words (pmullw operands). Four rows (64 bytes) per filter, in
  ; F0..F3 order. Users index it with m*32-32, so filter k starts at row 4k and
  ; is selected by m = 2k+1 (m being mx or my).
  fourtap_filter_v_m:  times 8 dw  -6
                       times 8 dw 123
                       times 8 dw  12
                       times 8 dw  -1
                       times 8 dw  -9
                       times 8 dw  93
                       times 8 dw  50
                       times 8 dw  -6
                       times 8 dw  -6
                       times 8 dw  50
                       times 8 dw  93
                       times 8 dw  -9
                       times 8 dw  -1
                       times 8 dw  12
                       times 8 dw 123
                       times 8 dw  -6

  ; 6-tap filter coefficients, one coefficient per 16-byte row, replicated into
  ; all eight words. Six rows (96 bytes) per filter, in F0..F5 order. Users
  ; index it with m*48-96, so the filters are selected by m = 2, 4, 6.
  sixtap_filter_v_m:   times 8 dw   2
                       times 8 dw -11
                       times 8 dw 108
                       times 8 dw  36
                       times 8 dw  -8
                       times 8 dw   1
                       times 8 dw   3
                       times 8 dw -16
                       times 8 dw  77
                       times 8 dw  77
                       times 8 dw -16
                       times 8 dw   3
                       times 8 dw   1
                       times 8 dw  -8
                       times 8 dw  36
                       times 8 dw 108
                       times 8 dw -11
                       times 8 dw   2
  ; Bilinear weights as words: row k-1 holds the weight k (k = 1..7) replicated
  ; eight times. The non-SSSE3 bilinear code loads row m-1 (weight m) and, after
  ; negating the scaled index, row 7-m (weight 8-m).
  bilinear_filter_vw_m: times 8 dw 1
                        times 8 dw 2
                        times 8 dw 3
                        times 8 dw 4
                        times 8 dw 5
                        times 8 dw 6
                        times 8 dw 7

  ; Bilinear weights as byte pairs for pmaddubsw: row m-1 holds (8-m, m), so a
  ; single multiply-add of interleaved pixels (a, b) yields a*(8-m) + b*m.
  bilinear_filter_vb_m: times 8 db 7, 1
                        times 8 db 6, 2
                        times 8 db 5, 3
                        times 8 db 4, 4
                        times 8 db 3, 5
                        times 8 db 2, 6
                        times 8 db 1, 7
  ; Table addressing.
  ;
  ; The functions below address the tables as [<table>+index*scale+disp].
  ; With PIC defined, every table name is aliased to picregq: each function
  ; first does "lea picregq, [<table>_m]" for the one table it needs and then
  ; uses the alias as a base register. Without PIC the alias is simply the
  ; table symbol itself.
  ;
  ; npicregs is the number of extra general-purpose registers a function must
  ; request from cglobal to hold that base pointer (1 with PIC, 0 without).
  %ifdef PIC
  %define fourtap_filter_hw  picregq
  %define sixtap_filter_hw   picregq
  %define fourtap_filter_hb  picregq
  %define sixtap_filter_hb   picregq
  %define fourtap_filter_v   picregq
  %define sixtap_filter_v    picregq
  %define bilinear_filter_vw picregq
  %define bilinear_filter_vb picregq
  %define npicregs 1
  %else
  %define fourtap_filter_hw  fourtap_filter_hw_m
  %define sixtap_filter_hw   sixtap_filter_hw_m
  %define fourtap_filter_hb  fourtap_filter_hb_m
  %define sixtap_filter_hb   sixtap_filter_hb_m
  %define fourtap_filter_v   fourtap_filter_v_m
  %define sixtap_filter_v    sixtap_filter_v_m
  %define bilinear_filter_vw bilinear_filter_vw_m
  %define bilinear_filter_vb bilinear_filter_vb_m
  %define npicregs 0
  %endif
  ; pshufb masks that turn a run of source bytes into the byte pairs consumed
  ; by pmaddubsw. For output pixel i (i = 0..7) each mask selects:
  ;   filter_h2_shuf : bytes (i,   i+1)
  ;   filter_h4_shuf : bytes (i+2, i+3)
  ;   filter_h6_shuf1: bytes (i,   i+5)
  ;   filter_h6_shuf2: bytes (i+1, i+2)
  ;   filter_h6_shuf3: bytes (i+3, i+4)
  filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5,  6, 6,  7,  7,  8
  filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7,  8, 8,  9,  9, 10

  filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11,  7, 12
  filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,  7, 7,  8,  8,  9
  filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8,  9, 9, 10, 10, 11

  ; IDCT multipliers, loaded into m6/m7 by vp8_idct_add. 17734 is half of
  ; 35468: VP8_MULTIPLY_SUMSUB doubles the input before multiplying by it.
  pw_20091: times 4 dw 20091
  pw_17734: times 4 dw 17734

  ; Word constants defined outside this file.
  cextern pw_3
  cextern pw_4
  cextern pw_64
  cextern pw_256

  SECTION .text
  ;-------------------------------------------------------------------------------
  ; subpel MC functions:
  ;
  ; void ff_put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, ptrdiff_t deststride,
  ;                                                 uint8_t *src, ptrdiff_t srcstride,
  ;                                                 int height, int mx, int my);
  ;-------------------------------------------------------------------------------

  ; FILTER_SSSE3 <width>
  ;
  ; Emits put_vp8_epel<width>_{h6,h4,v4,v6} using pshufb/pmaddubsw/pmulhrsw.
  ; Instantiated below for width 4 (MMX registers) and width 8 (XMM registers).
  ;
  ; Every function handles one output row per loop iteration and loops with
  ; "dec heightd / jg", so the body always runs at least once.
  ; Rounding: pmulhrsw by pw_256 computes (x + 64) >> 7.
  ; In the V-only functions the sixth argument register (mx in the prototype
  ; above) is not needed as an input and is named picreg instead.
  %macro FILTER_SSSE3 1
  ; --- horizontal 6-tap -------------------------------------------------------
  cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
      lea       mxd, [mxq*3]                ; mx*3; scaled by 8 below => 24 bytes per mx step
      mova      m3, [filter_h6_shuf2]
      mova      m4, [filter_h6_shuf3]
  %ifdef PIC
      lea       picregq, [sixtap_filter_hb_m]
  %endif
      mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
      mova      m6, [sixtap_filter_hb+mxq*8-32]
      mova      m7, [sixtap_filter_hb+mxq*8-16]

  .nextrow:
      movu      m0, [srcq-2]
      mova      m1, m0
      mova      m2, m0
  %if mmsize == 8
  ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
  ; shuffle with a memory operand
      punpcklbw m0, [srcq+3]                ; pairs (i, i+5), same as filter_h6_shuf1
  %else
      pshufb    m0, [filter_h6_shuf1]
  %endif
      pshufb    m1, m3
      pshufb    m2, m4
      pmaddubsw m0, m5
      pmaddubsw m1, m6
      pmaddubsw m2, m7
      paddsw    m0, m1
      paddsw    m0, m2
      pmulhrsw  m0, [pw_256]
      packuswb  m0, m0
      movh      [dstq], m0                  ; store

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET

  ; --- horizontal 4-tap -------------------------------------------------------
  cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
      shl       mxd, 4                      ; 16 bytes per table row
      mova      m2, [pw_256]
      mova      m3, [filter_h2_shuf]
      mova      m4, [filter_h4_shuf]
  %ifdef PIC
      lea       picregq, [fourtap_filter_hb_m]
  %endif
      mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
      mova      m6, [fourtap_filter_hb+mxq]

  .nextrow:
      movu      m0, [srcq-1]
      mova      m1, m0
      pshufb    m0, m3
      pshufb    m1, m4
      pmaddubsw m0, m5
      pmaddubsw m1, m6
      paddsw    m0, m1
      pmulhrsw  m0, m2
      packuswb  m0, m0
      movh      [dstq], m0                  ; store

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET

  ; --- vertical 4-tap ---------------------------------------------------------
  ; m0..m2 hold the three most recent source rows and are rotated each
  ; iteration as the new row is read into m3.
  cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
      shl       myd, 4
  %ifdef PIC
      lea       picregq, [fourtap_filter_hb_m]
  %endif
      mova      m5, [fourtap_filter_hb+myq-16]
      mova      m6, [fourtap_filter_hb+myq]
      mova      m7, [pw_256]

      ; read 3 lines
      sub       srcq, srcstrideq
      movh      m0, [srcq]
      movh      m1, [srcq+  srcstrideq]
      movh      m2, [srcq+2*srcstrideq]
      add       srcq, srcstrideq

  .nextrow:
      movh      m3, [srcq+2*srcstrideq]     ; read new row
      mova      m4, m0
      mova      m0, m1
      punpcklbw m4, m1
      mova      m1, m2
      punpcklbw m2, m3
      pmaddubsw m4, m5
      pmaddubsw m2, m6
      paddsw    m4, m2
      mova      m2, m3
      pmulhrsw  m4, m7
      packuswb  m4, m4
      movh      [dstq], m4

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET

  ; --- vertical 6-tap ---------------------------------------------------------
  ; m0..m4 hold the five most recent source rows; myq is turned into a pointer
  ; just past the selected filter so the three coefficient rows are read from
  ; memory at [myq-48], [myq-32] and [myq-16].
  cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
      lea       myd, [myq*3]
  %ifdef PIC
      lea       picregq, [sixtap_filter_hb_m]
  %endif
      lea       myq, [sixtap_filter_hb+myq*8]

      ; read 5 lines
      sub       srcq, srcstrideq
      sub       srcq, srcstrideq
      movh      m0, [srcq]
      movh      m1, [srcq+srcstrideq]
      movh      m2, [srcq+srcstrideq*2]
      lea       srcq, [srcq+srcstrideq*2]
      add       srcq, srcstrideq
      movh      m3, [srcq]
      movh      m4, [srcq+srcstrideq]

  .nextrow:
      movh      m5, [srcq+2*srcstrideq]     ; read new row
      mova      m6, m0
      punpcklbw m6, m5                      ; rows (0,5)
      mova      m0, m1
      punpcklbw m1, m2                      ; rows (1,2)
      mova      m7, m3
      punpcklbw m7, m4                      ; rows (3,4)
      pmaddubsw m6, [myq-48]
      pmaddubsw m1, [myq-32]
      pmaddubsw m7, [myq-16]
      paddsw    m6, m1
      paddsw    m6, m7
      mova      m1, m2
      mova      m2, m3
      pmulhrsw  m6, [pw_256]
      mova      m3, m4
      packuswb  m6, m6
      mova      m4, m5
      movh      [dstq], m6

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET
  %endmacro

  INIT_MMX ssse3
  FILTER_SSSE3 4
  INIT_XMM ssse3
  FILTER_SSSE3 8
  ; 4x4 block, H-only 4-tap filter
  ;
  ; MMXEXT version: pixels are widened to words and multiplied with pmaddwd,
  ; two output pixels per multiply. mm4/mm5 hold the (F0,F1)/(F2,F3) word
  ; pairs, mm6 is zero, mm7 is the rounding constant 64. One row per iteration.
  INIT_MMX mmxext
  cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
      shl       mxd, 4
  %ifdef PIC
      lea       picregq, [fourtap_filter_hw_m]
  %endif
      movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
      movq      mm5, [fourtap_filter_hw+mxq]
      movq      mm7, [pw_64]
      pxor      mm6, mm6

  .nextrow:
      movq      mm1, [srcq-1]               ; (ABCDEFGH) load 8 horizontal pixels

      ; first set of 2 pixels
      movq      mm2, mm1                    ; byte ABCD..
      punpcklbw mm1, mm6                    ; byte->word ABCD
      pshufw    mm0, mm2, 9                 ; byte CDEF..
      punpcklbw mm0, mm6                    ; byte->word CDEF
      pshufw    mm3, mm1, 0x94              ; word ABBC
      pshufw    mm1, mm0, 0x94              ; word CDDE
      pmaddwd   mm3, mm4                    ; multiply 2px with F0/F1
      movq      mm0, mm1                    ; backup for second set of pixels
      pmaddwd   mm1, mm5                    ; multiply 2px with F2/F3
      paddd     mm3, mm1                    ; finish 1st 2px

      ; second set of 2 pixels, use backup of above
      punpckhbw mm2, mm6                    ; byte->word EFGH
      pmaddwd   mm0, mm4                    ; multiply backed up 2px with F0/F1
      pshufw    mm1, mm2, 0x94              ; word EFFG
      pmaddwd   mm1, mm5                    ; multiply 2px with F2/F3
      paddd     mm0, mm1                    ; finish 2nd 2px

      ; merge two sets of 2 pixels into one set of 4, round/clip/store
      packssdw  mm3, mm0                    ; merge dword->word (4px)
      paddsw    mm3, mm7                    ; rounding
      psraw     mm3, 7
      packuswb  mm3, mm6                    ; clip and word->bytes
      movd      [dstq], mm3                 ; store

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET
  ; 4x4 block, H-only 6-tap filter
  ;
  ; MMXEXT version, same scheme as put_vp8_epel4_h4 with a third coefficient
  ; pair: mm4/mm5/mm6 hold (F0,F1)/(F2,F3)/(F4,F5), mm7 is the rounding
  ; constant 64. mm3 serves as the zero register for unpacking but is also used
  ; as a temporary inside the loop, so it is cleared again before it is needed
  ; for the second half and for the final pack.
  INIT_MMX mmxext
  cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
      lea       mxd, [mxq*3]
  %ifdef PIC
      lea       picregq, [sixtap_filter_hw_m]
  %endif
      movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
      movq      mm5, [sixtap_filter_hw+mxq*8-32]
      movq      mm6, [sixtap_filter_hw+mxq*8-16]
      movq      mm7, [pw_64]
      pxor      mm3, mm3

  .nextrow:
      movq      mm1, [srcq-2]               ; (ABCDEFGH) load 8 horizontal pixels

      ; first set of 2 pixels
      movq      mm2, mm1                    ; byte ABCD..
      punpcklbw mm1, mm3                    ; byte->word ABCD
      pshufw    mm0, mm2, 0x9               ; byte CDEF..
      punpckhbw mm2, mm3                    ; byte->word EFGH
      punpcklbw mm0, mm3                    ; byte->word CDEF
      pshufw    mm1, mm1, 0x94              ; word ABBC
      pshufw    mm2, mm2, 0x94              ; word EFFG
      pmaddwd   mm1, mm4                    ; multiply 2px with F0/F1
      pshufw    mm3, mm0, 0x94              ; word CDDE
      movq      mm0, mm3                    ; backup for second set of pixels
      pmaddwd   mm3, mm5                    ; multiply 2px with F2/F3
      paddd     mm1, mm3                    ; add to 1st 2px cache
      movq      mm3, mm2                    ; backup for second set of pixels
      pmaddwd   mm2, mm6                    ; multiply 2px with F4/F5
      paddd     mm1, mm2                    ; finish 1st 2px

      ; second set of 2 pixels, use backup of above
      movd      mm2, [srcq+3]               ; byte FGHI (prevent overreads)
      pmaddwd   mm0, mm4                    ; multiply 1st backed up 2px with F0/F1
      pmaddwd   mm3, mm5                    ; multiply 2nd backed up 2px with F2/F3
      paddd     mm0, mm3                    ; add to 2nd 2px cache
      pxor      mm3, mm3                    ; mm3 is the zero register again
      punpcklbw mm2, mm3                    ; byte->word FGHI
      pshufw    mm2, mm2, 0xE9              ; word GHHI
      pmaddwd   mm2, mm6                    ; multiply 2px with F4/F5
      paddd     mm0, mm2                    ; finish 2nd 2px

      ; merge two sets of 2 pixels into one set of 4, round/clip/store
      packssdw  mm1, mm0                    ; merge dword->word (4px)
      paddsw    mm1, mm7                    ; rounding
      psraw     mm1, 7
      packuswb  mm1, mm3                    ; clip and word->bytes
      movd      [dstq], mm1                 ; store

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET
  ; 8-wide block, H-only 4-tap filter, SSE2.
  ;
  ; Loads the four byte-shifted 8-pixel windows src-1..src+2, widens each to
  ; words and multiplies it by one broadcast coefficient from fourtap_filter_v.
  ; mxq is turned into a pointer to the selected filter. When registers m8/m9
  ; exist (%ifdef m8) the last two coefficients are kept in registers;
  ; otherwise they are read from memory inside the loop.
  ; m7 = 0, m4 = rounding constant 64. One row per iteration.
  INIT_XMM sse2
  cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
      shl       mxd, 5
  %ifdef PIC
      lea       picregq, [fourtap_filter_v_m]
  %endif
      lea       mxq, [fourtap_filter_v+mxq-32]
      pxor      m7, m7
      mova      m4, [pw_64]
      mova      m5, [mxq+ 0]
      mova      m6, [mxq+16]
  %ifdef m8
      mova      m8, [mxq+32]
      mova      m9, [mxq+48]
  %endif
  .nextrow:
      movq      m0, [srcq-1]
      movq      m1, [srcq-0]
      movq      m2, [srcq+1]
      movq      m3, [srcq+2]
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7
      punpcklbw m3, m7
      pmullw    m0, m5
      pmullw    m1, m6
  %ifdef m8
      pmullw    m2, m8
      pmullw    m3, m9
  %else
      pmullw    m2, [mxq+32]
      pmullw    m3, [mxq+48]
  %endif
      paddsw    m0, m1
      paddsw    m2, m3
      paddsw    m0, m2
      paddsw    m0, m4                      ; rounding
      psraw     m0, 7
      packuswb  m0, m7
      movh      [dstq], m0                  ; store

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET
  ; 8-wide block, H-only 6-tap filter, SSE2.
  ;
  ; Same scheme as put_vp8_epel8_h4 with six windows src-2..src+3 and six
  ; broadcast coefficients from sixtap_filter_v (mx*48-96 selects the filter).
  ; When m8 exists, all six coefficients are cached in m8..m13; otherwise they
  ; are read from memory each row. m7 = 0, m6 = rounding constant 64.
  INIT_XMM sse2
  cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
      lea       mxd, [mxq*3]
      shl       mxd, 4
  %ifdef PIC
      lea       picregq, [sixtap_filter_v_m]
  %endif
      lea       mxq, [sixtap_filter_v+mxq-96]
      pxor      m7, m7
      mova      m6, [pw_64]
  %ifdef m8
      mova      m8, [mxq+ 0]
      mova      m9, [mxq+16]
      mova      m10, [mxq+32]
      mova      m11, [mxq+48]
      mova      m12, [mxq+64]
      mova      m13, [mxq+80]
  %endif
  .nextrow:
      movq      m0, [srcq-2]
      movq      m1, [srcq-1]
      movq      m2, [srcq-0]
      movq      m3, [srcq+1]
      movq      m4, [srcq+2]
      movq      m5, [srcq+3]
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7
      punpcklbw m3, m7
      punpcklbw m4, m7
      punpcklbw m5, m7
  %ifdef m8
      pmullw    m0, m8
      pmullw    m1, m9
      pmullw    m2, m10
      pmullw    m3, m11
      pmullw    m4, m12
      pmullw    m5, m13
  %else
      pmullw    m0, [mxq+ 0]
      pmullw    m1, [mxq+16]
      pmullw    m2, [mxq+32]
      pmullw    m3, [mxq+48]
      pmullw    m4, [mxq+64]
      pmullw    m5, [mxq+80]
  %endif
      ; saturating adds: the summation order is part of the result, keep it
      paddsw    m1, m4
      paddsw    m0, m5
      paddsw    m1, m2
      paddsw    m0, m3
      paddsw    m0, m1
      paddsw    m0, m6                      ; rounding
      psraw     m0, 7
      packuswb  m0, m7
      movh      [dstq], m0                  ; store

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET
  ; FILTER_V <width>
  ;
  ; Emits put_vp8_epel<width>_v4 and put_vp8_epel<width>_v6 using word
  ; multiplies (pmullw) with the broadcast coefficient tables. Instantiated
  ; below for width 4 (mmxext) and width 8 (sse2).
  ;
  ; The previous source rows are kept widened to words in registers and
  ; rotated each iteration; only one new row is loaded per output row.
  ; The sixth argument register is named picreg (mx is not used here).
  %macro FILTER_V 1
  ; <width>-wide block, V-only 4-tap filter
  ; m0..m2 = previous three rows (words), m5 = F3, m6 = 64, m7 = 0.
  cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
      shl       myd, 5
  %ifdef PIC
      lea       picregq, [fourtap_filter_v_m]
  %endif
      lea       myq, [fourtap_filter_v+myq-32]
      mova      m6, [pw_64]
      pxor      m7, m7
      mova      m5, [myq+48]

      ; read 3 lines
      sub       srcq, srcstrideq
      movh      m0, [srcq]
      movh      m1, [srcq+  srcstrideq]
      movh      m2, [srcq+2*srcstrideq]
      add       srcq, srcstrideq
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7

  .nextrow:
      ; first calculate negative taps (to prevent losing positive overflows)
      movh      m4, [srcq+2*srcstrideq]     ; read new row
      punpcklbw m4, m7
      mova      m3, m4                      ; keep unscaled copy for the rotation
      pmullw    m0, [myq+0]
      pmullw    m4, m5
      paddsw    m4, m0

      ; then calculate positive taps
      mova      m0, m1
      pmullw    m1, [myq+16]
      paddsw    m4, m1
      mova      m1, m2
      pmullw    m2, [myq+32]
      paddsw    m4, m2
      mova      m2, m3

      ; round/clip/store
      paddsw    m4, m6
      psraw     m4, 7
      packuswb  m4, m7
      movh      [dstq], m4

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET

  ; <width>-wide block, V-only 6-tap filter
  ; m0..m4 = previous five rows (words), m7 = 0; coefficients are read from
  ; memory at [myq+0..80], the rounding constant from [pw_64].
  cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
      shl       myd, 4
      lea       myq, [myq*3]
  %ifdef PIC
      lea       picregq, [sixtap_filter_v_m]
  %endif
      lea       myq, [sixtap_filter_v+myq-96]
      pxor      m7, m7

      ; read 5 lines
      sub       srcq, srcstrideq
      sub       srcq, srcstrideq
      movh      m0, [srcq]
      movh      m1, [srcq+srcstrideq]
      movh      m2, [srcq+srcstrideq*2]
      lea       srcq, [srcq+srcstrideq*2]
      add       srcq, srcstrideq
      movh      m3, [srcq]
      movh      m4, [srcq+srcstrideq]
      punpcklbw m0, m7
      punpcklbw m1, m7
      punpcklbw m2, m7
      punpcklbw m3, m7
      punpcklbw m4, m7

  .nextrow:
      ; first calculate negative taps (to prevent losing positive overflows)
      mova      m5, m1
      pmullw    m5, [myq+16]
      mova      m6, m4
      pmullw    m6, [myq+64]
      paddsw    m6, m5

      ; then calculate positive taps
      movh      m5, [srcq+2*srcstrideq]     ; read new row
      punpcklbw m5, m7
      pmullw    m0, [myq+0]
      paddsw    m6, m0
      mova      m0, m1
      mova      m1, m2
      pmullw    m2, [myq+32]
      paddsw    m6, m2
      mova      m2, m3
      pmullw    m3, [myq+48]
      paddsw    m6, m3
      mova      m3, m4
      mova      m4, m5
      pmullw    m5, [myq+80]
      paddsw    m6, m5

      ; round/clip/store
      paddsw    m6, [pw_64]
      psraw     m6, 7
      packuswb  m6, m7
      movh      [dstq], m6

      ; go to next line
      add       dstq, dststrideq
      add       srcq, srcstrideq
      dec       heightd                     ; next row
      jg .nextrow
      REP_RET
  %endmacro

  INIT_MMX mmxext
  FILTER_V 4
  INIT_XMM sse2
  FILTER_V 8
  ; FILTER_BILINEAR <width>
  ;
  ; Emits put_vp8_bilinear<width>_v and put_vp8_bilinear<width>_h for the
  ; currently selected instruction set. Instantiated below for mmxext/sse2
  ; (word multiplies) and for ssse3 (pmaddubsw on interleaved bytes).
  ;
  ; Both functions produce TWO output rows per loop iteration and step height
  ; by 2. Each output is a*(8-m) + b*m of two neighbouring pixels, rounded by
  ; "psraw 2" followed by "pavgw with zero", which together are (sum + 4) >> 3.
  ;
  ; Layout note: for each function the cglobal line, setup and per-row
  ; arithmetic live inside the %if/%else on cpuflag(ssse3); the pointer
  ; advance, loop branch and return after the %endif are shared by both
  ; variants.
  %macro FILTER_BILINEAR 1
  ; --- vertical ---------------------------------------------------------------
  %if cpuflag(ssse3)
  cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
      shl       myd, 4
  %ifdef PIC
      lea       picregq, [bilinear_filter_vb_m]
  %endif
      pxor      m4, m4
      mova      m3, [bilinear_filter_vb+myq-16] ; byte pairs (8-my, my)
  .nextrow:
      movh      m0, [srcq+srcstrideq*0]
      movh      m1, [srcq+srcstrideq*1]
      movh      m2, [srcq+srcstrideq*2]
      punpcklbw m0, m1                      ; rows (0,1) interleaved
      punpcklbw m1, m2                      ; rows (1,2) interleaved
      pmaddubsw m0, m3
      pmaddubsw m1, m3
      psraw     m0, 2
      psraw     m1, 2
      pavgw     m0, m4
      pavgw     m1, m4
  %if mmsize==8
      packuswb  m0, m0
      packuswb  m1, m1
      movh      [dstq+dststrideq*0], m0
      movh      [dstq+dststrideq*1], m1
  %else
      packuswb  m0, m1
      movh      [dstq+dststrideq*0], m0
      movhps    [dstq+dststrideq*1], m0
  %endif
  %else ; cpuflag(ssse3)
  cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
      shl       myd, 4
  %ifdef PIC
      lea       picregq, [bilinear_filter_vw_m]
  %endif
      pxor      m6, m6
      mova      m5, [bilinear_filter_vw+myq-1*16] ; weight my
      neg       myq
      mova      m4, [bilinear_filter_vw+myq+7*16] ; weight 8-my
  .nextrow:
      movh      m0, [srcq+srcstrideq*0]
      movh      m1, [srcq+srcstrideq*1]
      movh      m3, [srcq+srcstrideq*2]
      punpcklbw m0, m6
      punpcklbw m1, m6
      punpcklbw m3, m6
      mova      m2, m1                      ; row 1 is the top input of output row 1
      pmullw    m0, m4
      pmullw    m1, m5
      pmullw    m2, m4
      pmullw    m3, m5
      paddsw    m0, m1
      paddsw    m2, m3
      psraw     m0, 2
      psraw     m2, 2
      pavgw     m0, m6
      pavgw     m2, m6
  %if mmsize == 8
      packuswb  m0, m0
      packuswb  m2, m2
      movh      [dstq+dststrideq*0], m0
      movh      [dstq+dststrideq*1], m2
  %else
      packuswb  m0, m2
      movh      [dstq+dststrideq*0], m0
      movhps    [dstq+dststrideq*1], m0
  %endif
  %endif ; cpuflag(ssse3)

      ; shared loop tail: advance two rows
      lea       dstq, [dstq+dststrideq*2]
      lea       srcq, [srcq+srcstrideq*2]
      sub       heightd, 2
      jg .nextrow
      REP_RET

  ; --- horizontal -------------------------------------------------------------
  %if cpuflag(ssse3)
  cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
      shl       mxd, 4
  %ifdef PIC
      lea       picregq, [bilinear_filter_vb_m]
  %endif
      pxor      m4, m4
      mova      m2, [filter_h2_shuf]        ; byte pairs (i, i+1)
      mova      m3, [bilinear_filter_vb+mxq-16]
  .nextrow:
      movu      m0, [srcq+srcstrideq*0]
      movu      m1, [srcq+srcstrideq*1]
      pshufb    m0, m2
      pshufb    m1, m2
      pmaddubsw m0, m3
      pmaddubsw m1, m3
      psraw     m0, 2
      psraw     m1, 2
      pavgw     m0, m4
      pavgw     m1, m4
  %if mmsize==8
      packuswb  m0, m0
      packuswb  m1, m1
      movh      [dstq+dststrideq*0], m0
      movh      [dstq+dststrideq*1], m1
  %else
      packuswb  m0, m1
      movh      [dstq+dststrideq*0], m0
      movhps    [dstq+dststrideq*1], m0
  %endif
  %else ; cpuflag(ssse3)
  cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
      shl       mxd, 4
  %ifdef PIC
      lea       picregq, [bilinear_filter_vw_m]
  %endif
      pxor      m6, m6
      mova      m5, [bilinear_filter_vw+mxq-1*16] ; weight mx
      neg       mxq
      mova      m4, [bilinear_filter_vw+mxq+7*16] ; weight 8-mx
  .nextrow:
      movh      m0, [srcq+srcstrideq*0+0]
      movh      m1, [srcq+srcstrideq*0+1]
      movh      m2, [srcq+srcstrideq*1+0]
      movh      m3, [srcq+srcstrideq*1+1]
      punpcklbw m0, m6
      punpcklbw m1, m6
      punpcklbw m2, m6
      punpcklbw m3, m6
      pmullw    m0, m4
      pmullw    m1, m5
      pmullw    m2, m4
      pmullw    m3, m5
      paddsw    m0, m1
      paddsw    m2, m3
      psraw     m0, 2
      psraw     m2, 2
      pavgw     m0, m6
      pavgw     m2, m6
  %if mmsize == 8
      packuswb  m0, m0
      packuswb  m2, m2
      movh      [dstq+dststrideq*0], m0
      movh      [dstq+dststrideq*1], m2
  %else
      packuswb  m0, m2
      movh      [dstq+dststrideq*0], m0
      movhps    [dstq+dststrideq*1], m0
  %endif
  %endif ; cpuflag(ssse3)

      ; shared loop tail: advance two rows
      lea       dstq, [dstq+dststrideq*2]
      lea       srcq, [srcq+srcstrideq*2]
      sub       heightd, 2
      jg .nextrow
      REP_RET
  %endmacro

  INIT_MMX mmxext
  FILTER_BILINEAR 4
  INIT_XMM sse2
  FILTER_BILINEAR 8
  INIT_MMX ssse3
  FILTER_BILINEAR 4
  INIT_XMM ssse3
  FILTER_BILINEAR 8
  ; put_vp8_pixels8 (mmx): plain copy of 8-byte rows from src to dst.
  ; Two rows are copied per iteration and height is stepped by 2; the loop body
  ; always runs at least once.
  INIT_MMX mmx
  cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
  .nextrow:
      movq      mm0, [srcq+srcstrideq*0]
      movq      mm1, [srcq+srcstrideq*1]
      lea       srcq, [srcq+srcstrideq*2]
      movq      [dstq+dststrideq*0], mm0
      movq      [dstq+dststrideq*1], mm1
      lea       dstq, [dstq+dststrideq*2]
      sub       heightd, 2
      jg .nextrow
      REP_RET
  ; put_vp8_pixels16 (mmx): plain copy of 16-byte rows using two 8-byte moves
  ; per row, two rows per iteration. Only assembled when ARCH_X86_32 is set.
  %if ARCH_X86_32
  INIT_MMX mmx
  cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
  .nextrow:
      movq      mm0, [srcq+srcstrideq*0+0]
      movq      mm1, [srcq+srcstrideq*0+8]
      movq      mm2, [srcq+srcstrideq*1+0]
      movq      mm3, [srcq+srcstrideq*1+8]
      lea       srcq, [srcq+srcstrideq*2]
      movq      [dstq+dststrideq*0+0], mm0
      movq      [dstq+dststrideq*0+8], mm1
      movq      [dstq+dststrideq*1+0], mm2
      movq      [dstq+dststrideq*1+8], mm3
      lea       dstq, [dstq+dststrideq*2]
      sub       heightd, 2
      jg .nextrow
      REP_RET
  %endif
  ; put_vp8_pixels16 (sse): plain copy of 16-byte rows, two rows per iteration.
  ; Loads use movups (src may be unaligned); stores use movaps, so every dst
  ; row address must be 16-byte aligned.
  INIT_XMM sse
  cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
  .nextrow:
      movups    xmm0, [srcq+srcstrideq*0]
      movups    xmm1, [srcq+srcstrideq*1]
      lea       srcq, [srcq+srcstrideq*2]
      movaps    [dstq+dststrideq*0], xmm0
      movaps    [dstq+dststrideq*1], xmm1
      lea       dstq, [dstq+dststrideq*2]
      sub       heightd, 2
      jg .nextrow
      REP_RET
  ;-----------------------------------------------------------------------------
  ; void ff_vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
  ;-----------------------------------------------------------------------------

  ; ADD_DC <pos>, <neg>, <offset>, <mov>
  ;
  ; Adds a signed per-pixel DC value to four rows of unsigned bytes using
  ; unsigned-saturating arithmetic: %1 holds max(dc, 0) and %2 holds
  ; max(-dc, 0) as bytes, so "paddusb %1" followed by "psubusb %2" applies
  ; the DC with clamping to 0..255.
  ;   %3 = byte offset added to every row address
  ;   %4 = move instruction used for the loads and stores (movh / mova)
  ; Rows are addressed as dst1q, dst1q+strideq, dst2q, dst2q+strideq.
  ; Clobbers m2..m5.
  %macro ADD_DC 4
      %4        m2, [dst1q+%3]
      %4        m3, [dst1q+strideq+%3]
      %4        m4, [dst2q+%3]
      %4        m5, [dst2q+strideq+%3]
      paddusb   m2, %1
      paddusb   m3, %1
      paddusb   m4, %1
      paddusb   m5, %1
      psubusb   m2, %2
      psubusb   m3, %2
      psubusb   m4, %2
      psubusb   m5, %2
      %4        [dst1q+%3], m2
      %4        [dst1q+strideq+%3], m3
      %4        [dst2q+%3], m4
      %4        [dst2q+strideq+%3], m5
  %endmacro
  ; vp8_idct_dc_add (mmx)
  ;
  ; Computes dc = (block[0] + 4) >> 3, writes zero over the first 4 bytes of
  ; block, and adds dc to a 4x4 pixel area at dst via ADD_DC.
  ; m0 ends up as max(dc,0) and m1 as max(-dc,0), each replicated over the low
  ; four bytes (packuswb clamps negatives to 0, the unpacks replicate).
  INIT_MMX mmx
  cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
      ; load data
      movd      m0, [blockq]

      ; calculate DC
      paddw     m0, [pw_4]
      pxor      m1, m1
      psraw     m0, 3
      movd      [blockq], m1                ; clear the coefficient(s) just read
      psubw     m1, m0                      ; m1 = -dc
      packuswb  m0, m0
      packuswb  m1, m1
      punpcklbw m0, m0
      punpcklbw m1, m1
      punpcklwd m0, m0
      punpcklwd m1, m1

      ; add DC
      DEFINE_ARGS dst1, dst2, stride        ; block pointer register is reused as dst2
      lea       dst2q, [dst1q+strideq*2]
      ADD_DC    m0, m1, 0, movh
      RET
  ; vp8_idct_dc_add (sse4)
  ;
  ; Same result as the mmx version, computed in the word domain: the four
  ; 4-pixel rows are gathered into two registers, widened to words, the
  ; broadcast dc is added with paddw, and packuswb clamps back to bytes.
  ; The four rows are then written out with movd/pextrd.
  INIT_XMM sse4
  cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
      ; load data
      movd      m0, [blockq]
      pxor      m1, m1

      ; calculate DC
      paddw     m0, [pw_4]
      movd      [blockq], m1                ; clear the coefficient(s) just read
      DEFINE_ARGS dst1, dst2, stride        ; block pointer register is reused as dst2
      lea       dst2q, [dst1q+strideq*2]
      movd      m2, [dst1q]
      movd      m3, [dst1q+strideq]
      movd      m4, [dst2q]
      movd      m5, [dst2q+strideq]
      psraw     m0, 3
      pshuflw   m0, m0, 0                   ; broadcast dc over the low four words
      punpcklqdq m0, m0                     ; ... and over all eight
      punpckldq m2, m3                      ; rows 0,1
      punpckldq m4, m5                      ; rows 2,3
      punpcklbw m2, m1
      punpcklbw m4, m1
      paddw     m2, m0
      paddw     m4, m0
      packuswb  m2, m4                      ; rows 0..3 as four dwords
      movd      [dst1q], m2
      pextrd    [dst1q+strideq], m2, 1
      pextrd    [dst2q], m2, 2
      pextrd    [dst2q+strideq], m2, 3
      RET
  ;-----------------------------------------------------------------------------
  ; void ff_vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
  ;-----------------------------------------------------------------------------

  ; vp8_idct_dc_add4y (mmx, x86-32 only)
  ;
  ; Reads the first coefficient of four consecutive blocks (A..D, 32 bytes
  ; apart), computes (dc + 4) >> 3 for each, clears the 4 bytes read from each
  ; block, and adds the results to a 16x4 pixel area: A to columns 0-3, B to
  ; 4-7, C to 8-11, D to 12-15. m0/m1 carry the positive parts, m6/m7 the
  ; negated parts, for ADD_DC's saturating add/subtract.
  %if ARCH_X86_32
  INIT_MMX mmx
  cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
      ; load data
      movd      m0, [blockq+32*0]           ; A
      movd      m1, [blockq+32*2]           ; C
      punpcklwd m0, [blockq+32*1]           ; A B
      punpcklwd m1, [blockq+32*3]           ; C D
      punpckldq m0, m1                      ; A B C D
      pxor      m6, m6

      ; calculate DC
      paddw     m0, [pw_4]
      movd      [blockq+32*0], m6
      movd      [blockq+32*1], m6
      movd      [blockq+32*2], m6
      movd      [blockq+32*3], m6
      psraw     m0, 3
      psubw     m6, m0                      ; m6 = -dc
      packuswb  m0, m0
      packuswb  m6, m6
      punpcklbw m0, m0                      ; AABBCCDD
      punpcklbw m6, m6                      ; AABBCCDD
      movq      m1, m0
      movq      m7, m6
      punpcklbw m0, m0                      ; AAAABBBB
      punpckhbw m1, m1                      ; CCCCDDDD
      punpcklbw m6, m6                      ; AAAABBBB
      punpckhbw m7, m7                      ; CCCCDDDD

      ; add DC
      DEFINE_ARGS dst1, dst2, stride        ; block pointer register is reused as dst2
      lea       dst2q, [dst1q+strideq*2]
      ADD_DC    m0, m6, 0, mova             ; columns 0-7
      ADD_DC    m1, m7, 8, mova             ; columns 8-15
      RET
  %endif
  ; vp8_idct_dc_add4y (sse2)
  ;
  ; Same operation as the mmx version, but the four replicated DCs fit in one
  ; XMM register (AAAABBBBCCCCDDDD after two punpcklbw steps), so a single
  ; ADD_DC covers all 16 columns. ADD_DC is invoked with mova, so each of the
  ; four dst rows must be 16-byte aligned.
  INIT_XMM sse2
  cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
      ; load data
      movd      m0, [blockq+32*0]           ; A
      movd      m1, [blockq+32*2]           ; C
      punpcklwd m0, [blockq+32*1]           ; A B
      punpcklwd m1, [blockq+32*3]           ; C D
      punpckldq m0, m1                      ; A B C D
      pxor      m1, m1

      ; calculate DC
      paddw     m0, [pw_4]
      movd      [blockq+32*0], m1
      movd      [blockq+32*1], m1
      movd      [blockq+32*2], m1
      movd      [blockq+32*3], m1
      psraw     m0, 3
      psubw     m1, m0                      ; m1 = -dc
      packuswb  m0, m0
      packuswb  m1, m1
      punpcklbw m0, m0
      punpcklbw m1, m1
      punpcklbw m0, m0
      punpcklbw m1, m1

      ; add DC
      DEFINE_ARGS dst1, dst2, stride        ; block pointer register is reused as dst2
      lea       dst2q, [dst1q+strideq*2]
      ADD_DC    m0, m1, 0, mova
      RET
  ;-----------------------------------------------------------------------------
  ; void ff_vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
  ;-----------------------------------------------------------------------------

  ; vp8_idct_dc_add4uv (mmx)
  ;
  ; DC computation is identical to vp8_idct_dc_add4y (mmx); only the placement
  ; differs. The four blocks form a 2x2 arrangement over an 8x8 pixel area:
  ; A and B are added to rows 0-3 (columns 0-3 / 4-7), then both row pointers
  ; are advanced by four rows and C and D are added to rows 4-7.
  INIT_MMX mmx
  cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
      ; load data
      movd      m0, [blockq+32*0]           ; A
      movd      m1, [blockq+32*2]           ; C
      punpcklwd m0, [blockq+32*1]           ; A B
      punpcklwd m1, [blockq+32*3]           ; C D
      punpckldq m0, m1                      ; A B C D
      pxor      m6, m6

      ; calculate DC
      paddw     m0, [pw_4]
      movd      [blockq+32*0], m6
      movd      [blockq+32*1], m6
      movd      [blockq+32*2], m6
      movd      [blockq+32*3], m6
      psraw     m0, 3
      psubw     m6, m0                      ; m6 = -dc
      packuswb  m0, m0
      packuswb  m6, m6
      punpcklbw m0, m0                      ; AABBCCDD
      punpcklbw m6, m6                      ; AABBCCDD
      movq      m1, m0
      movq      m7, m6
      punpcklbw m0, m0                      ; AAAABBBB
      punpckhbw m1, m1                      ; CCCCDDDD
      punpcklbw m6, m6                      ; AAAABBBB
      punpckhbw m7, m7                      ; CCCCDDDD

      ; add DC
      DEFINE_ARGS dst1, dst2, stride        ; block pointer register is reused as dst2
      lea       dst2q, [dst1q+strideq*2]
      ADD_DC    m0, m6, 0, mova             ; rows 0-3
      lea       dst1q, [dst1q+strideq*4]
      lea       dst2q, [dst2q+strideq*4]
      ADD_DC    m1, m7, 0, mova             ; rows 4-7
      RET
  ;-----------------------------------------------------------------------------
  ; void ff_vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
  ;-----------------------------------------------------------------------------

  ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
  ;           this macro assumes that m6/m7 have words for 20091/17734 loaded
  ;
  ; %3/%4 are temporaries (clobbered).
  ; How the two multiplies are formed with the signed high-half multiply:
  ;   mul_20091(x) = x + pmulhw(x, 20091)
  ;   mul_35468(x) = pmulhw(2*x, 17734)      ; 35468 = 2 * 17734
  %macro VP8_MULTIPLY_SUMSUB 4
      mova      %3, %1
      mova      %4, %2
      pmulhw    %3, m6                      ;20091(1)
      pmulhw    %4, m6                      ;20091(2)
      paddw     %3, %1
      paddw     %4, %2
      paddw     %1, %1
      paddw     %2, %2
      pmulhw    %1, m7                      ;35468(1)
      pmulhw    %2, m7                      ;35468(2)
      psubw     %1, %4
      paddw     %2, %3
  %endmacro
  ; calculate x0=%1+%3; x1=%1-%3
  ;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
  ;           %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
  ;           %5/%6 are temporary registers
  ;           we assume m6/m7 have constant words 20091/17734 loaded in them
  ;
  ; All six arguments are register NUMBERS (the macro prefixes m itself where
  ; needed). The trailing SWAPs only rename registers at assembly time so the
  ; results land in %1..%4 in the order documented above.
  %macro VP8_IDCT_TRANSFORM4x4_1D 6
      SUMSUB_BA         w, %3,  %1,  %5     ;t0, t1
      VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
      SUMSUB_BA         w, %4,  %3,  %5     ;tmp0, tmp3
      SUMSUB_BA         w, %2,  %1,  %5     ;tmp1, tmp2
      SWAP                 %4,  %1
      SWAP                 %4,  %3
  %endmacro
  ; VP8_IDCT_ADD
  ;
  ; Emits vp8_idct_add for the current instruction set: loads the 4x4 block of
  ; 16-bit coefficients (32 bytes), zeroes it in memory, runs two 1-D
  ; transform passes with a transpose after each, and hands the result to
  ; STORE_DIFFx2 together with the dst row pointers.
  ; Instantiated for plain mmx (x86-32 only) and for "mmx registers + sse",
  ; where the only difference is how the block is cleared.
  %macro VP8_IDCT_ADD 0
  cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
      ; load block data
      movq      m0, [blockq+ 0]
      movq      m1, [blockq+ 8]
      movq      m2, [blockq+16]
      movq      m3, [blockq+24]
      movq      m6, [pw_20091]
      movq      m7, [pw_17734]
  %if cpuflag(sse)
      ; m0 is mm0 here (INIT_MMX), so using xmm0 does not disturb the data
      ; just loaded. movaps requires block to be 16-byte aligned.
      xorps     xmm0, xmm0
      movaps    [blockq+ 0], xmm0
      movaps    [blockq+16], xmm0
  %else
      pxor      m4, m4
      movq      [blockq+ 0], m4
      movq      [blockq+ 8], m4
      movq      [blockq+16], m4
      movq      [blockq+24], m4
  %endif

      ; actual IDCT
      VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
      TRANSPOSE4x4W            0, 1, 2, 3, 4
      paddw     m0, [pw_4]                  ; bias added between the two passes
      VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
      TRANSPOSE4x4W            0, 1, 2, 3, 4

      ; store
      pxor      m4, m4
      DEFINE_ARGS dst1, dst2, stride        ; block pointer register is reused as dst2
      lea       dst2q, [dst1q+2*strideq]
      ; NOTE(review): STORE_DIFFx2 is defined outside this file; the literal 3
      ; is presumably its right-shift count (pairs with the +4 above) -- confirm.
      STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
      STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq

      RET
  %endmacro

  %if ARCH_X86_32
  INIT_MMX mmx
  VP8_IDCT_ADD
  %endif
  INIT_MMX sse
  VP8_IDCT_ADD
  ;-----------------------------------------------------------------------------
  ; void ff_vp8_luma_dc_wht(int16_t block[4][4][16], int16_t dc[16])
  ;-----------------------------------------------------------------------------

  ; SCATTER_WHT <regA>, <regB>, <base>
  ;
  ; Stores the four words of m<regA> and m<regB> into the first coefficient of
  ; eight different 16-coefficient blocks (consecutive blocks are 2*16 bytes
  ; apart): word j of m<regA> goes to block 4*j + <base>, word j of m<regB> to
  ; block 4*j + 1 + <base>.
  ; Uses dc1/dc2 as scratch general-purpose registers and shifts m<regA> and
  ; m<regB> right by 32 bits (both are destroyed).
  %macro SCATTER_WHT 3
      movd      dc1d, m%1
      movd      dc2d, m%2
      mov       [blockq+2*16*(0+%3)], dc1w
      mov       [blockq+2*16*(1+%3)], dc2w
      shr       dc1d, 16
      shr       dc2d, 16
      psrlq     m%1, 32
      psrlq     m%2, 32
      mov       [blockq+2*16*(4+%3)], dc1w
      mov       [blockq+2*16*(5+%3)], dc2w
      movd      dc1d, m%1
      movd      dc2d, m%2
      mov       [blockq+2*16*(8+%3)], dc1w
      mov       [blockq+2*16*(9+%3)], dc2w
      shr       dc1d, 16
      shr       dc2d, 16
      mov       [blockq+2*16*(12+%3)], dc1w
      mov       [blockq+2*16*(13+%3)], dc2w
  %endmacro
  ; HADAMARD4_1D <r0>, <r1>, <r2>, <r3>
  ;
  ; One 4-point butterfly pass over word registers m<r0>..m<r3>, built from two
  ; SUMSUB_BADC steps (macro defined outside this file). The final SWAP only
  ; renames registers at assembly time; it emits no instructions.
  %macro HADAMARD4_1D 4
      SUMSUB_BADC w, %2, %1, %4, %3
      SUMSUB_BADC w, %4, %2, %3, %1
      SWAP %1, %4, %3
  %endmacro
  ; VP8_DC_WHT
  ;
  ; Emits vp8_luma_dc_wht for the current instruction set: loads the 16 DC
  ; words from dc, zeroes them in memory, runs two HADAMARD4_1D passes with a
  ; transpose in between, shifts every result right by 3 and scatters the 16
  ; results into the first coefficient of the 16 blocks.
  ;
  ; The function takes two arguments and requests a third register: dc1 is the
  ; incoming dc pointer and, once the input has been loaded and cleared, it is
  ; reused together with dc2 as scratch by SCATTER_WHT.
  %macro VP8_DC_WHT 0
  cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
      movq      m0, [dc1q]
      movq      m1, [dc1q+8]
      movq      m2, [dc1q+16]
      movq      m3, [dc1q+24]
  %if cpuflag(sse)
      ; m0 is mm0 here (INIT_MMX), so xmm0 is free to use as a zero source.
      ; movaps requires dc to be 16-byte aligned.
      xorps     xmm0, xmm0
      movaps    [dc1q+ 0], xmm0
      movaps    [dc1q+16], xmm0
  %else
      pxor      m4, m4
      movq      [dc1q+ 0], m4
      movq      [dc1q+ 8], m4
      movq      [dc1q+16], m4
      movq      [dc1q+24], m4
  %endif
      HADAMARD4_1D  0, 1, 2, 3
      TRANSPOSE4x4W 0, 1, 2, 3, 4
      paddw     m0, [pw_3]                  ; bias added before the second pass
      HADAMARD4_1D  0, 1, 2, 3
      psraw     m0, 3
      psraw     m1, 3
      psraw     m2, 3
      psraw     m3, 3
      SCATTER_WHT   0, 1, 0
      SCATTER_WHT   2, 3, 2
      RET
  %endmacro

  %if ARCH_X86_32
  INIT_MMX mmx
  VP8_DC_WHT
  %endif
  INIT_MMX sse
  VP8_DC_WHT
  1113. VP8_DC_WHT