You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

2804 lines
79KB

  1. ;******************************************************************************
  2. ;* VP8 MMXEXT optimizations
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;*
  6. ;* This file is part of Libav.
  7. ;*
  8. ;* Libav is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* Libav is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with Libav; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; 4-tap subpel filter coefficients as words, interleaved in pairs so pmaddwd
; can apply two taps per multiply; two 16-byte rows per subpel position
; (taps 0/1, then taps 2/3), indexed by mx/my.
fourtap_filter_hw_m: times 4 dw -6, 123
times 4 dw 12, -1
times 4 dw -9, 93
times 4 dw 50, -6
times 4 dw -6, 50
times 4 dw 93, -9
times 4 dw -1, 12
times 4 dw 123, -6

; 6-tap subpel filter coefficients as words for pmaddwd; three rows per
; subpel position (taps 0/1, 2/3, 4/5).
sixtap_filter_hw_m: times 4 dw 2, -11
times 4 dw 108, 36
times 4 dw -8, 1
times 4 dw 3, -16
times 4 dw 77, 77
times 4 dw -16, 3
times 4 dw 1, -8
times 4 dw 36, 108
times 4 dw -11, 2

; Same 4-tap coefficients as signed bytes, interleaved for pmaddubsw
; (SSSE3 paths); two rows per subpel position.
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
times 8 db 50, -6
times 8 db -6, 50
times 8 db 93, -9
times 8 db -1, 12
times 8 db 123, -6

; 6-tap coefficients as signed bytes for pmaddubsw; pairing is
; (tap0,tap5), (tap1,tap2), (tap3,tap4), matching the filter_h6_shuf*
; pixel shuffles below.
sixtap_filter_hb_m: times 8 db 2, 1
times 8 db -11, 108
times 8 db 36, -8
times 8 db 3, 3
times 8 db -16, 77
times 8 db 77, -16
times 8 db 1, 2
times 8 db -8, 36
times 8 db 108, -11

; Vertical 4-tap coefficients as words, one tap broadcast per 16-byte row,
; for the pmullw-based (SSE2/MMX2) vertical filters.
fourtap_filter_v_m: times 8 dw -6
times 8 dw 123
times 8 dw 12
times 8 dw -1
times 8 dw -9
times 8 dw 93
times 8 dw 50
times 8 dw -6
times 8 dw -6
times 8 dw 50
times 8 dw 93
times 8 dw -9
times 8 dw -1
times 8 dw 12
times 8 dw 123
times 8 dw -6

; Vertical 6-tap coefficients, one tap broadcast per row.
sixtap_filter_v_m: times 8 dw 2
times 8 dw -11
times 8 dw 108
times 8 dw 36
times 8 dw -8
times 8 dw 1
times 8 dw 3
times 8 dw -16
times 8 dw 77
times 8 dw 77
times 8 dw -16
times 8 dw 3
times 8 dw 1
times 8 dw -8
times 8 dw 36
times 8 dw 108
times 8 dw -11
times 8 dw 2

; Bilinear weights 1..7 as broadcast words (pmullw path); the complementary
; weight 8-f is fetched by negated indexing from the same table.
bilinear_filter_vw_m: times 8 dw 1
times 8 dw 2
times 8 dw 3
times 8 dw 4
times 8 dw 5
times 8 dw 6
times 8 dw 7

; Bilinear weight pairs (8-f, f) as bytes for pmaddubsw (SSSE3 path).
bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 6, 2
times 8 db 5, 3
times 8 db 4, 4
times 8 db 3, 5
times 8 db 2, 6
times 8 db 1, 7

%ifdef PIC
; PIC build: tables are reached through a base pointer loaded into picregq
; by each function; reserve one extra GPR for it.
%define fourtap_filter_hw picregq
%define sixtap_filter_hw picregq
%define fourtap_filter_hb picregq
%define sixtap_filter_hb picregq
%define fourtap_filter_v picregq
%define sixtap_filter_v picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
; Non-PIC build: address the tables directly.
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define fourtap_filter_v fourtap_filter_v_m
%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

; pshufb masks pairing neighbouring input pixels for the pmaddubsw filters.
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

; IDCT multiplier constants (17734 presumably = 35468/2, compensating the
; pre-doubling in VP8_MULTIPLY_SUMSUB — see that macro).
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
; Byte-pair constants (used by code outside this chunk).
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63: times 8 db 9, 63

; Shared constants defined in a common constants object.
cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------
  159. %macro FILTER_SSSE3 1
  160. cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
  161. lea mxd, [mxq*3]
  162. mova m3, [filter_h6_shuf2]
  163. mova m4, [filter_h6_shuf3]
  164. %ifdef PIC
  165. lea picregq, [sixtap_filter_hb_m]
  166. %endif
  167. mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
  168. mova m6, [sixtap_filter_hb+mxq*8-32]
  169. mova m7, [sixtap_filter_hb+mxq*8-16]
  170. .nextrow
  171. movu m0, [srcq-2]
  172. mova m1, m0
  173. mova m2, m0
  174. %if mmsize == 8
  175. ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
  176. ; shuffle with a memory operand
  177. punpcklbw m0, [srcq+3]
  178. %else
  179. pshufb m0, [filter_h6_shuf1]
  180. %endif
  181. pshufb m1, m3
  182. pshufb m2, m4
  183. pmaddubsw m0, m5
  184. pmaddubsw m1, m6
  185. pmaddubsw m2, m7
  186. paddsw m0, m1
  187. paddsw m0, m2
  188. paddsw m0, [pw_64]
  189. psraw m0, 7
  190. packuswb m0, m0
  191. movh [dstq], m0 ; store
  192. ; go to next line
  193. add dstq, dststrideq
  194. add srcq, srcstrideq
  195. dec heightd ; next row
  196. jg .nextrow
  197. REP_RET
  198. cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
  199. shl mxd, 4
  200. mova m2, [pw_64]
  201. mova m3, [filter_h2_shuf]
  202. mova m4, [filter_h4_shuf]
  203. %ifdef PIC
  204. lea picregq, [fourtap_filter_hb_m]
  205. %endif
  206. mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
  207. mova m6, [fourtap_filter_hb+mxq]
  208. .nextrow
  209. movu m0, [srcq-1]
  210. mova m1, m0
  211. pshufb m0, m3
  212. pshufb m1, m4
  213. pmaddubsw m0, m5
  214. pmaddubsw m1, m6
  215. paddsw m0, m2
  216. paddsw m0, m1
  217. psraw m0, 7
  218. packuswb m0, m0
  219. movh [dstq], m0 ; store
  220. ; go to next line
  221. add dstq, dststrideq
  222. add srcq, srcstrideq
  223. dec heightd ; next row
  224. jg .nextrow
  225. REP_RET
  226. cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
  227. shl myd, 4
  228. %ifdef PIC
  229. lea picregq, [fourtap_filter_hb_m]
  230. %endif
  231. mova m5, [fourtap_filter_hb+myq-16]
  232. mova m6, [fourtap_filter_hb+myq]
  233. mova m7, [pw_64]
  234. ; read 3 lines
  235. sub srcq, srcstrideq
  236. movh m0, [srcq]
  237. movh m1, [srcq+ srcstrideq]
  238. movh m2, [srcq+2*srcstrideq]
  239. add srcq, srcstrideq
  240. .nextrow
  241. movh m3, [srcq+2*srcstrideq] ; read new row
  242. mova m4, m0
  243. mova m0, m1
  244. punpcklbw m4, m1
  245. mova m1, m2
  246. punpcklbw m2, m3
  247. pmaddubsw m4, m5
  248. pmaddubsw m2, m6
  249. paddsw m4, m2
  250. mova m2, m3
  251. paddsw m4, m7
  252. psraw m4, 7
  253. packuswb m4, m4
  254. movh [dstq], m4
  255. ; go to next line
  256. add dstq, dststrideq
  257. add srcq, srcstrideq
  258. dec heightd ; next row
  259. jg .nextrow
  260. REP_RET
  261. cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
  262. lea myd, [myq*3]
  263. %ifdef PIC
  264. lea picregq, [sixtap_filter_hb_m]
  265. %endif
  266. lea myq, [sixtap_filter_hb+myq*8]
  267. ; read 5 lines
  268. sub srcq, srcstrideq
  269. sub srcq, srcstrideq
  270. movh m0, [srcq]
  271. movh m1, [srcq+srcstrideq]
  272. movh m2, [srcq+srcstrideq*2]
  273. lea srcq, [srcq+srcstrideq*2]
  274. add srcq, srcstrideq
  275. movh m3, [srcq]
  276. movh m4, [srcq+srcstrideq]
  277. .nextrow
  278. movh m5, [srcq+2*srcstrideq] ; read new row
  279. mova m6, m0
  280. punpcklbw m6, m5
  281. mova m0, m1
  282. punpcklbw m1, m2
  283. mova m7, m3
  284. punpcklbw m7, m4
  285. pmaddubsw m6, [myq-48]
  286. pmaddubsw m1, [myq-32]
  287. pmaddubsw m7, [myq-16]
  288. paddsw m6, m1
  289. paddsw m6, m7
  290. mova m1, m2
  291. paddsw m6, [pw_64]
  292. mova m2, m3
  293. psraw m6, 7
  294. mova m3, m4
  295. packuswb m6, m6
  296. mova m4, m5
  297. movh [dstq], m6
  298. ; go to next line
  299. add dstq, dststrideq
  300. add srcq, srcstrideq
  301. dec heightd ; next row
  302. jg .nextrow
  303. REP_RET
  304. %endmacro
INIT_MMX ssse3
FILTER_SSSE3 4    ; 4-wide variants (MMX registers)
INIT_XMM ssse3
FILTER_SSSE3 8    ; 8-wide variants (XMM registers)
  309. ; 4x4 block, H-only 4-tap filter
  310. INIT_MMX mmx2
  311. cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
  312. shl mxd, 4
  313. %ifdef PIC
  314. lea picregq, [fourtap_filter_hw_m]
  315. %endif
  316. movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
  317. movq mm5, [fourtap_filter_hw+mxq]
  318. movq mm7, [pw_64]
  319. pxor mm6, mm6
  320. .nextrow
  321. movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
  322. ; first set of 2 pixels
  323. movq mm2, mm1 ; byte ABCD..
  324. punpcklbw mm1, mm6 ; byte->word ABCD
  325. pshufw mm0, mm2, 9 ; byte CDEF..
  326. punpcklbw mm0, mm6 ; byte->word CDEF
  327. pshufw mm3, mm1, 0x94 ; word ABBC
  328. pshufw mm1, mm0, 0x94 ; word CDDE
  329. pmaddwd mm3, mm4 ; multiply 2px with F0/F1
  330. movq mm0, mm1 ; backup for second set of pixels
  331. pmaddwd mm1, mm5 ; multiply 2px with F2/F3
  332. paddd mm3, mm1 ; finish 1st 2px
  333. ; second set of 2 pixels, use backup of above
  334. punpckhbw mm2, mm6 ; byte->word EFGH
  335. pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
  336. pshufw mm1, mm2, 0x94 ; word EFFG
  337. pmaddwd mm1, mm5 ; multiply 2px with F2/F3
  338. paddd mm0, mm1 ; finish 2nd 2px
  339. ; merge two sets of 2 pixels into one set of 4, round/clip/store
  340. packssdw mm3, mm0 ; merge dword->word (4px)
  341. paddsw mm3, mm7 ; rounding
  342. psraw mm3, 7
  343. packuswb mm3, mm6 ; clip and word->bytes
  344. movd [dstq], mm3 ; store
  345. ; go to next line
  346. add dstq, dststrideq
  347. add srcq, srcstrideq
  348. dec heightd ; next row
  349. jg .nextrow
  350. REP_RET
  351. ; 4x4 block, H-only 6-tap filter
  352. INIT_MMX mmx2
  353. cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
  354. lea mxd, [mxq*3]
  355. %ifdef PIC
  356. lea picregq, [sixtap_filter_hw_m]
  357. %endif
  358. movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 4tap filter in words
  359. movq mm5, [sixtap_filter_hw+mxq*8-32]
  360. movq mm6, [sixtap_filter_hw+mxq*8-16]
  361. movq mm7, [pw_64]
  362. pxor mm3, mm3
  363. .nextrow
  364. movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
  365. ; first set of 2 pixels
  366. movq mm2, mm1 ; byte ABCD..
  367. punpcklbw mm1, mm3 ; byte->word ABCD
  368. pshufw mm0, mm2, 0x9 ; byte CDEF..
  369. punpckhbw mm2, mm3 ; byte->word EFGH
  370. punpcklbw mm0, mm3 ; byte->word CDEF
  371. pshufw mm1, mm1, 0x94 ; word ABBC
  372. pshufw mm2, mm2, 0x94 ; word EFFG
  373. pmaddwd mm1, mm4 ; multiply 2px with F0/F1
  374. pshufw mm3, mm0, 0x94 ; word CDDE
  375. movq mm0, mm3 ; backup for second set of pixels
  376. pmaddwd mm3, mm5 ; multiply 2px with F2/F3
  377. paddd mm1, mm3 ; add to 1st 2px cache
  378. movq mm3, mm2 ; backup for second set of pixels
  379. pmaddwd mm2, mm6 ; multiply 2px with F4/F5
  380. paddd mm1, mm2 ; finish 1st 2px
  381. ; second set of 2 pixels, use backup of above
  382. movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
  383. pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
  384. pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
  385. paddd mm0, mm3 ; add to 2nd 2px cache
  386. pxor mm3, mm3
  387. punpcklbw mm2, mm3 ; byte->word FGHI
  388. pshufw mm2, mm2, 0xE9 ; word GHHI
  389. pmaddwd mm2, mm6 ; multiply 2px with F4/F5
  390. paddd mm0, mm2 ; finish 2nd 2px
  391. ; merge two sets of 2 pixels into one set of 4, round/clip/store
  392. packssdw mm1, mm0 ; merge dword->word (4px)
  393. paddsw mm1, mm7 ; rounding
  394. psraw mm1, 7
  395. packuswb mm1, mm3 ; clip and word->bytes
  396. movd [dstq], mm1 ; store
  397. ; go to next line
  398. add dstq, dststrideq
  399. add srcq, srcstrideq
  400. dec heightd ; next row
  401. jg .nextrow
  402. REP_RET
  403. INIT_XMM sse2
  404. cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
  405. shl mxd, 5
  406. %ifdef PIC
  407. lea picregq, [fourtap_filter_v_m]
  408. %endif
  409. lea mxq, [fourtap_filter_v+mxq-32]
  410. pxor m7, m7
  411. mova m4, [pw_64]
  412. mova m5, [mxq+ 0]
  413. mova m6, [mxq+16]
  414. %ifdef m8
  415. mova m8, [mxq+32]
  416. mova m9, [mxq+48]
  417. %endif
  418. .nextrow
  419. movq m0, [srcq-1]
  420. movq m1, [srcq-0]
  421. movq m2, [srcq+1]
  422. movq m3, [srcq+2]
  423. punpcklbw m0, m7
  424. punpcklbw m1, m7
  425. punpcklbw m2, m7
  426. punpcklbw m3, m7
  427. pmullw m0, m5
  428. pmullw m1, m6
  429. %ifdef m8
  430. pmullw m2, m8
  431. pmullw m3, m9
  432. %else
  433. pmullw m2, [mxq+32]
  434. pmullw m3, [mxq+48]
  435. %endif
  436. paddsw m0, m1
  437. paddsw m2, m3
  438. paddsw m0, m2
  439. paddsw m0, m4
  440. psraw m0, 7
  441. packuswb m0, m7
  442. movh [dstq], m0 ; store
  443. ; go to next line
  444. add dstq, dststrideq
  445. add srcq, srcstrideq
  446. dec heightd ; next row
  447. jg .nextrow
  448. REP_RET
  449. INIT_XMM sse2
  450. cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
  451. lea mxd, [mxq*3]
  452. shl mxd, 4
  453. %ifdef PIC
  454. lea picregq, [sixtap_filter_v_m]
  455. %endif
  456. lea mxq, [sixtap_filter_v+mxq-96]
  457. pxor m7, m7
  458. mova m6, [pw_64]
  459. %ifdef m8
  460. mova m8, [mxq+ 0]
  461. mova m9, [mxq+16]
  462. mova m10, [mxq+32]
  463. mova m11, [mxq+48]
  464. mova m12, [mxq+64]
  465. mova m13, [mxq+80]
  466. %endif
  467. .nextrow
  468. movq m0, [srcq-2]
  469. movq m1, [srcq-1]
  470. movq m2, [srcq-0]
  471. movq m3, [srcq+1]
  472. movq m4, [srcq+2]
  473. movq m5, [srcq+3]
  474. punpcklbw m0, m7
  475. punpcklbw m1, m7
  476. punpcklbw m2, m7
  477. punpcklbw m3, m7
  478. punpcklbw m4, m7
  479. punpcklbw m5, m7
  480. %ifdef m8
  481. pmullw m0, m8
  482. pmullw m1, m9
  483. pmullw m2, m10
  484. pmullw m3, m11
  485. pmullw m4, m12
  486. pmullw m5, m13
  487. %else
  488. pmullw m0, [mxq+ 0]
  489. pmullw m1, [mxq+16]
  490. pmullw m2, [mxq+32]
  491. pmullw m3, [mxq+48]
  492. pmullw m4, [mxq+64]
  493. pmullw m5, [mxq+80]
  494. %endif
  495. paddsw m1, m4
  496. paddsw m0, m5
  497. paddsw m1, m2
  498. paddsw m0, m3
  499. paddsw m0, m1
  500. paddsw m0, m6
  501. psraw m0, 7
  502. packuswb m0, m7
  503. movh [dstq], m0 ; store
  504. ; go to next line
  505. add dstq, dststrideq
  506. add srcq, srcstrideq
  507. dec heightd ; next row
  508. jg .nextrow
  509. REP_RET
  510. %macro FILTER_V 1
  511. ; 4x4 block, V-only 4-tap filter
  512. cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
  513. shl myd, 5
  514. %ifdef PIC
  515. lea picregq, [fourtap_filter_v_m]
  516. %endif
  517. lea myq, [fourtap_filter_v+myq-32]
  518. mova m6, [pw_64]
  519. pxor m7, m7
  520. mova m5, [myq+48]
  521. ; read 3 lines
  522. sub srcq, srcstrideq
  523. movh m0, [srcq]
  524. movh m1, [srcq+ srcstrideq]
  525. movh m2, [srcq+2*srcstrideq]
  526. add srcq, srcstrideq
  527. punpcklbw m0, m7
  528. punpcklbw m1, m7
  529. punpcklbw m2, m7
  530. .nextrow
  531. ; first calculate negative taps (to prevent losing positive overflows)
  532. movh m4, [srcq+2*srcstrideq] ; read new row
  533. punpcklbw m4, m7
  534. mova m3, m4
  535. pmullw m0, [myq+0]
  536. pmullw m4, m5
  537. paddsw m4, m0
  538. ; then calculate positive taps
  539. mova m0, m1
  540. pmullw m1, [myq+16]
  541. paddsw m4, m1
  542. mova m1, m2
  543. pmullw m2, [myq+32]
  544. paddsw m4, m2
  545. mova m2, m3
  546. ; round/clip/store
  547. paddsw m4, m6
  548. psraw m4, 7
  549. packuswb m4, m7
  550. movh [dstq], m4
  551. ; go to next line
  552. add dstq, dststrideq
  553. add srcq, srcstrideq
  554. dec heightd ; next row
  555. jg .nextrow
  556. REP_RET
  557. ; 4x4 block, V-only 6-tap filter
  558. cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
  559. shl myd, 4
  560. lea myq, [myq*3]
  561. %ifdef PIC
  562. lea picregq, [sixtap_filter_v_m]
  563. %endif
  564. lea myq, [sixtap_filter_v+myq-96]
  565. pxor m7, m7
  566. ; read 5 lines
  567. sub srcq, srcstrideq
  568. sub srcq, srcstrideq
  569. movh m0, [srcq]
  570. movh m1, [srcq+srcstrideq]
  571. movh m2, [srcq+srcstrideq*2]
  572. lea srcq, [srcq+srcstrideq*2]
  573. add srcq, srcstrideq
  574. movh m3, [srcq]
  575. movh m4, [srcq+srcstrideq]
  576. punpcklbw m0, m7
  577. punpcklbw m1, m7
  578. punpcklbw m2, m7
  579. punpcklbw m3, m7
  580. punpcklbw m4, m7
  581. .nextrow
  582. ; first calculate negative taps (to prevent losing positive overflows)
  583. mova m5, m1
  584. pmullw m5, [myq+16]
  585. mova m6, m4
  586. pmullw m6, [myq+64]
  587. paddsw m6, m5
  588. ; then calculate positive taps
  589. movh m5, [srcq+2*srcstrideq] ; read new row
  590. punpcklbw m5, m7
  591. pmullw m0, [myq+0]
  592. paddsw m6, m0
  593. mova m0, m1
  594. mova m1, m2
  595. pmullw m2, [myq+32]
  596. paddsw m6, m2
  597. mova m2, m3
  598. pmullw m3, [myq+48]
  599. paddsw m6, m3
  600. mova m3, m4
  601. mova m4, m5
  602. pmullw m5, [myq+80]
  603. paddsw m6, m5
  604. ; round/clip/store
  605. paddsw m6, [pw_64]
  606. psraw m6, 7
  607. packuswb m6, m7
  608. movh [dstq], m6
  609. ; go to next line
  610. add dstq, dststrideq
  611. add srcq, srcstrideq
  612. dec heightd ; next row
  613. jg .nextrow
  614. REP_RET
  615. %endmacro
  616. INIT_MMX mmx2
  617. FILTER_V 4
  618. INIT_XMM sse2
  619. FILTER_V 8
  620. %macro FILTER_BILINEAR 1
  621. cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
  622. shl myd, 4
  623. %ifdef PIC
  624. lea picregq, [bilinear_filter_vw_m]
  625. %endif
  626. pxor m6, m6
  627. mova m5, [bilinear_filter_vw+myq-1*16]
  628. neg myq
  629. mova m4, [bilinear_filter_vw+myq+7*16]
  630. .nextrow
  631. movh m0, [srcq+srcstrideq*0]
  632. movh m1, [srcq+srcstrideq*1]
  633. movh m3, [srcq+srcstrideq*2]
  634. punpcklbw m0, m6
  635. punpcklbw m1, m6
  636. punpcklbw m3, m6
  637. mova m2, m1
  638. pmullw m0, m4
  639. pmullw m1, m5
  640. pmullw m2, m4
  641. pmullw m3, m5
  642. paddsw m0, m1
  643. paddsw m2, m3
  644. psraw m0, 2
  645. psraw m2, 2
  646. pavgw m0, m6
  647. pavgw m2, m6
  648. %if mmsize == 8
  649. packuswb m0, m0
  650. packuswb m2, m2
  651. movh [dstq+dststrideq*0], m0
  652. movh [dstq+dststrideq*1], m2
  653. %else
  654. packuswb m0, m2
  655. movh [dstq+dststrideq*0], m0
  656. movhps [dstq+dststrideq*1], m0
  657. %endif
  658. lea dstq, [dstq+dststrideq*2]
  659. lea srcq, [srcq+srcstrideq*2]
  660. sub heightd, 2
  661. jg .nextrow
  662. REP_RET
  663. cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
  664. shl mxd, 4
  665. %ifdef PIC
  666. lea picregq, [bilinear_filter_vw_m]
  667. %endif
  668. pxor m6, m6
  669. mova m5, [bilinear_filter_vw+mxq-1*16]
  670. neg mxq
  671. mova m4, [bilinear_filter_vw+mxq+7*16]
  672. .nextrow
  673. movh m0, [srcq+srcstrideq*0+0]
  674. movh m1, [srcq+srcstrideq*0+1]
  675. movh m2, [srcq+srcstrideq*1+0]
  676. movh m3, [srcq+srcstrideq*1+1]
  677. punpcklbw m0, m6
  678. punpcklbw m1, m6
  679. punpcklbw m2, m6
  680. punpcklbw m3, m6
  681. pmullw m0, m4
  682. pmullw m1, m5
  683. pmullw m2, m4
  684. pmullw m3, m5
  685. paddsw m0, m1
  686. paddsw m2, m3
  687. psraw m0, 2
  688. psraw m2, 2
  689. pavgw m0, m6
  690. pavgw m2, m6
  691. %if mmsize == 8
  692. packuswb m0, m0
  693. packuswb m2, m2
  694. movh [dstq+dststrideq*0], m0
  695. movh [dstq+dststrideq*1], m2
  696. %else
  697. packuswb m0, m2
  698. movh [dstq+dststrideq*0], m0
  699. movhps [dstq+dststrideq*1], m0
  700. %endif
  701. lea dstq, [dstq+dststrideq*2]
  702. lea srcq, [srcq+srcstrideq*2]
  703. sub heightd, 2
  704. jg .nextrow
  705. REP_RET
  706. %endmacro
  707. INIT_MMX mmx2
  708. FILTER_BILINEAR 4
  709. INIT_XMM sse2
  710. FILTER_BILINEAR 8
  711. %macro FILTER_BILINEAR_SSSE3 1
  712. cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
  713. shl myd, 4
  714. %ifdef PIC
  715. lea picregq, [bilinear_filter_vb_m]
  716. %endif
  717. pxor m4, m4
  718. mova m3, [bilinear_filter_vb+myq-16]
  719. .nextrow
  720. movh m0, [srcq+srcstrideq*0]
  721. movh m1, [srcq+srcstrideq*1]
  722. movh m2, [srcq+srcstrideq*2]
  723. punpcklbw m0, m1
  724. punpcklbw m1, m2
  725. pmaddubsw m0, m3
  726. pmaddubsw m1, m3
  727. psraw m0, 2
  728. psraw m1, 2
  729. pavgw m0, m4
  730. pavgw m1, m4
  731. %if mmsize==8
  732. packuswb m0, m0
  733. packuswb m1, m1
  734. movh [dstq+dststrideq*0], m0
  735. movh [dstq+dststrideq*1], m1
  736. %else
  737. packuswb m0, m1
  738. movh [dstq+dststrideq*0], m0
  739. movhps [dstq+dststrideq*1], m0
  740. %endif
  741. lea dstq, [dstq+dststrideq*2]
  742. lea srcq, [srcq+srcstrideq*2]
  743. sub heightd, 2
  744. jg .nextrow
  745. REP_RET
  746. cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
  747. shl mxd, 4
  748. %ifdef PIC
  749. lea picregq, [bilinear_filter_vb_m]
  750. %endif
  751. pxor m4, m4
  752. mova m2, [filter_h2_shuf]
  753. mova m3, [bilinear_filter_vb+mxq-16]
  754. .nextrow
  755. movu m0, [srcq+srcstrideq*0]
  756. movu m1, [srcq+srcstrideq*1]
  757. pshufb m0, m2
  758. pshufb m1, m2
  759. pmaddubsw m0, m3
  760. pmaddubsw m1, m3
  761. psraw m0, 2
  762. psraw m1, 2
  763. pavgw m0, m4
  764. pavgw m1, m4
  765. %if mmsize==8
  766. packuswb m0, m0
  767. packuswb m1, m1
  768. movh [dstq+dststrideq*0], m0
  769. movh [dstq+dststrideq*1], m1
  770. %else
  771. packuswb m0, m1
  772. movh [dstq+dststrideq*0], m0
  773. movhps [dstq+dststrideq*1], m0
  774. %endif
  775. lea dstq, [dstq+dststrideq*2]
  776. lea srcq, [srcq+srcstrideq*2]
  777. sub heightd, 2
  778. jg .nextrow
  779. REP_RET
  780. %endmacro
  781. INIT_MMX ssse3
  782. FILTER_BILINEAR_SSSE3 4
  783. INIT_XMM ssse3
  784. FILTER_BILINEAR_SSSE3 8
  785. INIT_MMX mmx
  786. cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
  787. .nextrow:
  788. movq mm0, [srcq+srcstrideq*0]
  789. movq mm1, [srcq+srcstrideq*1]
  790. lea srcq, [srcq+srcstrideq*2]
  791. movq [dstq+dststrideq*0], mm0
  792. movq [dstq+dststrideq*1], mm1
  793. lea dstq, [dstq+dststrideq*2]
  794. sub heightd, 2
  795. jg .nextrow
  796. REP_RET
  797. %if ARCH_X86_32
  798. INIT_MMX mmx
  799. cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
  800. .nextrow:
  801. movq mm0, [srcq+srcstrideq*0+0]
  802. movq mm1, [srcq+srcstrideq*0+8]
  803. movq mm2, [srcq+srcstrideq*1+0]
  804. movq mm3, [srcq+srcstrideq*1+8]
  805. lea srcq, [srcq+srcstrideq*2]
  806. movq [dstq+dststrideq*0+0], mm0
  807. movq [dstq+dststrideq*0+8], mm1
  808. movq [dstq+dststrideq*1+0], mm2
  809. movq [dstq+dststrideq*1+8], mm3
  810. lea dstq, [dstq+dststrideq*2]
  811. sub heightd, 2
  812. jg .nextrow
  813. REP_RET
  814. %endif
  815. INIT_XMM sse
  816. cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
  817. .nextrow:
  818. movups xmm0, [srcq+srcstrideq*0]
  819. movups xmm1, [srcq+srcstrideq*1]
  820. lea srcq, [srcq+srcstrideq*2]
  821. movaps [dstq+dststrideq*0], xmm0
  822. movaps [dstq+dststrideq*1], xmm1
  823. lea dstq, [dstq+dststrideq*2]
  824. sub heightd, 2
  825. jg .nextrow
  826. REP_RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; ADD_DC %1=+DC reg, %2=-DC reg, %3=byte offset, %4=load/store insn (movh/mova)
; Adds a broadcast DC to four pixel rows (two at dst1q, two at dst2q, each
; pair one strideq apart).  A signed add is emulated with unsigned saturating
; ops: paddusb with %1 then psubusb with %2; callers build %1/%2 with
; packuswb, which clamps whichever of DC/-DC is negative to zero, so only
; one of the two operations has any effect.
%macro ADD_DC 4
%4 m2, [dst1q+%3]
%4 m3, [dst1q+strideq+%3]
%4 m4, [dst2q+%3]
%4 m5, [dst2q+strideq+%3]
paddusb m2, %1
paddusb m3, %1
paddusb m4, %1
paddusb m5, %1
psubusb m2, %2
psubusb m3, %2
psubusb m4, %2
psubusb m5, %2
%4 [dst1q+%3], m2
%4 [dst1q+strideq+%3], m3
%4 [dst2q+%3], m4
%4 [dst2q+strideq+%3], m5
%endmacro
INIT_MMX mmx
; DC-only IDCT + add: dc = (block[0] + 4) >> 3, applied to a 4x4 dst block;
; the stored coefficient is cleared (the 32-bit movd store also zeroes the
; adjacent word, block[1]).
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data
movd m0, [blockq]
; calculate DC
paddw m0, [pw_4]
pxor m1, m1
psraw m0, 3
movd [blockq], m1 ; clear block[0] (and block[1])
psubw m1, m0 ; m1 = -DC
packuswb m0, m0 ; clamp DC / -DC to unsigned bytes
packuswb m1, m1 ; (the negative one becomes 0)
punpcklbw m0, m0 ; broadcast the DC byte
punpcklbw m1, m1
punpcklwd m0, m0
punpcklwd m1, m1
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh ; movh = movd for MMX: 4 pixels per row
RET
INIT_XMM sse4
; DC-only IDCT + add, SSE4.1: widens the four 4-pixel rows to words, adds the
; (possibly negative) DC directly with paddw, repacks with unsigned
; saturation and scatters the rows back with pextrd.
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq]
pxor m1, m1
; calculate DC
paddw m0, [pw_4]
movd [blockq], m1 ; clear block[0] (and block[1])
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
movd m2, [dst1q] ; gather the four 4-byte rows
movd m3, [dst1q+strideq]
movd m4, [dst2q]
movd m5, [dst2q+strideq]
psraw m0, 3 ; dc = (block[0] + 4) >> 3
pshuflw m0, m0, 0 ; broadcast dc to all 8 words
punpcklqdq m0, m0
punpckldq m2, m3 ; rows 0/1 and 2/3 side by side
punpckldq m4, m5
punpcklbw m2, m1 ; byte->word
punpcklbw m4, m1
paddw m2, m0
paddw m4, m0
packuswb m2, m4 ; clip to [0,255], all 16 pixels in m2
movd [dst1q], m2
pextrd [dst1q+strideq], m2, 1
pextrd [dst2q], m2, 2
pextrd [dst2q+strideq], m2, 3
RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
INIT_MMX mmx
; DC-only IDCT + add for four horizontally adjacent 4x4 luma blocks
; (16 pixels wide).  The four DCs are gathered (blocks are 32 bytes apart),
; biased/shifted, broadcast to AAAABBBB / CCCCDDDD in two register pairs,
; and applied with two 8-wide ADD_DC passes (offsets 0 and 8).
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
; load data
movd m0, [blockq+32*0] ; A
movd m1, [blockq+32*2] ; C
punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
; calculate DC
paddw m0, [pw_4]
movd [blockq+32*0], m6 ; clear the stored DCs
movd [blockq+32*1], m6
movd [blockq+32*2], m6
movd [blockq+32*3], m6
psraw m0, 3 ; dc = (dc + 4) >> 3, for all four
psubw m6, m0 ; m6 = negated DCs
packuswb m0, m0 ; clamp +DC / -DC to bytes
packuswb m6, m6
punpcklbw m0, m0 ; AABBCCDD
punpcklbw m6, m6 ; AABBCCDD
movq m1, m0
movq m7, m6
punpcklbw m0, m0 ; AAAABBBB
punpckhbw m1, m1 ; CCCCDDDD
punpcklbw m6, m6 ; AAAABBBB
punpckhbw m7, m7 ; CCCCDDDD
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova ; left half (blocks A, B)
ADD_DC m1, m7, 8, mova ; right half (blocks C, D)
RET
%endif
INIT_XMM sse2
; DC-only IDCT + add for four adjacent 4x4 luma blocks, SSE2: same gather
; and bias/shift as the MMX version, but all 16 DC bytes are broadcast into
; a single XMM pair so one 16-wide ADD_DC pass covers the whole row.
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq+32*0] ; A
movd m1, [blockq+32*2] ; C
punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m1, m1
; calculate DC
paddw m0, [pw_4]
movd [blockq+32*0], m1 ; clear the stored DCs
movd [blockq+32*1], m1
movd [blockq+32*2], m1
movd [blockq+32*3], m1
psraw m0, 3 ; dc = (dc + 4) >> 3, for all four
psubw m1, m0 ; m1 = negated DCs
packuswb m0, m0 ; clamp +DC / -DC to bytes
packuswb m1, m1
punpcklbw m0, m0 ; AABBCCDD
punpcklbw m1, m1
punpcklbw m0, m0 ; AAAABBBBCCCCDDDD
punpcklbw m1, m1
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, mova ; one 16-wide pass
RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
INIT_MMX mmx
; DC-only IDCT + add for a 2x2 arrangement of 4x4 chroma blocks (8x8 pixels):
; DC prep is identical to add4y_mmx, but blocks A/B cover rows 0-3 and
; blocks C/D cover rows 4-7, so the second ADD_DC runs after advancing the
; destination pointers by four strides.
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
; load data
movd m0, [blockq+32*0] ; A
movd m1, [blockq+32*2] ; C
punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
; calculate DC
paddw m0, [pw_4]
movd [blockq+32*0], m6 ; clear the stored DCs
movd [blockq+32*1], m6
movd [blockq+32*2], m6
movd [blockq+32*3], m6
psraw m0, 3 ; dc = (dc + 4) >> 3, for all four
psubw m6, m0 ; m6 = negated DCs
packuswb m0, m0 ; clamp +DC / -DC to bytes
packuswb m6, m6
punpcklbw m0, m0 ; AABBCCDD
punpcklbw m6, m6 ; AABBCCDD
movq m1, m0
movq m7, m6
punpcklbw m0, m0 ; AAAABBBB
punpckhbw m1, m1 ; CCCCDDDD
punpcklbw m6, m6 ; AAAABBBB
punpckhbw m7, m7 ; CCCCDDDD
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova ; top two blocks (rows 0-3)
lea dst1q, [dst1q+strideq*4]
lea dst2q, [dst2q+strideq*4]
ADD_DC m1, m7, 0, mova ; bottom two blocks (rows 4-7)
RET
;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
;
; mul_20091(x) is built as x + (x*20091 >> 16); mul_35468(x) doubles x first
; and then pmulhw's by 17734 (= half of 35468), so the >>16 result equals
; x*35468 >> 16.  %3/%4 are scratch registers.
%macro VP8_MULTIPLY_SUMSUB 4
mova %3, %1
mova %4, %2
pmulhw %3, m6 ;20091(1)
pmulhw %4, m6 ;20091(2)
paddw %3, %1 ; + x compensates pmulhw's implicit /65536
paddw %4, %2
paddw %1, %1 ; double before the 17734 multiply
paddw %2, %2
pmulhw %1, m7 ;35468(1)
pmulhw %2, m7 ;35468(2)
psubw %1, %4
paddw %2, %3
%endmacro
; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
;
; One 1-D pass of the VP8 4x4 inverse transform over register numbers
; %1..%4 (rows or columns); the trailing SWAPs put the outputs back into
; tmp0..tmp3 order.
%macro VP8_IDCT_TRANSFORM4x4_1D 6
SUMSUB_BA w, %3, %1, %5 ;t0, t1
VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
SWAP %4, %1
SWAP %4, %3
%endmacro
; vp8_idct_add(dst, block, stride): 4x4 inverse DCT of 'block', result
; added to 'dst'. Two 1-D passes with a transpose between them; 'block'
; is cleared after loading (16-byte SSE stores when available, else four
; 8-byte MMX stores).
1034. %macro VP8_IDCT_ADD 0
1035. cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
1036. ; load block data
1037. movq m0, [blockq+ 0]
1038. movq m1, [blockq+ 8]
1039. movq m2, [blockq+16]
1040. movq m3, [blockq+24]
1041. movq m6, [pw_20091]
1042. movq m7, [pw_17734]
1043. %if cpuflag(sse)
1044. xorps xmm0, xmm0
1045. movaps [blockq+ 0], xmm0
1046. movaps [blockq+16], xmm0
1047. %else
1048. pxor m4, m4
1049. movq [blockq+ 0], m4
1050. movq [blockq+ 8], m4
1051. movq [blockq+16], m4
1052. movq [blockq+24], m4
1053. %endif
1054. ; actual IDCT
1055. VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1056. TRANSPOSE4x4W 0, 1, 2, 3, 4
1057. paddw m0, [pw_4] ; rounding bias for the >>3 applied in STORE_DIFFx2
1058. VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
1059. TRANSPOSE4x4W 0, 1, 2, 3, 4
1060. ; store
1061. pxor m4, m4
1062. DEFINE_ARGS dst1, dst2, stride
1063. lea dst2q, [dst1q+2*strideq]
1064. STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
1065. STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
1066. RET
1067. %endmacro
; instantiate vp8_idct_add: plain-MMX variant only on x86-32; the
; MMX+SSE variant (16-byte zero stores) is built everywhere
1068. %if ARCH_X86_32
1069. INIT_MMX mmx
1070. VP8_IDCT_ADD
1071. %endif
1072. INIT_MMX sse
1073. VP8_IDCT_ADD
  1074. ;-----------------------------------------------------------------------------
  1075. ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
  1076. ;-----------------------------------------------------------------------------
; Scatter 8 WHT output words (mm regs m%1/m%2, 4 words each) into the DC
; slot (first int16 coefficient) of eight 16-coeff blocks in 'block';
; %3 is the column offset (0 or 2) within each row of four blocks.
; Clobbers dc1/dc2 GPRs and shifts m%1/m%2 right by 32.
1077. %macro SCATTER_WHT 3
1078. movd dc1d, m%1
1079. movd dc2d, m%2
1080. mov [blockq+2*16*(0+%3)], dc1w
1081. mov [blockq+2*16*(1+%3)], dc2w
1082. shr dc1d, 16
1083. shr dc2d, 16
1084. psrlq m%1, 32
1085. psrlq m%2, 32
1086. mov [blockq+2*16*(4+%3)], dc1w
1087. mov [blockq+2*16*(5+%3)], dc2w
1088. movd dc1d, m%1
1089. movd dc2d, m%2
1090. mov [blockq+2*16*(8+%3)], dc1w
1091. mov [blockq+2*16*(9+%3)], dc2w
1092. shr dc1d, 16
1093. shr dc2d, 16
1094. mov [blockq+2*16*(12+%3)], dc1w
1095. mov [blockq+2*16*(13+%3)], dc2w
1096. %endmacro
; One 1-D 4-point Hadamard (WHT) butterfly pass on words in m%1..m%4;
; the final SWAP puts the results back in natural order.
1097. %macro HADAMARD4_1D 4
1098. SUMSUB_BADC w, %2, %1, %4, %3
1099. SUMSUB_BADC w, %4, %2, %3, %1
1100. SWAP %1, %4, %3
1101. %endmacro
; vp8_luma_dc_wht(block, dc): inverse 4x4 Walsh-Hadamard transform of the
; luma DC coefficients in dc[]; dc[] is cleared after loading and the
; (+3, >>3)-rounded results are scattered into the DC slots of the 16
; luma blocks via SCATTER_WHT.
1102. %macro VP8_DC_WHT 0
1103. cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
1104. movq m0, [dc1q]
1105. movq m1, [dc1q+8]
1106. movq m2, [dc1q+16]
1107. movq m3, [dc1q+24]
1108. %if cpuflag(sse)
1109. xorps xmm0, xmm0
1110. movaps [dc1q+ 0], xmm0
1111. movaps [dc1q+16], xmm0
1112. %else
1113. pxor m4, m4
1114. movq [dc1q+ 0], m4
1115. movq [dc1q+ 8], m4
1116. movq [dc1q+16], m4
1117. movq [dc1q+24], m4
1118. %endif
1119. HADAMARD4_1D 0, 1, 2, 3
1120. TRANSPOSE4x4W 0, 1, 2, 3, 4
1121. paddw m0, [pw_3] ; rounding bias for the >>3 below
1122. HADAMARD4_1D 0, 1, 2, 3
1123. psraw m0, 3
1124. psraw m1, 3
1125. psraw m2, 3
1126. psraw m3, 3
1127. SCATTER_WHT 0, 1, 0
1128. SCATTER_WHT 2, 3, 2
1129. RET
1130. %endmacro
; instantiate vp8_luma_dc_wht: plain-MMX variant only on x86-32;
; the MMX+SSE variant is built everywhere
1131. %if ARCH_X86_32
1132. INIT_MMX mmx
1133. VP8_DC_WHT
1134. %endif
1135. INIT_MMX sse
1136. VP8_DC_WHT
  1137. ;-----------------------------------------------------------------------------
  1138. ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
  1139. ;-----------------------------------------------------------------------------
  1140. ; macro called with 7 mm register indexes as argument, and 4 regular registers
  1141. ;
  1142. ; first 4 mm registers will carry the transposed pixel data
  1143. ; the other three are scratchspace (one would be sufficient, but this allows
  1144. ; for more spreading/pipelining and thus faster execution on OOE CPUs)
  1145. ;
  1146. ; first two regular registers are buf+4*stride and buf+5*stride
  1147. ; third is -stride, fourth is +stride
; Load 8 rows (A-H) of 4 pixels each and byte-interleave them pairwise:
; result m%1=A/B, m%2=C/D, m%3=E/F, m%4=G/H (see usage comment above).
1148. %macro READ_8x4_INTERLEAVED 11
1149. ; interleave 8 (A-H) rows of 4 pixels each
1150. movd m%1, [%8+%10*4] ; A0-3
1151. movd m%5, [%9+%10*4] ; B0-3
1152. movd m%2, [%8+%10*2] ; C0-3
1153. movd m%6, [%8+%10] ; D0-3
1154. movd m%3, [%8] ; E0-3
1155. movd m%7, [%9] ; F0-3
1156. movd m%4, [%9+%11] ; G0-3
1157. punpcklbw m%1, m%5 ; A/B interleaved
1158. movd m%5, [%9+%11*2] ; H0-3
1159. punpcklbw m%2, m%6 ; C/D interleaved
1160. punpcklbw m%3, m%7 ; E/F interleaved
1161. punpcklbw m%4, m%5 ; G/H interleaved
1162. %endmacro
  1163. ; macro called with 7 mm register indexes as argument, and 5 regular registers
1164. ; first 11 mean the same as READ_8x4_INTERLEAVED above
  1165. ; fifth regular register is scratchspace to reach the bottom 8 rows, it
  1166. ; will be set to second regular register + 8*stride at the end
; Like READ_8x4_INTERLEAVED but for 16 rows (A-P), 4 px each; %12 is a
; scratch pointer for the lower 8 rows and is advanced by one stride
; mid-macro (documented above).
1167. %macro READ_16x4_INTERLEAVED 12
1168. ; transpose 16 (A-P) rows of 4 pixels each
1169. lea %12, [r0+8*r2]
1170. ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
1171. movd m%1, [%8+%10*4] ; A0-3
1172. movd m%3, [%12+%10*4] ; I0-3
1173. movd m%2, [%8+%10*2] ; C0-3
1174. movd m%4, [%12+%10*2] ; K0-3
1175. movd m%6, [%8+%10] ; D0-3
1176. movd m%5, [%12+%10] ; L0-3
1177. movd m%7, [%12] ; M0-3
1178. add %12, %11
1179. punpcklbw m%1, m%3 ; A/I
1180. movd m%3, [%8] ; E0-3
1181. punpcklbw m%2, m%4 ; C/K
1182. punpcklbw m%6, m%5 ; D/L
1183. punpcklbw m%3, m%7 ; E/M
1184. punpcklbw m%2, m%6 ; C/D/K/L interleaved
1185. ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
1186. movd m%5, [%9+%10*4] ; B0-3
1187. movd m%4, [%12+%10*4] ; J0-3
1188. movd m%7, [%9] ; F0-3
1189. movd m%6, [%12] ; N0-3
1190. punpcklbw m%5, m%4 ; B/J
1191. punpcklbw m%7, m%6 ; F/N
1192. punpcklbw m%1, m%5 ; A/B/I/J interleaved
1193. punpcklbw m%3, m%7 ; E/F/M/N interleaved
1194. movd m%4, [%9+%11] ; G0-3
1195. movd m%6, [%12+%11] ; O0-3
1196. movd m%5, [%9+%11*2] ; H0-3
1197. movd m%7, [%12+%11*2] ; P0-3
1198. punpcklbw m%4, m%6 ; G/O
1199. punpcklbw m%5, m%7 ; H/P
1200. punpcklbw m%4, m%5 ; G/H/O/P interleaved
1201. %endmacro
  1202. ; write 4 mm registers of 2 dwords each
  1203. ; first four arguments are mm register indexes containing source data
  1204. ; last four are registers containing buf+4*stride, buf+5*stride,
  1205. ; -stride and +stride
; Store the two dwords of each of m%1..m%4 as eight 4-pixel rows around
; the %5/%6 pointers (low dword first, then the high dword via punpckhdq).
1206. %macro WRITE_4x2D 8
1207. ; write out (2 dwords per register)
1208. movd [%5+%7*4], m%1
1209. movd [%5+%7*2], m%2
1210. movd [%5], m%3
1211. movd [%6+%8], m%4
1212. punpckhdq m%1, m%1 ; move dword 1 into dword 0 for the second store
1213. punpckhdq m%2, m%2
1214. punpckhdq m%3, m%3
1215. punpckhdq m%4, m%4
1216. movd [%6+%7*4], m%1
1217. movd [%5+%7], m%2
1218. movd [%6], m%3
1219. movd [%6+%8*2], m%4
1220. %endmacro
  1221. ; write 4 xmm registers of 4 dwords each
1222. ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
  1223. ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
1224. ; we add 1*stride to the third regular register in the process
  1225. ; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
  1226. ; same memory region), or 8 if they cover two separate buffers (third one points to
  1227. ; a different memory region than the first two), allowing for more optimal code for
  1228. ; the 16-width case
; Store the four dwords of each xmm reg m%1..m%4 as sixteen 4-px rows;
; %10 selects the single-surface (16) vs two-surface (8) layout (see the
; comment block above). In the %10==8 path, dword 2 of m%3 takes a detour
; through GPR %5 (its rows have already been stored, so %5 is reusable);
; NOTE(review): %5 is clobbered in that path.
1229. %macro WRITE_4x4D 10
1230. ; write out (4 dwords per register), start with dwords zero
1231. movd [%5+%8*4], m%1
1232. movd [%5], m%2
1233. movd [%7+%8*4], m%3
1234. movd [%7], m%4
1235. ; store dwords 1
1236. psrldq m%1, 4
1237. psrldq m%2, 4
1238. psrldq m%3, 4
1239. psrldq m%4, 4
1240. movd [%6+%8*4], m%1
1241. movd [%6], m%2
1242. %if %10 == 16
1243. movd [%6+%9*4], m%3
1244. %endif
1245. movd [%7+%9], m%4
1246. ; write dwords 2
1247. psrldq m%1, 4
1248. psrldq m%2, 4
1249. %if %10 == 8
1250. movd [%5+%8*2], m%1
1251. movd %5d, m%3 ; park dword 2 of m%3 in GPR %5 for the store below
1252. %endif
1253. psrldq m%3, 4
1254. psrldq m%4, 4
1255. %if %10 == 16
1256. movd [%5+%8*2], m%1
1257. %endif
1258. movd [%6+%9], m%2
1259. movd [%7+%8*2], m%3
1260. movd [%7+%9*2], m%4
1261. add %7, %9
1262. ; store dwords 3
1263. psrldq m%1, 4
1264. psrldq m%2, 4
1265. psrldq m%3, 4
1266. psrldq m%4, 4
1267. %if %10 == 8
1268. mov [%7+%8*4], %5d ; flush the parked dword 2 of m%3
1269. movd [%6+%8*2], m%1
1270. %else
1271. movd [%5+%8], m%1
1272. %endif
1273. movd [%6+%9*2], m%2
1274. movd [%7+%8*2], m%3
1275. movd [%7+%9*2], m%4
1276. %endmacro
  1277. ; write 4 or 8 words in the mmx/xmm registers as 8 lines
  1278. ; 1 and 2 are the registers to write, this can be the same (for SSE2)
  1279. ; for pre-SSE4:
  1280. ; 3 is a general-purpose register that we will clobber
  1281. ; for SSE4:
  1282. ; 3 is a pointer to the destination's 5th line
  1283. ; 4 is a pointer to the destination's 4th line
  1284. ; 5/6 is -stride and +stride
; Pre-SSE4 horizontal store: write the 8 words of mm regs %1/%2 as 8
; two-byte rows, bouncing each pair of bytes through GPR %3 (clobbered);
; %4 is advanced down the rows as described in the comment block above.
1285. %macro WRITE_2x4W 6
1286. movd %3d, %1
1287. punpckhdq %1, %1 ; expose the high two words for the next movd
1288. mov [%4+%5*4], %3w
1289. shr %3, 16
1290. add %4, %6
1291. mov [%4+%5*4], %3w
1292. movd %3d, %1
1293. add %4, %5
1294. mov [%4+%5*2], %3w
1295. shr %3, 16
1296. mov [%4+%5 ], %3w
1297. movd %3d, %2
1298. punpckhdq %2, %2
1299. mov [%4 ], %3w
1300. shr %3, 16
1301. mov [%4+%6 ], %3w
1302. movd %3d, %2
1303. add %4, %6
1304. mov [%4+%6 ], %3w
1305. shr %3, 16
1306. mov [%4+%6*2], %3w
1307. add %4, %5
1308. %endmacro
; Write the 8 words of xmm reg %1 as 8 two-byte rows. SSE4 uses direct
; pextrw-to-memory; otherwise each dword is bounced through GPR %2
; (clobbered) while psrldq walks the register (see comment block above).
1309. %macro WRITE_8W 5
1310. %if cpuflag(sse4)
1311. pextrw [%3+%4*4], %1, 0
1312. pextrw [%2+%4*4], %1, 1
1313. pextrw [%3+%4*2], %1, 2
1314. pextrw [%3+%4 ], %1, 3
1315. pextrw [%3 ], %1, 4
1316. pextrw [%2 ], %1, 5
1317. pextrw [%2+%5 ], %1, 6
1318. pextrw [%2+%5*2], %1, 7
1319. %else
1320. movd %2d, %1
1321. psrldq %1, 4
1322. mov [%3+%4*4], %2w
1323. shr %2, 16
1324. add %3, %5
1325. mov [%3+%4*4], %2w
1326. movd %2d, %1
1327. psrldq %1, 4
1328. add %3, %4
1329. mov [%3+%4*2], %2w
1330. shr %2, 16
1331. mov [%3+%4 ], %2w
1332. movd %2d, %1
1333. psrldq %1, 4
1334. mov [%3 ], %2w
1335. shr %2, 16
1336. mov [%3+%5 ], %2w
1337. movd %2d, %1
1338. add %3, %5
1339. mov [%3+%5 ], %2w
1340. shr %2, 16
1341. mov [%3+%5*2], %2w
1342. %endif
1343. %endmacro
; Broadcast the low byte of GPR %2 into every byte of SIMD reg %1.
; %3 is only used by the ssse3 path and must hold an all-zero register
; (pshufb with a zero mask replicates byte 0).
1344. %macro SPLATB_REG 2-3
1345. %if cpuflag(ssse3)
1346. movd %1, %2d
1347. pshufb %1, %3
1348. %elif cpuflag(sse2)
1349. movd %1, %2d
1350. punpcklbw %1, %1
1351. pshuflw %1, %1, 0x0
1352. punpcklqdq %1, %1
1353. %elif cpuflag(mmx2)
1354. movd %1, %2d
1355. punpcklbw %1, %1
1356. pshufw %1, %1, 0x0
1357. %else
1358. movd %1, %2d
1359. punpcklbw %1, %1
1360. punpcklwd %1, %1
1361. punpckldq %1, %1
1362. %endif
1363. %endmacro
  1364. %macro SIMPLE_LOOPFILTER 2
  1365. cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
  1366. %if mmsize == 8 ; mmx/mmxext
  1367. mov cntrq, 2
  1368. %endif
  1369. %if cpuflag(ssse3)
  1370. pxor m0, m0
  1371. %endif
  1372. SPLATB_REG m7, flim, m0 ; splat "flim" into register
  1373. ; set up indexes to address 4 rows
  1374. %if mmsize == 8
  1375. DEFINE_ARGS dst1, mstride, stride, cntr, dst2
  1376. %else
  1377. DEFINE_ARGS dst1, mstride, stride, dst3, dst2
  1378. %endif
  1379. mov strideq, mstrideq
  1380. neg mstrideq
  1381. %ifidn %1, h
  1382. lea dst1q, [dst1q+4*strideq-2]
  1383. %endif
  1384. %if mmsize == 8 ; mmx / mmxext
  1385. .next8px
  1386. %endif
  1387. %ifidn %1, v
  1388. ; read 4 half/full rows of pixels
  1389. mova m0, [dst1q+mstrideq*2] ; p1
  1390. mova m1, [dst1q+mstrideq] ; p0
  1391. mova m2, [dst1q] ; q0
  1392. mova m3, [dst1q+ strideq] ; q1
  1393. %else ; h
  1394. lea dst2q, [dst1q+ strideq]
  1395. %if mmsize == 8 ; mmx/mmxext
  1396. READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
  1397. %else ; sse2
  1398. READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
  1399. %endif
  1400. TRANSPOSE4x4W 0, 1, 2, 3, 4
  1401. %endif
  1402. ; simple_limit
  1403. mova m5, m2 ; m5=backup of q0
  1404. mova m6, m1 ; m6=backup of p0
  1405. psubusb m1, m2 ; p0-q0
  1406. psubusb m2, m6 ; q0-p0
  1407. por m1, m2 ; FFABS(p0-q0)
  1408. paddusb m1, m1 ; m1=FFABS(p0-q0)*2
  1409. mova m4, m3
  1410. mova m2, m0
  1411. psubusb m3, m0 ; q1-p1
  1412. psubusb m0, m4 ; p1-q1
  1413. por m3, m0 ; FFABS(p1-q1)
  1414. mova m0, [pb_80]
  1415. pxor m2, m0
  1416. pxor m4, m0
  1417. psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
  1418. pand m3, [pb_FE]
  1419. psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
  1420. paddusb m3, m1
  1421. psubusb m3, m7
  1422. pxor m1, m1
  1423. pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
  1424. ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
  1425. mova m4, m5
  1426. pxor m5, m0
  1427. pxor m0, m6
  1428. psubsb m5, m0 ; q0-p0 (signed)
  1429. paddsb m2, m5
  1430. paddsb m2, m5
  1431. paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
  1432. pand m2, m3 ; apply filter mask (m3)
  1433. mova m3, [pb_F8]
  1434. mova m1, m2
  1435. paddsb m2, [pb_4] ; f1<<3=a+4
  1436. paddsb m1, [pb_3] ; f2<<3=a+3
  1437. pand m2, m3
  1438. pand m1, m3 ; cache f2<<3
  1439. pxor m0, m0
  1440. pxor m3, m3
  1441. pcmpgtb m0, m2 ; which values are <0?
  1442. psubb m3, m2 ; -f1<<3
  1443. psrlq m2, 3 ; +f1
  1444. psrlq m3, 3 ; -f1
  1445. pand m3, m0
  1446. pandn m0, m2
  1447. psubusb m4, m0
  1448. paddusb m4, m3 ; q0-f1
  1449. pxor m0, m0
  1450. pxor m3, m3
  1451. pcmpgtb m0, m1 ; which values are <0?
  1452. psubb m3, m1 ; -f2<<3
  1453. psrlq m1, 3 ; +f2
  1454. psrlq m3, 3 ; -f2
  1455. pand m3, m0
  1456. pandn m0, m1
  1457. paddusb m6, m0
  1458. psubusb m6, m3 ; p0+f2
  1459. ; store
  1460. %ifidn %1, v
  1461. mova [dst1q], m4
  1462. mova [dst1q+mstrideq], m6
  1463. %else ; h
  1464. inc dst1q
  1465. SBUTTERFLY bw, 6, 4, 0
  1466. %if mmsize == 16 ; sse2
  1467. %if cpuflag(sse4)
  1468. inc dst2q
  1469. %endif
  1470. WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
  1471. lea dst2q, [dst3q+mstrideq+1]
  1472. %if cpuflag(sse4)
  1473. inc dst3q
  1474. %endif
  1475. WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
  1476. %else ; mmx/mmxext
  1477. WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
  1478. %endif
  1479. %endif
  1480. %if mmsize == 8 ; mmx/mmxext
  1481. ; next 8 pixels
  1482. %ifidn %1, v
  1483. add dst1q, 8 ; advance 8 cols = pixels
  1484. %else ; h
  1485. lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
  1486. %endif
  1487. dec cntrq
  1488. jg .next8px
  1489. REP_RET
  1490. %else ; sse2
  1491. RET
  1492. %endif
  1493. %endmacro
; instantiate the simple loop filters; mmx/mmx2 variants are x86-32 only,
; and sse4 only adds a faster horizontal store path (WRITE_8W/pextrw)
1494. %if ARCH_X86_32
1495. INIT_MMX mmx
1496. SIMPLE_LOOPFILTER v, 4
1497. SIMPLE_LOOPFILTER h, 5
1498. INIT_MMX mmx2
1499. SIMPLE_LOOPFILTER v, 4
1500. SIMPLE_LOOPFILTER h, 5
1501. %endif
1502. INIT_XMM sse2
1503. SIMPLE_LOOPFILTER v, 3
1504. SIMPLE_LOOPFILTER h, 5
1505. INIT_XMM ssse3
1506. SIMPLE_LOOPFILTER v, 3
1507. SIMPLE_LOOPFILTER h, 5
1508. INIT_XMM sse4
1509. SIMPLE_LOOPFILTER h, 5
  1510. ;-----------------------------------------------------------------------------
  1511. ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
  1512. ; int flimE, int flimI, int hev_thr);
  1513. ;-----------------------------------------------------------------------------
; vp8_h/v_loop_filter<size>_inner_<opt>: VP8 "normal" (inner-edge) loop
; filter. %1 = h/v direction, %2 = 16 (luma) or 8 (chroma, filters the
; U and V planes via dst/dst8 in one call). Pipeline: load p3..q3 (with
; a transpose for the h case), build the normal_limit and hev masks,
; apply filter_common, and store/transpose back p1,p0,q0,q1.
; On x86-64 sse2 the spill slots live in m8-m12; elsewhere they live on
; a stack scratch area ('pad' bytes, 16-aligned via stack_offset).
1514. %macro INNER_LOOPFILTER 2
1515. %if %2 == 8 ; chroma
1516. cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
1517. %else ; luma
1518. cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
1519. %endif
1520. %if cpuflag(ssse3)
1521. pxor m7, m7 ; zero reg for SPLATB_REG's pshufb path
1522. %endif
1523. %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
1524. %ifidn %1, v ; [3]=hev() result
1525. %assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
1526. %else ; h ; extra storage space for transposes
1527. %assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
1528. %endif
1529. ; splat function arguments
1530. SPLATB_REG m0, flimEq, m7 ; E
1531. SPLATB_REG m1, flimIq, m7 ; I
1532. SPLATB_REG m2, hevthrq, m7 ; hev_thresh
1533. SUB rsp, pad
1534. %define m_flimE [rsp]
1535. %define m_flimI [rsp+mmsize]
1536. %define m_hevthr [rsp+mmsize*2]
1537. %define m_maskres [rsp+mmsize*3]
1538. %define m_p0backup [rsp+mmsize*3]
1539. %define m_q0backup [rsp+mmsize*4]
1540. mova m_flimE, m0
1541. mova m_flimI, m1
1542. mova m_hevthr, m2
1543. %else
1544. %define m_flimE m9
1545. %define m_flimI m10
1546. %define m_hevthr m11
1547. %define m_maskres m12
1548. %define m_p0backup m12
1549. %define m_q0backup m8
1550. ; splat function arguments
1551. SPLATB_REG m_flimE, flimEq, m7 ; E
1552. SPLATB_REG m_flimI, flimIq, m7 ; I
1553. SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
1554. %endif
1555. %if %2 == 8 ; chroma
1556. DEFINE_ARGS dst1, dst8, mstride, stride, dst2
1557. %elif mmsize == 8
1558. DEFINE_ARGS dst1, mstride, stride, dst2, cntr
1559. mov cntrq, 2
1560. %else
1561. DEFINE_ARGS dst1, mstride, stride, dst2, dst8
1562. %endif
1563. mov strideq, mstrideq
1564. neg mstrideq
1565. %ifidn %1, h
1566. lea dst1q, [dst1q+strideq*4-4]
1567. %if %2 == 8 ; chroma
1568. lea dst8q, [dst8q+strideq*4-4]
1569. %endif
1570. %endif
1571. %if mmsize == 8
1572. .next8px:
1573. %endif
1574. ; read
1575. lea dst2q, [dst1q+strideq]
1576. %ifidn %1, v
1577. %if %2 == 8 && mmsize == 16
1578. %define movrow movh ; chroma/sse2: U in low half, V loaded via movhps
1579. %else
1580. %define movrow mova
1581. %endif
1582. movrow m0, [dst1q+mstrideq*4] ; p3
1583. movrow m1, [dst2q+mstrideq*4] ; p2
1584. movrow m2, [dst1q+mstrideq*2] ; p1
1585. movrow m5, [dst2q] ; q1
1586. movrow m6, [dst2q+ strideq*1] ; q2
1587. movrow m7, [dst2q+ strideq*2] ; q3
1588. %if mmsize == 16 && %2 == 8
1589. movhps m0, [dst8q+mstrideq*4]
1590. movhps m2, [dst8q+mstrideq*2]
1591. add dst8q, strideq
1592. movhps m1, [dst8q+mstrideq*4]
1593. movhps m5, [dst8q]
1594. movhps m6, [dst8q+ strideq ]
1595. movhps m7, [dst8q+ strideq*2]
1596. add dst8q, mstrideq
1597. %endif
1598. %elif mmsize == 8 ; mmx/mmxext (h)
1599. ; read 8 rows of 8px each
1600. movu m0, [dst1q+mstrideq*4]
1601. movu m1, [dst2q+mstrideq*4]
1602. movu m2, [dst1q+mstrideq*2]
1603. movu m3, [dst1q+mstrideq ]
1604. movu m4, [dst1q]
1605. movu m5, [dst2q]
1606. movu m6, [dst2q+ strideq ]
1607. ; 8x8 transpose
1608. TRANSPOSE4x4B 0, 1, 2, 3, 7
1609. mova m_q0backup, m1
1610. movu m7, [dst2q+ strideq*2]
1611. TRANSPOSE4x4B 4, 5, 6, 7, 1
1612. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1613. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1614. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1615. mova m1, m_q0backup
1616. mova m_q0backup, m2 ; store q0
1617. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1618. mova m_p0backup, m5 ; store p0
1619. SWAP 1, 4
1620. SWAP 2, 4
1621. SWAP 6, 3
1622. SWAP 5, 3
1623. %else ; sse2 (h)
1624. %if %2 == 16
1625. lea dst8q, [dst1q+ strideq*8]
1626. %endif
1627. ; read 16 rows of 8px each, interleave
1628. movh m0, [dst1q+mstrideq*4]
1629. movh m1, [dst8q+mstrideq*4]
1630. movh m2, [dst1q+mstrideq*2]
1631. movh m5, [dst8q+mstrideq*2]
1632. movh m3, [dst1q+mstrideq ]
1633. movh m6, [dst8q+mstrideq ]
1634. movh m4, [dst1q]
1635. movh m7, [dst8q]
1636. punpcklbw m0, m1 ; A/I
1637. punpcklbw m2, m5 ; C/K
1638. punpcklbw m3, m6 ; D/L
1639. punpcklbw m4, m7 ; E/M
1640. add dst8q, strideq
1641. movh m1, [dst2q+mstrideq*4]
1642. movh m6, [dst8q+mstrideq*4]
1643. movh m5, [dst2q]
1644. movh m7, [dst8q]
1645. punpcklbw m1, m6 ; B/J
1646. punpcklbw m5, m7 ; F/N
1647. movh m6, [dst2q+ strideq ]
1648. movh m7, [dst8q+ strideq ]
1649. punpcklbw m6, m7 ; G/O
1650. ; 8x16 transpose
1651. TRANSPOSE4x4B 0, 1, 2, 3, 7
1652. %ifdef m8
1653. SWAP 1, 8
1654. %else
1655. mova m_q0backup, m1
1656. %endif
1657. movh m7, [dst2q+ strideq*2]
1658. movh m1, [dst8q+ strideq*2]
1659. punpcklbw m7, m1 ; H/P
1660. TRANSPOSE4x4B 4, 5, 6, 7, 1
1661. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
1662. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
1663. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
1664. %ifdef m8
1665. SWAP 1, 8
1666. SWAP 2, 8
1667. %else
1668. mova m1, m_q0backup
1669. mova m_q0backup, m2 ; store q0
1670. %endif
1671. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
1672. %ifdef m12
1673. SWAP 5, 12
1674. %else
1675. mova m_p0backup, m5 ; store p0
1676. %endif
1677. SWAP 1, 4
1678. SWAP 2, 4
1679. SWAP 6, 3
1680. SWAP 5, 3
1681. %endif
1682. ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
1683. mova m4, m1
1684. SWAP 4, 1
1685. psubusb m4, m0 ; p2-p3
1686. psubusb m0, m1 ; p3-p2
1687. por m0, m4 ; abs(p3-p2)
1688. mova m4, m2
1689. SWAP 4, 2
1690. psubusb m4, m1 ; p1-p2
1691. psubusb m1, m2 ; p2-p1
1692. por m1, m4 ; abs(p2-p1)
1693. mova m4, m6
1694. SWAP 4, 6
1695. psubusb m4, m7 ; q2-q3
1696. psubusb m7, m6 ; q3-q2
1697. por m7, m4 ; abs(q3-q2)
1698. mova m4, m5
1699. SWAP 4, 5
1700. psubusb m4, m6 ; q1-q2
1701. psubusb m6, m5 ; q2-q1
1702. por m6, m4 ; abs(q2-q1)
1703. %if notcpuflag(mmx2)
1704. ; no pmaxub before mmxext: compare each abs() against I individually
1705. mova m4, m_flimI
1706. pxor m3, m3
1707. psubusb m0, m4
1708. psubusb m1, m4
1709. psubusb m7, m4
1710. psubusb m6, m4
1711. pcmpeqb m0, m3 ; abs(p3-p2) <= I
1712. pcmpeqb m1, m3 ; abs(p2-p1) <= I
1713. pcmpeqb m7, m3 ; abs(q3-q2) <= I
1714. pcmpeqb m6, m3 ; abs(q2-q1) <= I
1715. pand m0, m1
1716. pand m7, m6
1717. pand m0, m7
1718. %else ; mmxext/sse2
1719. pmaxub m0, m1
1720. pmaxub m6, m7
1721. pmaxub m0, m6
1722. %endif
1723. ; normal_limit and high_edge_variance for p1-p0, q1-q0
1724. SWAP 7, 3 ; now m7 is zero
1725. %ifidn %1, v
1726. movrow m3, [dst1q+mstrideq ] ; p0
1727. %if mmsize == 16 && %2 == 8
1728. movhps m3, [dst8q+mstrideq ]
1729. %endif
1730. %elifdef m12
1731. SWAP 3, 12
1732. %else
1733. mova m3, m_p0backup
1734. %endif
1735. mova m1, m2
1736. SWAP 1, 2
1737. mova m6, m3
1738. SWAP 3, 6
1739. psubusb m1, m3 ; p1-p0
1740. psubusb m6, m2 ; p0-p1
1741. por m1, m6 ; abs(p1-p0)
1742. %if notcpuflag(mmx2)
1743. mova m6, m1
1744. psubusb m1, m4
1745. psubusb m6, m_hevthr
1746. pcmpeqb m1, m7 ; abs(p1-p0) <= I
1747. pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
1748. pand m0, m1
1749. mova m_maskres, m6
1750. %else ; mmxext/sse2
1751. pmaxub m0, m1 ; max_I
1752. SWAP 1, 4 ; max_hev_thresh
1753. %endif
1754. SWAP 6, 4 ; now m6 is I
1755. %ifidn %1, v
1756. movrow m4, [dst1q] ; q0
1757. %if mmsize == 16 && %2 == 8
1758. movhps m4, [dst8q]
1759. %endif
1760. %elifdef m8
1761. SWAP 4, 8
1762. %else
1763. mova m4, m_q0backup
1764. %endif
1765. mova m1, m4
1766. SWAP 1, 4
1767. mova m7, m5
1768. SWAP 7, 5
1769. psubusb m1, m5 ; q0-q1
1770. psubusb m7, m4 ; q1-q0
1771. por m1, m7 ; abs(q1-q0)
1772. %if notcpuflag(mmx2)
1773. mova m7, m1
1774. psubusb m1, m6
1775. psubusb m7, m_hevthr
1776. pxor m6, m6
1777. pcmpeqb m1, m6 ; abs(q1-q0) <= I
1778. pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
1779. mova m6, m_maskres
1780. pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
1781. pand m6, m7
1782. %else ; mmxext/sse2
1783. pxor m7, m7
1784. pmaxub m0, m1
1785. pmaxub m6, m1
1786. psubusb m0, m_flimI
1787. psubusb m6, m_hevthr
1788. pcmpeqb m0, m7 ; max(abs(..)) <= I
1789. pcmpeqb m6, m7 ; !(max(abs..) > thresh)
1790. %endif
1791. %ifdef m12
1792. SWAP 6, 12
1793. %else
1794. mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
1795. %endif
1796. ; simple_limit
1797. mova m1, m3
1798. SWAP 1, 3
1799. mova m6, m4 ; keep copies of p0/q0 around for later use
1800. SWAP 6, 4
1801. psubusb m1, m4 ; p0-q0
1802. psubusb m6, m3 ; q0-p0
1803. por m1, m6 ; abs(q0-p0)
1804. paddusb m1, m1 ; m1=2*abs(q0-p0)
1805. mova m7, m2
1806. SWAP 7, 2
1807. mova m6, m5
1808. SWAP 6, 5
1809. psubusb m7, m5 ; p1-q1
1810. psubusb m6, m2 ; q1-p1
1811. por m7, m6 ; abs(q1-p1)
1812. pxor m6, m6
1813. pand m7, [pb_FE]
1814. psrlq m7, 1 ; abs(q1-p1)/2
1815. paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
1816. psubusb m7, m_flimE
1817. pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
1818. pand m0, m7 ; normal_limit result
1819. ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
1820. %ifdef m8 ; x86-64 && sse2
1821. mova m8, [pb_80]
1822. %define m_pb_80 m8
1823. %else ; x86-32 or mmx/mmxext
1824. %define m_pb_80 [pb_80]
1825. %endif
1826. mova m1, m4
1827. mova m7, m3
1828. pxor m1, m_pb_80 ; bias to signed range for psubsb/paddsb
1829. pxor m7, m_pb_80
1830. psubsb m1, m7 ; (signed) q0-p0
1831. mova m6, m2
1832. mova m7, m5
1833. pxor m6, m_pb_80
1834. pxor m7, m_pb_80
1835. psubsb m6, m7 ; (signed) p1-q1
1836. mova m7, m_maskres
1837. pandn m7, m6
1838. paddsb m7, m1
1839. paddsb m7, m1
1840. paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
1841. pand m7, m0
1842. mova m1, [pb_F8]
1843. mova m6, m7
1844. paddsb m7, [pb_3]
1845. paddsb m6, [pb_4]
1846. pand m7, m1
1847. pand m6, m1
1848. pxor m1, m1
1849. pxor m0, m0
1850. pcmpgtb m1, m7 ; which values are <0?
1851. psubb m0, m7
1852. psrlq m7, 3 ; +f2
1853. psrlq m0, 3 ; -f2
1854. pand m0, m1
1855. pandn m1, m7
1856. psubusb m3, m0
1857. paddusb m3, m1 ; p0+f2
1858. pxor m1, m1
1859. pxor m0, m0
1860. pcmpgtb m0, m6 ; which values are <0?
1861. psubb m1, m6
1862. psrlq m6, 3 ; +f1
1863. psrlq m1, 3 ; -f1
1864. pand m1, m0
1865. pandn m0, m6
1866. psubusb m4, m0
1867. paddusb m4, m1 ; q0-f1
1868. %ifdef m12
1869. SWAP 6, 12
1870. %else
1871. mova m6, m_maskres
1872. %endif
1873. %if notcpuflag(mmx2)
1874. mova m7, [pb_1] ; no pavgb pre-mmxext: emulate (f+1)>>1 manually
1875. %else ; mmxext/sse2
1876. pxor m7, m7
1877. %endif
1878. pand m0, m6
1879. pand m1, m6
1880. %if notcpuflag(mmx2)
1881. paddusb m0, m7
1882. pand m1, [pb_FE]
1883. pandn m7, m0
1884. psrlq m1, 1
1885. psrlq m7, 1
1886. SWAP 0, 7
1887. %else ; mmxext/sse2
1888. psubusb m1, [pb_1]
1889. pavgb m0, m7 ; a
1890. pavgb m1, m7 ; -a
1891. %endif
1892. psubusb m5, m0
1893. psubusb m2, m1
1894. paddusb m5, m1 ; q1-a
1895. paddusb m2, m0 ; p1+a
1896. ; store
1897. %ifidn %1, v
1898. movrow [dst1q+mstrideq*2], m2
1899. movrow [dst1q+mstrideq ], m3
1900. movrow [dst1q], m4
1901. movrow [dst1q+ strideq ], m5
1902. %if mmsize == 16 && %2 == 8
1903. movhps [dst8q+mstrideq*2], m2
1904. movhps [dst8q+mstrideq ], m3
1905. movhps [dst8q], m4
1906. movhps [dst8q+ strideq ], m5
1907. %endif
1908. %else ; h
1909. add dst1q, 2
1910. add dst2q, 2
1911. ; 4x8/16 transpose
1912. TRANSPOSE4x4B 2, 3, 4, 5, 6
1913. %if mmsize == 8 ; mmx/mmxext (h)
1914. WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
1915. %else ; sse2 (h)
1916. lea dst8q, [dst8q+mstrideq +2]
1917. WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
1918. %endif
1919. %endif
1920. %if mmsize == 8
1921. %if %2 == 8 ; chroma
1922. %ifidn %1, h
1923. sub dst1q, 2
1924. %endif
1925. cmp dst1q, dst8q ; second pass (V plane) done yet?
1926. mov dst1q, dst8q
1927. jnz .next8px
1928. %else
1929. %ifidn %1, h
1930. lea dst1q, [dst1q+ strideq*8-2]
1931. %else ; v
1932. add dst1q, 8
1933. %endif
1934. dec cntrq
1935. jg .next8px
1936. %endif
1937. %endif
1938. %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
1939. ADD rsp, pad
1940. %endif
1941. RET
1942. %endmacro
; instantiate the inner loop filters for each direction (h/v) and plane
; size (16=luma, 8=chroma); mmx/mmx2 variants are x86-32 only
1942. %if ARCH_X86_32
1943. INIT_MMX mmx
1944. INNER_LOOPFILTER v, 16
1945. INNER_LOOPFILTER h, 16
1946. INNER_LOOPFILTER v, 8
1947. INNER_LOOPFILTER h, 8
1948. INIT_MMX mmx2
1949. INNER_LOOPFILTER v, 16
1950. INNER_LOOPFILTER h, 16
1951. INNER_LOOPFILTER v, 8
1952. INNER_LOOPFILTER h, 8
1953. %endif
1954. INIT_XMM sse2
1955. INNER_LOOPFILTER v, 16
1956. INNER_LOOPFILTER h, 16
1957. INNER_LOOPFILTER v, 8
1958. INNER_LOOPFILTER h, 8
1959. INIT_XMM ssse3
1960. INNER_LOOPFILTER v, 16
1961. INNER_LOOPFILTER h, 16
1962. INNER_LOOPFILTER v, 8
1963. INNER_LOOPFILTER h, 8
  1964. ;-----------------------------------------------------------------------------
  1965. ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
  1966. ; int flimE, int flimI, int hev_thr);
  1967. ;-----------------------------------------------------------------------------
  1968. %macro MBEDGE_LOOPFILTER 2
  1969. %if %2 == 8 ; chroma
  1970. cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
  1971. %else ; luma
  1972. cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
  1973. %endif
  1974. %if cpuflag(ssse3)
  1975. pxor m7, m7
  1976. %endif
  1977. %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
  1978. %if mmsize == 16 ; [3]=hev() result
  1979. ; [4]=filter tmp result
  1980. ; [5]/[6] = p2/q2 backup
  1981. ; [7]=lim_res sign result
  1982. %assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
  1983. %else ; 8 ; extra storage space for transposes
  1984. %assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
  1985. %endif
  1986. ; splat function arguments
  1987. SPLATB_REG m0, flimEq, m7 ; E
  1988. SPLATB_REG m1, flimIq, m7 ; I
  1989. SPLATB_REG m2, hevthrq, m7 ; hev_thresh
  1990. SUB rsp, pad
  1991. %define m_flimE [rsp]
  1992. %define m_flimI [rsp+mmsize]
  1993. %define m_hevthr [rsp+mmsize*2]
  1994. %define m_maskres [rsp+mmsize*3]
  1995. %define m_limres [rsp+mmsize*4]
  1996. %define m_p0backup [rsp+mmsize*3]
  1997. %define m_q0backup [rsp+mmsize*4]
  1998. %define m_p2backup [rsp+mmsize*5]
  1999. %define m_q2backup [rsp+mmsize*6]
  2000. %if mmsize == 16
  2001. %define m_limsign [rsp]
  2002. %else
  2003. %define m_limsign [rsp+mmsize*7]
  2004. %endif
  2005. mova m_flimE, m0
  2006. mova m_flimI, m1
  2007. mova m_hevthr, m2
  2008. %else ; sse2 on x86-64
  2009. %define m_flimE m9
  2010. %define m_flimI m10
  2011. %define m_hevthr m11
  2012. %define m_maskres m12
  2013. %define m_limres m8
  2014. %define m_p0backup m12
  2015. %define m_q0backup m8
  2016. %define m_p2backup m13
  2017. %define m_q2backup m14
  2018. %define m_limsign m9
  2019. ; splat function arguments
  2020. SPLATB_REG m_flimE, flimEq, m7 ; E
  2021. SPLATB_REG m_flimI, flimIq, m7 ; I
  2022. SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
  2023. %endif
  2024. %if %2 == 8 ; chroma
  2025. DEFINE_ARGS dst1, dst8, mstride, stride, dst2
  2026. %elif mmsize == 8
  2027. DEFINE_ARGS dst1, mstride, stride, dst2, cntr
  2028. mov cntrq, 2
  2029. %else
  2030. DEFINE_ARGS dst1, mstride, stride, dst2, dst8
  2031. %endif
  2032. mov strideq, mstrideq
  2033. neg mstrideq
  2034. %ifidn %1, h
  2035. lea dst1q, [dst1q+strideq*4-4]
  2036. %if %2 == 8 ; chroma
  2037. lea dst8q, [dst8q+strideq*4-4]
  2038. %endif
  2039. %endif
  2040. %if mmsize == 8
  2041. .next8px:
  2042. %endif
  2043. ; read
  2044. lea dst2q, [dst1q+ strideq ]
  2045. %ifidn %1, v
  2046. %if %2 == 8 && mmsize == 16
  2047. %define movrow movh
  2048. %else
  2049. %define movrow mova
  2050. %endif
  2051. movrow m0, [dst1q+mstrideq*4] ; p3
  2052. movrow m1, [dst2q+mstrideq*4] ; p2
  2053. movrow m2, [dst1q+mstrideq*2] ; p1
  2054. movrow m5, [dst2q] ; q1
  2055. movrow m6, [dst2q+ strideq ] ; q2
  2056. movrow m7, [dst2q+ strideq*2] ; q3
  2057. %if mmsize == 16 && %2 == 8
  2058. movhps m0, [dst8q+mstrideq*4]
  2059. movhps m2, [dst8q+mstrideq*2]
  2060. add dst8q, strideq
  2061. movhps m1, [dst8q+mstrideq*4]
  2062. movhps m5, [dst8q]
  2063. movhps m6, [dst8q+ strideq ]
  2064. movhps m7, [dst8q+ strideq*2]
  2065. add dst8q, mstrideq
  2066. %endif
  2067. %elif mmsize == 8 ; mmx/mmxext (h)
  2068. ; read 8 rows of 8px each
  2069. movu m0, [dst1q+mstrideq*4]
  2070. movu m1, [dst2q+mstrideq*4]
  2071. movu m2, [dst1q+mstrideq*2]
  2072. movu m3, [dst1q+mstrideq ]
  2073. movu m4, [dst1q]
  2074. movu m5, [dst2q]
  2075. movu m6, [dst2q+ strideq ]
  2076. ; 8x8 transpose
  2077. TRANSPOSE4x4B 0, 1, 2, 3, 7
  2078. mova m_q0backup, m1
  2079. movu m7, [dst2q+ strideq*2]
  2080. TRANSPOSE4x4B 4, 5, 6, 7, 1
  2081. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  2082. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  2083. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  2084. mova m1, m_q0backup
  2085. mova m_q0backup, m2 ; store q0
  2086. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  2087. mova m_p0backup, m5 ; store p0
  2088. SWAP 1, 4
  2089. SWAP 2, 4
  2090. SWAP 6, 3
  2091. SWAP 5, 3
  2092. %else ; sse2 (h)
  2093. %if %2 == 16
  2094. lea dst8q, [dst1q+ strideq*8 ]
  2095. %endif
  2096. ; read 16 rows of 8px each, interleave
  2097. movh m0, [dst1q+mstrideq*4]
  2098. movh m1, [dst8q+mstrideq*4]
  2099. movh m2, [dst1q+mstrideq*2]
  2100. movh m5, [dst8q+mstrideq*2]
  2101. movh m3, [dst1q+mstrideq ]
  2102. movh m6, [dst8q+mstrideq ]
  2103. movh m4, [dst1q]
  2104. movh m7, [dst8q]
  2105. punpcklbw m0, m1 ; A/I
  2106. punpcklbw m2, m5 ; C/K
  2107. punpcklbw m3, m6 ; D/L
  2108. punpcklbw m4, m7 ; E/M
  2109. add dst8q, strideq
  2110. movh m1, [dst2q+mstrideq*4]
  2111. movh m6, [dst8q+mstrideq*4]
  2112. movh m5, [dst2q]
  2113. movh m7, [dst8q]
  2114. punpcklbw m1, m6 ; B/J
  2115. punpcklbw m5, m7 ; F/N
  2116. movh m6, [dst2q+ strideq ]
  2117. movh m7, [dst8q+ strideq ]
  2118. punpcklbw m6, m7 ; G/O
  2119. ; 8x16 transpose
  2120. TRANSPOSE4x4B 0, 1, 2, 3, 7
  2121. %ifdef m8
  2122. SWAP 1, 8
  2123. %else
  2124. mova m_q0backup, m1
  2125. %endif
  2126. movh m7, [dst2q+ strideq*2]
  2127. movh m1, [dst8q+ strideq*2]
  2128. punpcklbw m7, m1 ; H/P
  2129. TRANSPOSE4x4B 4, 5, 6, 7, 1
  2130. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  2131. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  2132. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  2133. %ifdef m8
  2134. SWAP 1, 8
  2135. SWAP 2, 8
  2136. %else
  2137. mova m1, m_q0backup
  2138. mova m_q0backup, m2 ; store q0
  2139. %endif
  2140. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  2141. %ifdef m12
  2142. SWAP 5, 12
  2143. %else
  2144. mova m_p0backup, m5 ; store p0
  2145. %endif
  2146. SWAP 1, 4
  2147. SWAP 2, 4
  2148. SWAP 6, 3
  2149. SWAP 5, 3
  2150. %endif
  2151. ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
  2152. mova m4, m1
  2153. SWAP 4, 1
  2154. psubusb m4, m0 ; p2-p3
  2155. psubusb m0, m1 ; p3-p2
  2156. por m0, m4 ; abs(p3-p2)
  2157. mova m4, m2
  2158. SWAP 4, 2
  2159. psubusb m4, m1 ; p1-p2
  2160. mova m_p2backup, m1
  2161. psubusb m1, m2 ; p2-p1
  2162. por m1, m4 ; abs(p2-p1)
  2163. mova m4, m6
  2164. SWAP 4, 6
  2165. psubusb m4, m7 ; q2-q3
  2166. psubusb m7, m6 ; q3-q2
  2167. por m7, m4 ; abs(q3-q2)
  2168. mova m4, m5
  2169. SWAP 4, 5
  2170. psubusb m4, m6 ; q1-q2
  2171. mova m_q2backup, m6
  2172. psubusb m6, m5 ; q2-q1
  2173. por m6, m4 ; abs(q2-q1)
  2174. %if notcpuflag(mmx2)
  2175. mova m4, m_flimI
  2176. pxor m3, m3
  2177. psubusb m0, m4
  2178. psubusb m1, m4
  2179. psubusb m7, m4
  2180. psubusb m6, m4
  2181. pcmpeqb m0, m3 ; abs(p3-p2) <= I
  2182. pcmpeqb m1, m3 ; abs(p2-p1) <= I
  2183. pcmpeqb m7, m3 ; abs(q3-q2) <= I
  2184. pcmpeqb m6, m3 ; abs(q2-q1) <= I
  2185. pand m0, m1
  2186. pand m7, m6
  2187. pand m0, m7
  2188. %else ; mmxext/sse2
  2189. pmaxub m0, m1
  2190. pmaxub m6, m7
  2191. pmaxub m0, m6
  2192. %endif
  2193. ; normal_limit and high_edge_variance for p1-p0, q1-q0
  2194. SWAP 7, 3 ; now m7 is zero
  2195. %ifidn %1, v
  2196. movrow m3, [dst1q+mstrideq ] ; p0
  2197. %if mmsize == 16 && %2 == 8
  2198. movhps m3, [dst8q+mstrideq ]
  2199. %endif
  2200. %elifdef m12
  2201. SWAP 3, 12
  2202. %else
  2203. mova m3, m_p0backup
  2204. %endif
  2205. mova m1, m2
  2206. SWAP 1, 2
  2207. mova m6, m3
  2208. SWAP 3, 6
  2209. psubusb m1, m3 ; p1-p0
  2210. psubusb m6, m2 ; p0-p1
  2211. por m1, m6 ; abs(p1-p0)
  2212. %if notcpuflag(mmx2)
  2213. mova m6, m1
  2214. psubusb m1, m4
  2215. psubusb m6, m_hevthr
  2216. pcmpeqb m1, m7 ; abs(p1-p0) <= I
  2217. pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
  2218. pand m0, m1
  2219. mova m_maskres, m6
  2220. %else ; mmxext/sse2
  2221. pmaxub m0, m1 ; max_I
  2222. SWAP 1, 4 ; max_hev_thresh
  2223. %endif
  2224. SWAP 6, 4 ; now m6 is I
  2225. %ifidn %1, v
  2226. movrow m4, [dst1q] ; q0
  2227. %if mmsize == 16 && %2 == 8
  2228. movhps m4, [dst8q]
  2229. %endif
  2230. %elifdef m8
  2231. SWAP 4, 8
  2232. %else
  2233. mova m4, m_q0backup
  2234. %endif
  2235. mova m1, m4
  2236. SWAP 1, 4
  2237. mova m7, m5
  2238. SWAP 7, 5
  2239. psubusb m1, m5 ; q0-q1
  2240. psubusb m7, m4 ; q1-q0
  2241. por m1, m7 ; abs(q1-q0)
  2242. %if notcpuflag(mmx2)
  2243. mova m7, m1
  2244. psubusb m1, m6
  2245. psubusb m7, m_hevthr
  2246. pxor m6, m6
  2247. pcmpeqb m1, m6 ; abs(q1-q0) <= I
  2248. pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
  2249. mova m6, m_maskres
  2250. pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
  2251. pand m6, m7
  2252. %else ; mmxext/sse2
  2253. pxor m7, m7
  2254. pmaxub m0, m1
  2255. pmaxub m6, m1
  2256. psubusb m0, m_flimI
  2257. psubusb m6, m_hevthr
  2258. pcmpeqb m0, m7 ; max(abs(..)) <= I
  2259. pcmpeqb m6, m7 ; !(max(abs..) > thresh)
  2260. %endif
  2261. %ifdef m12
  2262. SWAP 6, 12
  2263. %else
  2264. mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
  2265. %endif
  2266. ; simple_limit
  2267. mova m1, m3
  2268. SWAP 1, 3
  2269. mova m6, m4 ; keep copies of p0/q0 around for later use
  2270. SWAP 6, 4
  2271. psubusb m1, m4 ; p0-q0
  2272. psubusb m6, m3 ; q0-p0
  2273. por m1, m6 ; abs(q0-p0)
  2274. paddusb m1, m1 ; m1=2*abs(q0-p0)
  2275. mova m7, m2
  2276. SWAP 7, 2
  2277. mova m6, m5
  2278. SWAP 6, 5
  2279. psubusb m7, m5 ; p1-q1
  2280. psubusb m6, m2 ; q1-p1
  2281. por m7, m6 ; abs(q1-p1)
  2282. pxor m6, m6
  2283. pand m7, [pb_FE]
  2284. psrlq m7, 1 ; abs(q1-p1)/2
  2285. paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
  2286. psubusb m7, m_flimE
  2287. pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
  2288. pand m0, m7 ; normal_limit result
  2289. ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
  2290. %ifdef m8 ; x86-64 && sse2
  2291. mova m8, [pb_80]
  2292. %define m_pb_80 m8
  2293. %else ; x86-32 or mmx/mmxext
  2294. %define m_pb_80 [pb_80]
  2295. %endif
  2296. mova m1, m4
  2297. mova m7, m3
  2298. pxor m1, m_pb_80
  2299. pxor m7, m_pb_80
  2300. psubsb m1, m7 ; (signed) q0-p0
  2301. mova m6, m2
  2302. mova m7, m5
  2303. pxor m6, m_pb_80
  2304. pxor m7, m_pb_80
  2305. psubsb m6, m7 ; (signed) p1-q1
  2306. mova m7, m_maskres
  2307. paddsb m6, m1
  2308. paddsb m6, m1
  2309. paddsb m6, m1
  2310. pand m6, m0
  2311. %ifdef m8
  2312. mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
  2313. pand m_limres, m7
  2314. %else
  2315. mova m0, m6
  2316. pand m0, m7
  2317. mova m_limres, m0
  2318. %endif
  2319. pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
  2320. mova m1, [pb_F8]
  2321. mova m6, m7
  2322. paddsb m7, [pb_3]
  2323. paddsb m6, [pb_4]
  2324. pand m7, m1
  2325. pand m6, m1
  2326. pxor m1, m1
  2327. pxor m0, m0
  2328. pcmpgtb m1, m7
  2329. psubb m0, m7
  2330. psrlq m7, 3 ; +f2
  2331. psrlq m0, 3 ; -f2
  2332. pand m0, m1
  2333. pandn m1, m7
  2334. psubusb m3, m0
  2335. paddusb m3, m1 ; p0+f2
  2336. pxor m1, m1
  2337. pxor m0, m0
  2338. pcmpgtb m0, m6
  2339. psubb m1, m6
  2340. psrlq m6, 3 ; +f1
  2341. psrlq m1, 3 ; -f1
  2342. pand m1, m0
  2343. pandn m0, m6
  2344. psubusb m4, m0
  2345. paddusb m4, m1 ; q0-f1
  2346. ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
  2347. %if cpuflag(ssse3)
  2348. mova m7, [pb_1]
  2349. %else
  2350. mova m7, [pw_63]
  2351. %endif
  2352. %ifdef m8
  2353. SWAP 1, 8
  2354. %else
  2355. mova m1, m_limres
  2356. %endif
  2357. pxor m0, m0
  2358. mova m6, m1
  2359. pcmpgtb m0, m1 ; which are negative
  2360. %if cpuflag(ssse3)
  2361. punpcklbw m6, m7 ; interleave with "1" for rounding
  2362. punpckhbw m1, m7
  2363. %else
  2364. punpcklbw m6, m0 ; signed byte->word
  2365. punpckhbw m1, m0
  2366. %endif
  2367. mova m_limsign, m0
  2368. %if cpuflag(ssse3)
  2369. mova m7, [pb_27_63]
  2370. %ifndef m8
  2371. mova m_limres, m1
  2372. %endif
  2373. %ifdef m10
  2374. SWAP 0, 10 ; don't lose lim_sign copy
  2375. %endif
  2376. mova m0, m7
  2377. pmaddubsw m7, m6
  2378. SWAP 6, 7
  2379. pmaddubsw m0, m1
  2380. SWAP 1, 0
  2381. %ifdef m10
  2382. SWAP 0, 10
  2383. %else
  2384. mova m0, m_limsign
  2385. %endif
  2386. %else
  2387. mova m_maskres, m6 ; backup for later in filter
  2388. mova m_limres, m1
  2389. pmullw m6, [pw_27]
  2390. pmullw m1, [pw_27]
  2391. paddw m6, m7
  2392. paddw m1, m7
  2393. %endif
  2394. psraw m6, 7
  2395. psraw m1, 7
  2396. packsswb m6, m1 ; a0
  2397. pxor m1, m1
  2398. psubb m1, m6
  2399. pand m1, m0 ; -a0
  2400. pandn m0, m6 ; +a0
  2401. %if cpuflag(ssse3)
  2402. mova m6, [pb_18_63] ; pipelining
  2403. %endif
  2404. psubusb m3, m1
  2405. paddusb m4, m1
  2406. paddusb m3, m0 ; p0+a0
  2407. psubusb m4, m0 ; q0-a0
  2408. %if cpuflag(ssse3)
  2409. SWAP 6, 7
  2410. %ifdef m10
  2411. SWAP 1, 10
  2412. %else
  2413. mova m1, m_limres
  2414. %endif
  2415. mova m0, m7
  2416. pmaddubsw m7, m6
  2417. SWAP 6, 7
  2418. pmaddubsw m0, m1
  2419. SWAP 1, 0
  2420. %ifdef m10
  2421. SWAP 0, 10
  2422. %endif
  2423. mova m0, m_limsign
  2424. %else
  2425. mova m6, m_maskres
  2426. mova m1, m_limres
  2427. pmullw m6, [pw_18]
  2428. pmullw m1, [pw_18]
  2429. paddw m6, m7
  2430. paddw m1, m7
  2431. %endif
  2432. mova m0, m_limsign
  2433. psraw m6, 7
  2434. psraw m1, 7
  2435. packsswb m6, m1 ; a1
  2436. pxor m1, m1
  2437. psubb m1, m6
  2438. pand m1, m0 ; -a1
  2439. pandn m0, m6 ; +a1
  2440. %if cpuflag(ssse3)
  2441. mova m6, [pb_9_63]
  2442. %endif
  2443. psubusb m2, m1
  2444. paddusb m5, m1
  2445. paddusb m2, m0 ; p1+a1
  2446. psubusb m5, m0 ; q1-a1
  2447. %if cpuflag(ssse3)
  2448. SWAP 6, 7
  2449. %ifdef m10
  2450. SWAP 1, 10
  2451. %else
  2452. mova m1, m_limres
  2453. %endif
  2454. mova m0, m7
  2455. pmaddubsw m7, m6
  2456. SWAP 6, 7
  2457. pmaddubsw m0, m1
  2458. SWAP 1, 0
  2459. %else
  2460. %ifdef m8
  2461. SWAP 6, 12
  2462. SWAP 1, 8
  2463. %else
  2464. mova m6, m_maskres
  2465. mova m1, m_limres
  2466. %endif
  2467. pmullw m6, [pw_9]
  2468. pmullw m1, [pw_9]
  2469. paddw m6, m7
  2470. paddw m1, m7
  2471. %endif
  2472. %ifdef m9
  2473. SWAP 7, 9
  2474. %else
  2475. mova m7, m_limsign
  2476. %endif
  2477. psraw m6, 7
  2478. psraw m1, 7
  2479. packsswb m6, m1 ; a1
  2480. pxor m0, m0
  2481. psubb m0, m6
  2482. pand m0, m7 ; -a1
  2483. pandn m7, m6 ; +a1
  2484. %ifdef m8
  2485. SWAP 1, 13
  2486. SWAP 6, 14
  2487. %else
  2488. mova m1, m_p2backup
  2489. mova m6, m_q2backup
  2490. %endif
  2491. psubusb m1, m0
  2492. paddusb m6, m0
  2493. paddusb m1, m7 ; p1+a1
  2494. psubusb m6, m7 ; q1-a1
  2495. ; store
  2496. %ifidn %1, v
  2497. movrow [dst2q+mstrideq*4], m1
  2498. movrow [dst1q+mstrideq*2], m2
  2499. movrow [dst1q+mstrideq ], m3
  2500. movrow [dst1q], m4
  2501. movrow [dst2q], m5
  2502. movrow [dst2q+ strideq ], m6
  2503. %if mmsize == 16 && %2 == 8
  2504. add dst8q, mstrideq
  2505. movhps [dst8q+mstrideq*2], m1
  2506. movhps [dst8q+mstrideq ], m2
  2507. movhps [dst8q], m3
  2508. add dst8q, strideq
  2509. movhps [dst8q], m4
  2510. movhps [dst8q+ strideq ], m5
  2511. movhps [dst8q+ strideq*2], m6
  2512. %endif
  2513. %else ; h
  2514. inc dst1q
  2515. inc dst2q
  2516. ; 4x8/16 transpose
  2517. TRANSPOSE4x4B 1, 2, 3, 4, 0
  2518. SBUTTERFLY bw, 5, 6, 0
  2519. %if mmsize == 8 ; mmx/mmxext (h)
  2520. WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
  2521. add dst1q, 4
  2522. WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
  2523. %else ; sse2 (h)
  2524. lea dst8q, [dst8q+mstrideq+1]
  2525. WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
  2526. lea dst1q, [dst2q+mstrideq+4]
  2527. lea dst8q, [dst8q+mstrideq+4]
  2528. %if cpuflag(sse4)
  2529. add dst2q, 4
  2530. %endif
  2531. WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
  2532. %if cpuflag(sse4)
  2533. lea dst2q, [dst8q+ strideq ]
  2534. %endif
  2535. WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
  2536. %endif
  2537. %endif
  2538. %if mmsize == 8
  2539. %if %2 == 8 ; chroma
  2540. %ifidn %1, h
  2541. sub dst1q, 5
  2542. %endif
  2543. cmp dst1q, dst8q
  2544. mov dst1q, dst8q
  2545. jnz .next8px
  2546. %else
  2547. %ifidn %1, h
  2548. lea dst1q, [dst1q+ strideq*8-5]
  2549. %else ; v
  2550. add dst1q, 8
  2551. %endif
  2552. dec cntrq
  2553. jg .next8px
  2554. %endif
  2555. %endif
  2556. %ifndef m8 ; sse2 on x86-32 or mmx/mmxext
  2557. ADD rsp, pad
  2558. %endif
  2559. RET
  2560. %endmacro
;------------------------------------------------------------------------------
; Instantiate the macroblock-edge loopfilter for every supported ISA.
; Macro arguments: %1 = edge direction ('v' or 'h' — selects the row-wise vs.
; transposed load/store paths inside the macro), %2 = block width in pixels
; (16 = luma, 8 = chroma; the macro tests "%2 == 8 ; chroma").
;------------------------------------------------------------------------------
; MMX/MMXEXT versions (mmsize == 8) are only useful on x86-32; x86-64
; always has SSE2 available, so they are compiled out there.
%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
INIT_MMX mmx2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
%endif
INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
; SSE4 only differs from SSSE3 inside the horizontal write-out path
; (the cpuflag(sse4) branches around WRITE_8W), so only the 'h'
; variants are instantiated; 'v' falls back to the ssse3 versions.
INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8