You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

2782 lines
78KB

  1. ;******************************************************************************
  2. ;* VP8 MMXEXT optimizations
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;*
  6. ;* This file is part of Libav.
  7. ;*
  8. ;* Libav is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* Libav is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with Libav; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; VP8 subpel interpolation filter coefficients (RFC 6386), replicated for SIMD:
;   *_hw_m: word pairs for pmaddwd   (horizontal, MMX/SSE2)
;   *_hb_m: byte pairs for pmaddubsw (horizontal, SSSE3)
;   *_v_m:  plain words for pmullw   (vertical / generic)
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

; note the tap pairing here: (t0,t5), (t1,t2), (t3,t4) per row, matching the
; pixel interleaving done by filter_h6_shuf1/2/3 below
sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

; bilinear weights: row (k-1) holds weight k (vw) or the pair (8-k, k) (vb)
bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
; position-independent builds address the tables through a pic register,
; loaded with "lea picregq, [table_m]" inside each function
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

; pshufb masks pairing neighbouring source pixels for pmaddubsw
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------

; SSSE3 4- and 6-tap subpel filters (horizontal and vertical), built on
; pmaddubsw with byte-pair coefficients. %1 = block width (4 with MMX regs,
; 8 with XMM regs).
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]                     ; 6-tap filters are 3*16 bytes apart
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]                    ; filter window starts 2 px left
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]                    ; (x*256+0x4000)>>15 == (x+64)>>7
    packuswb  m0, m0
    movh  [dstq], m0                          ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                              ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4                           ; 4-tap filters are 2*16 bytes apart
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16]  ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]                    ; filter window starts 1 px left
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2                          ; round and shift by 7
    packuswb  m0, m0
    movh  [dstq], m0                          ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                              ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+ srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq]         ; read new row
    mova      m4, m0
    mova      m0, m1                          ; shift line history down by one
    punpcklbw m4, m1                          ; interleave rows 0/1 for pmaddubsw
    mova      m1, m2
    punpcklbw m2, m3                          ; interleave rows 2/3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                             ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]    ; myq now points into the filter table

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq]         ; read new row
    mova      m6, m0
    punpcklbw m6, m5                          ; rows 0/5 -> taps (t0,t5)
    mova      m0, m1
    punpcklbw m1, m2                          ; rows 1/2 -> taps (t1,t2)
    mova      m7, m3
    punpcklbw m7, m4                          ; rows 3/4 -> taps (t3,t4)
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2                          ; shift line history down by one
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                             ; next row
    jg .nextrow
    REP_RET
%endmacro
; instantiate the SSSE3 subpel filters for 4-wide (MMX regs) and 8-wide (XMM)
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter
; MMXEXT path: no pmaddubsw available, so bytes are widened to words and
; filtered two pixels at a time with pmaddwd on word-pair coefficients.
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]                    ; rounding constant for >>7
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]                   ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                        ; byte ABCD..
    punpcklbw mm1, mm6                        ; byte->word ABCD
    pshufw    mm0, mm2, 9                     ; byte CDEF..
    punpcklbw mm0, mm6                        ; byte->word CDEF
    pshufw    mm3, mm1, 0x94                  ; word ABBC
    pshufw    mm1, mm0, 0x94                  ; word CDDE
    pmaddwd   mm3, mm4                        ; multiply 2px with F0/F1
    movq      mm0, mm1                        ; backup for second set of pixels
    pmaddwd   mm1, mm5                        ; multiply 2px with F2/F3
    paddd     mm3, mm1                        ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6                        ; byte->word EFGH
    pmaddwd   mm0, mm4                        ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94                  ; word EFFG
    pmaddwd   mm1, mm5                        ; multiply 2px with F2/F3
    paddd     mm0, mm1                        ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0                        ; merge dword->word (4px)
    paddsw    mm3, mm7                        ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6                        ; clip and word->bytes
    movd   [dstq], mm3                        ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                             ; next row
    jg .nextrow
    REP_RET
; 4x4 block, H-only 6-tap filter
; Same word-pair pmaddwd scheme as the 4-tap version above, with an extra
; coefficient pair and a 4-byte top-up load to avoid reading past the row.
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]                     ; rounding constant for >>7
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]                    ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1                         ; byte ABCD..
    punpcklbw mm1, mm3                         ; byte->word ABCD
    pshufw    mm0, mm2, 0x9                    ; byte CDEF..
    punpckhbw mm2, mm3                         ; byte->word EFGH
    punpcklbw mm0, mm3                         ; byte->word CDEF
    pshufw    mm1, mm1, 0x94                   ; word ABBC
    pshufw    mm2, mm2, 0x94                   ; word EFFG
    pmaddwd   mm1, mm4                         ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94                   ; word CDDE
    movq      mm0, mm3                         ; backup for second set of pixels
    pmaddwd   mm3, mm5                         ; multiply 2px with F2/F3
    paddd     mm1, mm3                         ; add to 1st 2px cache
    movq      mm3, mm2                         ; backup for second set of pixels
    pmaddwd   mm2, mm6                         ; multiply 2px with F4/F5
    paddd     mm1, mm2                         ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]                    ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4                         ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5                         ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3                         ; add to 2nd 2px cache
    pxor      mm3, mm3                         ; mm3 was clobbered above; re-zero
    punpcklbw mm2, mm3                         ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9                   ; word GHHI
    pmaddwd   mm2, mm6                         ; multiply 2px with F4/F5
    paddd     mm0, mm2                         ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0                         ; merge dword->word (4px)
    paddsw    mm1, mm7                         ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3                         ; clip and word->bytes
    movd   [dstq], mm1                         ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                              ; next row
    jg .nextrow
    REP_RET
; 8-wide, H-only 4-tap filter (SSE2): one shifted 8-pixel load per tap,
; widened to words and weighted with pmullw. On x86-64 (m8 defined) all four
; coefficient vectors are cached in registers; on x86-32 the last two are
; re-read from memory each row.
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]                     ; rounding constant for >>7
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]                    ; tap 0 pixels
    movq      m1, [srcq-0]                    ; tap 1
    movq      m2, [srcq+1]                    ; tap 2
    movq      m3, [srcq+2]                    ; tap 3
    punpcklbw m0, m7                          ; widen all taps to words
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4                          ; round
    psraw     m0, 7
    packuswb  m0, m7                          ; clip to bytes
    movh  [dstq], m0                          ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                              ; next row
    jg .nextrow
    REP_RET
; 8-wide, H-only 6-tap filter (SSE2); same scheme as the 4-tap version with
; six shifted loads. x86-64 keeps all six coefficient vectors in m8-m13.
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4                           ; mx*48: 6 coefficient rows per filter
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]                     ; rounding constant for >>7
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]                    ; taps 0..5, one shifted load each
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7                          ; widen to words
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    ; sum mixed-sign products first to limit saturation error
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6                          ; round
    psraw     m0, 7
    packuswb  m0, m7                          ; clip to bytes
    movh  [dstq], m0                          ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd                              ; next row
    jg .nextrow
    REP_RET
; Vertical-only 4- and 6-tap subpel filters using word coefficients and
; pmullw. Rows are kept widened in registers and shifted down by one each
; iteration, so each output row needs only a single new load.
; %1 = block width (4 with MMX regs, 8 with XMM regs).
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]                     ; rounding constant for >>7
    pxor      m7, m7
    mova      m5, [myq+48]                    ; last tap, kept in a register

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+ srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7                          ; widen history rows to words
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq]         ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1                          ; shift row history down
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                             ; next row
    jg .nextrow
    REP_RET

; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]                     ; my*48: 6 coefficient rows per filter
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7                          ; widen history rows to words
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq]         ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1                          ; shift row history down
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd                             ; next row
    jg .nextrow
    REP_RET
%endmacro
; instantiate the vertical filters for 4-wide (MMX) and 8-wide (SSE2)
INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
; Bilinear (2-tap) subpel filters using word weights and pmullw:
; out = (a*(8-frac) + b*frac + 4) >> 3, done as >>2 followed by a rounding
; pavgw against zero (>>1 with rounding). Two rows per iteration.
; %1 = block width (4 with MMX regs, 8 with XMM regs).
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16] ; weight my
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16] ; weight 8-my
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1                            ; row 1 is shared by both outputs
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                            ; (x+1)>>1: completes rounded >>3
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16] ; weight mx
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16] ; weight 8-mx
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6                            ; (x+1)>>1: completes rounded >>3
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro
; instantiate bilinear filters for 4-wide (MMX) and 8-wide (SSE2)
INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
; SSSE3 bilinear filters: the two taps are applied in one pmaddubsw using
; interleaved pixel pairs and the byte weight pairs (8-frac, frac) from
; bilinear_filter_vb. %1 = block width (4 with MMX regs, 8 with XMM regs).
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16] ; byte pair (8-my, my)
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1                          ; interleave rows 0/1
    punpcklbw m1, m2                          ; interleave rows 1/2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                          ; rounded final >>1 (total >>3)
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]            ; pairs pixel i with i+1
    mova      m3, [bilinear_filter_vb+mxq-16] ; byte pair (8-mx, mx)
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4                          ; rounded final >>1 (total >>3)
    pavgw     m1, m4
%if mmsize==8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro
; instantiate SSSE3 bilinear filters for 4-wide (MMX) and 8-wide (XMM)
INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
; plain 8xH copy (no filtering), two rows per iteration
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
; plain 16xH copy via four 8-byte MMX moves; 32-bit-only fallback (64-bit
; always has the SSE version below)
%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif
; plain 16xH copy: unaligned loads (src may be unaligned), aligned stores
; (dst is assumed 16-byte aligned -- movaps would fault otherwise)
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------

; Add a signed DC value to 4 rows of pixels with unsigned saturation:
; paddusb adds the positive part (%1), psubusb subtracts the negative part
; (%2); exactly one of the two is nonzero per byte lane.
; %1 = +dc bytes, %2 = -dc bytes, %3 = byte offset, %4 = load/store op
; (movh/mova). Uses dst1q/dst2q/strideq from DEFINE_ARGS; clobbers m2-m5.
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd       m0, [blockq]

    ; calculate DC: dc = (block[0] + 4) >> 3; block[0] is cleared
    paddw      m0, [pw_4]
    pxor       m1, m1
    psraw      m0, 3
    movd [blockq], m1                        ; zero the DC coefficient
    psubw      m1, m0                        ; m1 = -dc (for psubusb path)
    packuswb   m0, m0                        ; clamp +dc / -dc to unsigned bytes
    packuswb   m1, m1
    punpcklbw  m0, m0                        ; replicate across 4 bytes
    punpcklbw  m1, m1
    punpcklwd  m0, m0
    punpcklwd  m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    ADD_DC     m0, m1, 0, movh
    RET
; SSE4 variant: gathers all four 4-byte rows into one XMM register, does the
; DC add in the word domain, and scatters rows back with pextrd.
INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd       m0, [blockq]
    pxor       m1, m1

    ; calculate DC
    paddw      m0, [pw_4]
    movd [blockq], m1                        ; zero the DC coefficient
    DEFINE_ARGS dst1, dst2, stride
    lea     dst2q, [dst1q+strideq*2]
    movd       m2, [dst1q]
    movd       m3, [dst1q+strideq]
    movd       m4, [dst2q]
    movd       m5, [dst2q+strideq]
    psraw      m0, 3                         ; dc = (block[0] + 4) >> 3
    pshuflw    m0, m0, 0                     ; broadcast dc to all 8 words
    punpcklqdq m0, m0
    punpckldq  m2, m3                        ; rows 0/1 in m2, rows 2/3 in m4
    punpckldq  m4, m5
    punpcklbw  m2, m1                        ; widen pixels to words
    punpcklbw  m4, m1
    paddw      m2, m0                        ; signed add, then pack clamps
    paddw      m4, m0
    packuswb   m2, m4
    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
; Adds the DC of 4 horizontally adjacent blocks to a 16-wide row strip.
; MMX is 8 bytes wide, so the four DCs are split into two registers
; (A,B in m0/m6 and C,D in m1/m7) and applied with two ADD_DC calls.
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC: per block, dc = (block[0] + 4) >> 3; DCs are cleared
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0            ; m6 = negated DCs
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    ADD_DC    m1, m7, 8, mova
    RET
%endif
; SSE2 variant of add4y: all 16 bytes fit in one XMM register, so a single
; ADD_DC covers the whole strip.
INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m1, m1

    ; calculate DC: per block, dc = (block[0] + 4) >> 3; DCs are cleared
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0            ; m1 = negated DCs
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0            ; expand each DC to 4 adjacent bytes
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m1, 0, mova
    RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
; Chroma layout: blocks A,B cover the top 8x4 rows and C,D the bottom ones,
; so the second ADD_DC is applied 4 lines further down instead of 8 bytes
; to the right (contrast with add4y above).
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC: per block, dc = (block[0] + 4) >> 3; DCs are cleared
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0            ; m6 = negated DCs
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC    m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]          ; move down to the C/D half
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC    m1, m7, 0, mova
    RET
;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
; (pmulhw by 20091 plus the original value gives *35468/65536 via the
; doubling below; %3/%4 are scratch registers)
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6 ;20091(1)
    pmulhw    %4, m6 ;20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7 ;35468(1)
    pmulhw    %2, m7 ;35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA            w, %3, %1, %5      ;t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6  ;t2, t3
    SUMSUB_BA            w, %4, %3, %5      ;tmp0, tmp3
    SUMSUB_BA            w, %2, %1, %5      ;tmp1, tmp2
    SWAP                %4, %1              ; restore output row order
    SWAP                %4, %3
%endmacro
; Full 4x4 inverse transform + add to dst: two 1-D passes with a transpose
; between them (row pass, columns pass), rounding folded in via pw_4, then
; results added to the destination pixels.
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq         m0, [blockq+ 0]
    movq         m1, [blockq+ 8]
    movq         m2, [blockq+16]
    movq         m3, [blockq+24]
    movq         m6, [pw_20091]
    movq         m7, [pw_17734]
%if cpuflag(sse)
    xorps      xmm0, xmm0                   ; clear coefficients, 16B at a time
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor         m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif
    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W             0, 1, 2, 3, 4
    paddw        m0, [pw_4]                 ; rounding before the final >>3
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W             0, 1, 2, 3, 4

    ; store
    pxor         m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea       dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
    RET
%endmacro
; instantiate vp8_idct_add for each supported instruction set
; (plain MMX only needed on x86-32; x86-64 always has SSE)
%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD
  1070. ;-----------------------------------------------------------------------------
  1071. ; void vp8_luma_dc_wht_mmxext(int16_t block[4][4][16], int16_t dc[16])
  1072. ;-----------------------------------------------------------------------------
; scatter one word from each lane of two mm registers (m%1/m%2) into the
; DC position (coefficient 0) of 8 of the 16 4x4 sub-blocks in 'block';
; each sub-block is 16 words, hence the 2*16-byte stride between targets.
; %3 is the starting sub-block column (0 or 2); dc1/dc2 are scratch GPRs.
%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w     ; lane 0 -> block row 0
    mov [blockq+2*16*(1+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    psrlq m%1, 32                      ; expose lanes 2/3 for the next movd
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w     ; lane 1 -> block row 1
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w     ; lane 2 -> block row 2
    mov [blockq+2*16*(9+%3)], dc2w
    shr  dc1d, 16
    shr  dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w    ; lane 3 -> block row 3
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro
; one 1-D 4-point Hadamard (Walsh) transform pass across four word
; registers, built from two butterfly stages; the SWAP restores the
; natural output ordering into %1..%4
%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro
; void vp8_luma_dc_wht_<opt>(int16_t block[4][4][16], int16_t dc[16])
; inverse Walsh-Hadamard transform of the 4x4 luma DC plane 'dc', with the
; results scattered into the DC position of each of the 16 luma sub-blocks;
; 'dc' is cleared in the process.
%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq          m0, [dc1q]
    movq          m1, [dc1q+8]
    movq          m2, [dc1q+16]
    movq          m3, [dc1q+24]
    ; clear the dc input block (two 16-byte stores when SSE is available)
%if cpuflag(sse)
    xorps       xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor          m4, m4
    movq  [dc1q+ 0], m4
    movq  [dc1q+ 8], m4
    movq  [dc1q+16], m4
    movq  [dc1q+24], m4
%endif
    ; 2-D transform: pass, transpose, rounding bias (pw_3 for the >>3
    ; below), pass, then arithmetic shift down by 3
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw         m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw         m0, 3
    psraw         m1, 3
    psraw         m2, 3
    psraw         m3, 3
    ; write the 16 results to the sub-blocks' DC slots
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro
; instantiate vp8_luma_dc_wht for each supported instruction set
%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT
  1133. ;-----------------------------------------------------------------------------
  1134. ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
  1135. ;-----------------------------------------------------------------------------
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    ; rows A..D are reached from %8 (buf+4*stride) via the negative stride,
    ; rows E..H from %8/%9 via the positive stride
    movd      m%1, [%8+%10*4]          ; A0-3
    movd      m%5, [%9+%10*4]          ; B0-3
    movd      m%2, [%8+%10*2]          ; C0-3
    movd      m%6, [%8+%10]            ; D0-3
    movd      m%3, [%8]                ; E0-3
    movd      m%7, [%9]                ; F0-3
    movd      m%4, [%9+%11]            ; G0-3
    punpcklbw m%1, m%5                 ; A/B interleaved
    movd      m%5, [%9+%11*2]          ; H0-3
    punpcklbw m%2, m%6                 ; C/D interleaved
    punpcklbw m%3, m%7                 ; E/F interleaved
    punpcklbw m%4, m%5                 ; G/H interleaved
%endmacro
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea       %12, [r0+8*r2]           ; %12 = buf+12*stride (bottom half)
    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]          ; A0-3
    movd      m%3, [%12+%10*4]         ; I0-3
    movd      m%2, [%8+%10*2]          ; C0-3
    movd      m%4, [%12+%10*2]         ; K0-3
    movd      m%6, [%8+%10]            ; D0-3
    movd      m%5, [%12+%10]           ; L0-3
    movd      m%7, [%12]               ; M0-3
    add       %12, %11                 ; advance to buf+13*stride
    punpcklbw m%1, m%3                 ; A/I
    movd      m%3, [%8]                ; E0-3
    punpcklbw m%2, m%4                 ; C/K
    punpcklbw m%6, m%5                 ; D/L
    punpcklbw m%3, m%7                 ; E/M
    punpcklbw m%2, m%6                 ; C/D/K/L interleaved
    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]          ; B0-3
    movd      m%4, [%12+%10*4]         ; J0-3
    movd      m%7, [%9]                ; F0-3
    movd      m%6, [%12]               ; N0-3
    punpcklbw m%5, m%4                 ; B/J
    punpcklbw m%7, m%6                 ; F/N
    punpcklbw m%1, m%5                 ; A/B/I/J interleaved
    punpcklbw m%3, m%7                 ; E/F/M/N interleaved
    movd      m%4, [%9+%11]            ; G0-3
    movd      m%6, [%12+%11]           ; O0-3
    movd      m%5, [%9+%11*2]          ; H0-3
    movd      m%7, [%12+%11*2]         ; P0-3
    punpcklbw m%4, m%6                 ; G/O
    punpcklbw m%5, m%7                 ; H/P
    punpcklbw m%4, m%5                 ; G/H/O/P interleaved
%endmacro
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register): low dwords first, then the high
    ; dwords after duplicating them down with punpckhdq
    movd      [%5+%7*4], m%1
    movd      [%5+%7*2], m%2
    movd      [%5],      m%3
    movd      [%6+%8],   m%4
    punpckhdq m%1, m%1                 ; move high dword into low half
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd      [%6+%7*4], m%1
    movd      [%5+%7],   m%2
    movd      [%6],      m%3
    movd      [%6+%8*2], m%4
%endmacro
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular registry in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd   [%5+%8*4], m%1
    movd   [%5],      m%2
    movd   [%7+%8*4], m%3
    movd   [%7],      m%4
    ; store dwords 1 (psrldq shifts the next dword into the low position)
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
    movd   [%6+%8*4], m%1
    movd   [%6],      m%2
%if %10 == 16
    movd   [%6+%9*4], m%3
%endif
    movd   [%7+%9],   m%4
    ; write dwords 2
    psrldq m%1, 4
    psrldq m%2, 4
%if %10 == 8
    ; two-buffer case: %3's dword goes through a GPR so it can be written
    ; to the other buffer after %7 has been advanced below
    movd   [%5+%8*2], m%1
    movd   %5d, m%3
%endif
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 16
    movd   [%5+%8*2], m%1
%endif
    movd   [%6+%9],   m%2
    movd   [%7+%8*2], m%3
    movd   [%7+%9*2], m%4
    add    %7, %9                      ; advance third pointer by +stride
    ; store dwords 3
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 8
    mov    [%7+%8*4], %5d              ; flush the dword saved via GPR above
    movd   [%6+%8*2], m%1
%else
    movd   [%5+%8],   m%1
%endif
    movd   [%6+%9*2], m%2
    movd   [%7+%8*2], m%3
    movd   [%7+%9*2], m%4
%endmacro
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
;
; mmx variant: extracts word pairs through a GPR (%3) two at a time,
; storing one 16-bit word per destination line; %4 is advanced as we go.
%macro WRITE_2x4W 6
    movd      %3d, %1
    punpckhdq %1, %1                   ; bring words 2/3 of %1 down
    mov       [%4+%5*4], %3w
    shr       %3, 16
    add       %4, %6
    mov       [%4+%5*4], %3w
    movd      %3d, %1
    add       %4, %5
    mov       [%4+%5*2], %3w
    shr       %3, 16
    mov       [%4+%5 ], %3w
    movd      %3d, %2
    punpckhdq %2, %2                   ; same for the second register
    mov       [%4 ], %3w
    shr       %3, 16
    mov       [%4+%6 ], %3w
    movd      %3d, %2
    add       %4, %6
    mov       [%4+%6 ], %3w
    shr       %3, 16
    mov       [%4+%6*2], %3w
    add       %4, %5
%endmacro
; write the 8 words of one xmm register (%1) as single words to 8 lines;
; %2/%3 are destination pointers (SSE4: 4th/5th line; pre-SSE4: %2 is a
; GPR we clobber, %3 the line pointer), %4/%5 are -stride/+stride
%macro WRITE_8W 5
%if cpuflag(sse4)
    ; SSE4: pextrw can store straight to memory, one word per line
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw [%3+%4 ], %1, 3
    pextrw [%3 ], %1, 4
    pextrw [%2 ], %1, 5
    pextrw [%2+%5 ], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    ; pre-SSE4: funnel word pairs through GPR %2, shifting the source
    ; register down one dword at a time
    movd   %2d, %1
    psrldq %1, 4
    mov    [%3+%4*4], %2w
    shr    %2, 16
    add    %3, %5
    mov    [%3+%4*4], %2w
    movd   %2d, %1
    psrldq %1, 4
    add    %3, %4
    mov    [%3+%4*2], %2w
    shr    %2, 16
    mov    [%3+%4 ], %2w
    movd   %2d, %1
    psrldq %1, 4
    mov    [%3 ], %2w
    shr    %2, 16
    mov    [%3+%5 ], %2w
    movd   %2d, %1
    add    %3, %5
    mov    [%3+%5 ], %2w
    shr    %2, 16
    mov    [%3+%5*2], %2w
%endif
%endmacro
; void vp8_<%1>_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim)
; %1 = filter direction (v = horizontal edge, h = vertical edge),
; %2 = number of GPRs to request in cglobal.
; The simple filter only touches p1/p0/q0/q1 and modifies p0/q0.
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    ; 8-byte registers only cover half the 16px edge; loop twice
    mov         cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor           m0, m0
%endif
    SPLATB_REG     m7, flim, m0        ; splat "flim" into register
    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    ; keep both +stride and -stride in registers
    mov       strideq, mstrideq
    neg      mstrideq
%ifidn %1, h
    ; vertical edge: point at 4 rows down, 2 pixels left of the edge
    lea         dst1q, [dst1q+4*strideq-2]
%endif
%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova           m0, [dst1q+mstrideq*2]    ; p1
    mova           m1, [dst1q+mstrideq]      ; p0
    mova           m2, [dst1q]               ; q0
    mova           m3, [dst1q+ strideq]      ; q1
%else ; h
    lea         dst2q, [dst1q+ strideq]
%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W  0, 1, 2, 3, 4
%endif
    ; simple_limit: build the per-pixel filter mask
    mova           m5, m2              ; m5=backup of q0
    mova           m6, m1              ; m6=backup of p0
    psubusb        m1, m2              ; p0-q0
    psubusb        m2, m6              ; q0-p0
    por            m1, m2              ; FFABS(p0-q0)
    paddusb        m1, m1              ; m1=FFABS(p0-q0)*2
    mova           m4, m3
    mova           m2, m0
    psubusb        m3, m0              ; q1-p1
    psubusb        m0, m4              ; p1-q1
    por            m3, m0              ; FFABS(p1-q1)
    mova           m0, [pb_80]
    pxor           m2, m0              ; bias to signed range
    pxor           m4, m0
    psubsb         m2, m4              ; m2=p1-q1 (signed) backup for below
    pand           m3, [pb_FE]         ; clear LSB so the shift can't carry
    psrlq          m3, 1               ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb        m3, m1
    psubusb        m3, m7
    pxor           m1, m1
    pcmpeqb        m3, m1              ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova           m4, m5
    pxor           m5, m0
    pxor           m0, m6
    psubsb         m5, m0              ; q0-p0 (signed)
    paddsb         m2, m5
    paddsb         m2, m5
    paddsb         m2, m5              ; a=(p1-q1) + 3*(q0-p0)
    pand           m2, m3              ; apply filter mask (m3)
    mova           m3, [pb_F8]
    mova           m1, m2
    paddsb         m2, [pb_4]          ; f1<<3=a+4
    paddsb         m1, [pb_3]          ; f2<<3=a+3
    pand           m2, m3
    pand           m1, m3              ; cache f2<<3
    ; apply f1 to q0; the +f/-f halves are handled separately because
    ; psrlq is an unsigned shift
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m2              ; which values are <0?
    psubb          m3, m2              ; -f1<<3
    psrlq          m2, 3               ; +f1
    psrlq          m3, 3               ; -f1
    pand           m3, m0
    pandn          m0, m2
    psubusb        m4, m0
    paddusb        m4, m3              ; q0-f1
    ; apply f2 to p0 the same way
    pxor           m0, m0
    pxor           m3, m3
    pcmpgtb        m0, m1              ; which values are <0?
    psubb          m3, m1              ; -f2<<3
    psrlq          m1, 3               ; +f2
    psrlq          m3, 3               ; -f2
    pand           m3, m0
    pandn          m0, m1
    paddusb        m6, m0
    psubusb        m6, m3              ; p0+f2
    ; store
%ifidn %1, v
    mova       [dst1q], m4
    mova [dst1q+mstrideq], m6
%else ; h
    inc         dst1q
    SBUTTERFLY     bw, 6, 4, 0         ; re-interleave p0/q0 pairs for store
%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc         dst2q
%endif
    WRITE_8W       m6, dst2q, dst1q, mstrideq, strideq
    lea         dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc         dst3q
%endif
    WRITE_8W       m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W     m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif
%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add         dst1q, 8               ; advance 8 cols = pixels
%else ; h
    lea         dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
    dec         cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro
; instantiate the simple loop filter for each instruction set; the second
; argument is the GPR count passed through to cglobal (the h variants need
; an extra pointer register)
%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif
INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5
  1486. ;-----------------------------------------------------------------------------
  1487. ; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
  1488. ; int flimE, int flimI, int hev_thr);
  1489. ;-----------------------------------------------------------------------------
; void vp8_<%1>_loop_filter<%2>_inner_<opt>(uint8_t *dst, [uint8_t *v,]
;                                          int stride, int flimE, int flimI,
;                                          int hev_thr)
; %1 = filter direction (v/h), %2 = edge width (16 = luma, 8 = chroma,
; where the chroma variant filters the U and V planes via dst/dst8).
; The inner filter reads p3..q3 and modifies p1/p0/q0/q1.
; On x86-64 with >=8 xmm registers the m_* names below alias registers;
; on x86-32 (or mmx) they alias stack slots, hence the stack_size setup.
%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ;               [3]=hev() result
%define stack_size mmsize * -4
%else ; h    ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
    pxor             m7, m7
%endif
%ifndef m8
    ; splat function arguments
    SPLATB_REG       m0, flimEq, m7    ; E
    SPLATB_REG       m1, flimIq, m7    ; I
    SPLATB_REG       m2, hevthrq, m7   ; hev_thresh
    ; note: maskres/p0backup and q0backup share slots; their lifetimes
    ; do not overlap
%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
    mova       m_flimE, m0
    mova       m_flimI, m1
    mova      m_hevthr, m2
%else
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_p0backup m12
%define m_q0backup m8
    ; splat function arguments
    SPLATB_REG m_flimE,  flimEq, m7    ; E
    SPLATB_REG m_flimI,  flimIq, m7    ; I
    SPLATB_REG m_hevthr, hevthrq, m7   ; hev_thresh
%endif
%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    ; 8-byte registers cover half the 16px edge; loop twice
    mov           cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    ; keep both +stride and -stride in registers
    mov         strideq, mstrideq
    neg        mstrideq
%ifidn %1, h
    ; vertical edge: 4 rows down, 4 pixels left of the edge
    lea           dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea           dst8q, [dst8q+strideq*4-4]
%endif
%endif
%if mmsize == 8
.next8px:
%endif
    ; read
    lea           dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
    ; chroma with xmm: U rows in the low half, V rows added via movhps
%define movrow movh
%else
%define movrow mova
%endif
    movrow           m0, [dst1q+mstrideq*4] ; p3
    movrow           m1, [dst2q+mstrideq*4] ; p2
    movrow           m2, [dst1q+mstrideq*2] ; p1
    movrow           m5, [dst2q]            ; q1
    movrow           m6, [dst2q+ strideq*1] ; q2
    movrow           m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps           m0, [dst8q+mstrideq*4]
    movhps           m2, [dst8q+mstrideq*2]
    add           dst8q, strideq
    movhps           m1, [dst8q+mstrideq*4]
    movhps           m5, [dst8q]
    movhps           m6, [dst8q+ strideq ]
    movhps           m7, [dst8q+ strideq*2]
    add           dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu             m0, [dst1q+mstrideq*4]
    movu             m1, [dst2q+mstrideq*4]
    movu             m2, [dst1q+mstrideq*2]
    movu             m3, [dst1q+mstrideq ]
    movu             m4, [dst1q]
    movu             m5, [dst2q]
    movu             m6, [dst2q+ strideq ]
    ; 8x8 transpose
    TRANSPOSE4x4B    0, 1, 2, 3, 7
    mova     m_q0backup, m1
    movu             m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B    4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1       ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1       ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1       ; q2/q3
    mova             m1, m_q0backup
    mova     m_q0backup, m2            ; store q0
    SBUTTERFLY       dq, 1, 5, 2       ; p1/p0
    mova     m_p0backup, m5            ; store p0
    ; rotate registers so m0..m7 = p3,p2,p1,q1,(q0 spilled),.,q2,q3
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%else ; sse2 (h)
%if %2 == 16
    lea           dst8q, [dst1q+ strideq*8]
%endif
    ; read 16 rows of 8px each, interleave
    movh             m0, [dst1q+mstrideq*4]
    movh             m1, [dst8q+mstrideq*4]
    movh             m2, [dst1q+mstrideq*2]
    movh             m5, [dst8q+mstrideq*2]
    movh             m3, [dst1q+mstrideq ]
    movh             m6, [dst8q+mstrideq ]
    movh             m4, [dst1q]
    movh             m7, [dst8q]
    punpcklbw        m0, m1            ; A/I
    punpcklbw        m2, m5            ; C/K
    punpcklbw        m3, m6            ; D/L
    punpcklbw        m4, m7            ; E/M
    add           dst8q, strideq
    movh             m1, [dst2q+mstrideq*4]
    movh             m6, [dst8q+mstrideq*4]
    movh             m5, [dst2q]
    movh             m7, [dst8q]
    punpcklbw        m1, m6            ; B/J
    punpcklbw        m5, m7            ; F/N
    movh             m6, [dst2q+ strideq ]
    movh             m7, [dst8q+ strideq ]
    punpcklbw        m6, m7            ; G/O
    ; 8x16 transpose
    TRANSPOSE4x4B    0, 1, 2, 3, 7
%ifdef m8
    SWAP              1, 8
%else
    mova     m_q0backup, m1
%endif
    movh             m7, [dst2q+ strideq*2]
    movh             m1, [dst8q+ strideq*2]
    punpcklbw        m7, m1            ; H/P
    TRANSPOSE4x4B    4, 5, 6, 7, 1
    SBUTTERFLY       dq, 0, 4, 1       ; p3/p2
    SBUTTERFLY       dq, 2, 6, 1       ; q0/q1
    SBUTTERFLY       dq, 3, 7, 1       ; q2/q3
%ifdef m8
    SWAP              1, 8
    SWAP              2, 8
%else
    mova             m1, m_q0backup
    mova     m_q0backup, m2            ; store q0
%endif
    SBUTTERFLY       dq, 1, 5, 2       ; p1/p0
%ifdef m12
    SWAP              5, 12
%else
    mova     m_p0backup, m5            ; store p0
%endif
    SWAP              1, 4
    SWAP              2, 4
    SWAP              6, 3
    SWAP              5, 3
%endif
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova             m4, m1
    SWAP              4, 1
    psubusb          m4, m0            ; p2-p3
    psubusb          m0, m1            ; p3-p2
    por              m0, m4            ; abs(p3-p2)
    mova             m4, m2
    SWAP              4, 2
    psubusb          m4, m1            ; p1-p2
    psubusb          m1, m2            ; p2-p1
    por              m1, m4            ; abs(p2-p1)
    mova             m4, m6
    SWAP              4, 6
    psubusb          m4, m7            ; q2-q3
    psubusb          m7, m6            ; q3-q2
    por              m7, m4            ; abs(q3-q2)
    mova             m4, m5
    SWAP              4, 5
    psubusb          m4, m6            ; q1-q2
    psubusb          m6, m5            ; q2-q1
    por              m6, m4            ; abs(q2-q1)
%if notcpuflag(mmxext)
    ; no pmaxub before mmxext: compare each difference against I separately
    mova             m4, m_flimI
    pxor             m3, m3
    psubusb          m0, m4
    psubusb          m1, m4
    psubusb          m7, m4
    psubusb          m6, m4
    pcmpeqb          m0, m3            ; abs(p3-p2) <= I
    pcmpeqb          m1, m3            ; abs(p2-p1) <= I
    pcmpeqb          m7, m3            ; abs(q3-q2) <= I
    pcmpeqb          m6, m3            ; abs(q2-q1) <= I
    pand             m0, m1
    pand             m7, m6
    pand             m0, m7
%else ; mmxext/sse2
    ; with pmaxub, collect the max difference and compare once later
    pmaxub           m0, m1
    pmaxub           m6, m7
    pmaxub           m0, m6
%endif
    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP              7, 3             ; now m7 is zero
%ifidn %1, v
    movrow           m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps           m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP              3, 12
%else
    mova             m3, m_p0backup
%endif
    mova             m1, m2
    SWAP              1, 2
    mova             m6, m3
    SWAP              3, 6
    psubusb          m1, m3            ; p1-p0
    psubusb          m6, m2            ; p0-p1
    por              m1, m6            ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova             m6, m1
    psubusb          m1, m4
    psubusb          m6, m_hevthr
    pcmpeqb          m1, m7            ; abs(p1-p0) <= I
    pcmpeqb          m6, m7            ; abs(p1-p0) <= hev_thresh
    pand             m0, m1
    mova      m_maskres, m6
%else ; mmxext/sse2
    pmaxub           m0, m1            ; max_I
    SWAP              1, 4             ; max_hev_thresh
%endif
    SWAP              6, 4             ; now m6 is I
%ifidn %1, v
    movrow           m4, [dst1q]       ; q0
%if mmsize == 16 && %2 == 8
    movhps           m4, [dst8q]
%endif
%elifdef m8
    SWAP              4, 8
%else
    mova             m4, m_q0backup
%endif
    mova             m1, m4
    SWAP              1, 4
    mova             m7, m5
    SWAP              7, 5
    psubusb          m1, m5            ; q0-q1
    psubusb          m7, m4            ; q1-q0
    por              m1, m7            ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova             m7, m1
    psubusb          m1, m6
    psubusb          m7, m_hevthr
    pxor             m6, m6
    pcmpeqb          m1, m6            ; abs(q1-q0) <= I
    pcmpeqb          m7, m6            ; abs(q1-q0) <= hev_thresh
    mova             m6, m_maskres
    pand             m0, m1            ; abs([pq][321]-[pq][210]) <= I
    pand             m6, m7
%else ; mmxext/sse2
    pxor             m7, m7
    pmaxub           m0, m1
    pmaxub           m6, m1
    psubusb          m0, m_flimI
    psubusb          m6, m_hevthr
    pcmpeqb          m0, m7            ; max(abs(..)) <= I
    pcmpeqb          m6, m7            ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP              6, 12
%else
    mova      m_maskres, m6            ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
    ; simple_limit
    mova             m1, m3
    SWAP              1, 3
    mova             m6, m4            ; keep copies of p0/q0 around for later use
    SWAP              6, 4
    psubusb          m1, m4            ; p0-q0
    psubusb          m6, m3            ; q0-p0
    por              m1, m6            ; abs(q0-p0)
    paddusb          m1, m1            ; m1=2*abs(q0-p0)
    mova             m7, m2
    SWAP              7, 2
    mova             m6, m5
    SWAP              6, 5
    psubusb          m7, m5            ; p1-q1
    psubusb          m6, m2            ; q1-p1
    por              m7, m6            ; abs(q1-p1)
    pxor             m6, m6
    pand             m7, [pb_FE]       ; clear LSB so the shift can't carry
    psrlq            m7, 1             ; abs(q1-p1)/2
    paddusb          m7, m1            ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb          m7, m_flimE
    pcmpeqb          m7, m6            ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand             m0, m7            ; normal_limit result
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova             m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova             m1, m4
    mova             m7, m3
    pxor             m1, m_pb_80       ; bias to signed range
    pxor             m7, m_pb_80
    psubsb           m1, m7            ; (signed) q0-p0
    mova             m6, m2
    mova             m7, m5
    pxor             m6, m_pb_80
    pxor             m7, m_pb_80
    psubsb           m6, m7            ; (signed) p1-q1
    mova             m7, m_maskres
    pandn            m7, m6            ; p1-q1 only where hev() is false
    paddsb           m7, m1
    paddsb           m7, m1
    paddsb           m7, m1            ; 3*(q0-p0)+is4tap?(p1-q1)
    pand             m7, m0
    mova             m1, [pb_F8]
    mova             m6, m7
    paddsb           m7, [pb_3]
    paddsb           m6, [pb_4]
    pand             m7, m1
    pand             m6, m1
    ; apply f2 to p0; +f/-f handled separately since psrlq is unsigned
    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m1, m7
    psubb            m0, m7
    psrlq            m7, 3             ; +f2
    psrlq            m0, 3             ; -f2
    pand             m0, m1
    pandn            m1, m7
    psubusb          m3, m0
    paddusb          m3, m1            ; p0+f2
    ; apply f1 to q0 the same way
    pxor             m1, m1
    pxor             m0, m0
    pcmpgtb          m0, m6
    psubb            m1, m6
    psrlq            m6, 3             ; +f1
    psrlq            m1, 3             ; -f1
    pand             m1, m0
    pandn            m0, m6
    psubusb          m4, m0
    paddusb          m4, m1            ; q0-f1
%ifdef m12
    SWAP              6, 12
%else
    mova             m6, m_maskres
%endif
%if notcpuflag(mmxext)
    mova             m7, [pb_1]
%else ; mmxext/sse2
    pxor             m7, m7
%endif
    ; outer-tap adjustment a = (f1+1)>>1, applied to p1/q1 where !hev
    pand             m0, m6
    pand             m1, m6
%if notcpuflag(mmxext)
    ; no pavgb: emulate the rounded average with add/shift
    paddusb          m0, m7
    pand             m1, [pb_FE]
    pandn            m7, m0
    psrlq            m1, 1
    psrlq            m7, 1
    SWAP              0, 7
%else ; mmxext/sse2
    psubusb          m1, [pb_1]
    pavgb            m0, m7            ; a
    pavgb            m1, m7            ; -a
%endif
    psubusb          m5, m0
    psubusb          m2, m1
    paddusb          m5, m1            ; q1-a
    paddusb          m2, m0            ; p1+a
    ; store
%ifidn %1, v
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq ], m3
    movrow [dst1q], m4
    movrow [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
    movhps [dst8q+mstrideq*2], m2
    movhps [dst8q+mstrideq ], m3
    movhps [dst8q], m4
    movhps [dst8q+ strideq ], m5
%endif
%else ; h
    add           dst1q, 2
    add           dst2q, 2
    ; 4x8/16 transpose
    TRANSPOSE4x4B    2, 3, 4, 5, 6
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D       2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
    lea           dst8q, [dst8q+mstrideq +2]
    WRITE_4x4D       2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif
%if mmsize == 8
%if %2 == 8 ; chroma
    ; second pass filters the V plane (dst8); cmp sets ZF on the second
    ; iteration (dst1 already == dst8), ending the loop
%ifidn %1, h
    sub           dst1q, 2
%endif
    cmp           dst1q, dst8q
    mov           dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea           dst1q, [dst1q+ strideq*8-2]
%else ; v
    add           dst1q, 8
%endif
    dec           cntrq
    jg .next8px
%endif
    REP_RET
%else ; mmsize == 16
    RET
%endif
%endmacro
; instantiate the inner loop filter (v/h x luma/chroma) for each
; instruction set; mmx/mmxext variants are only needed on x86-32
%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
INIT_MMX mmxext
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
%endif
INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
  1941. ;-----------------------------------------------------------------------------
  1942. ; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
  1943. ; int flimE, int flimI, int hev_thr);
  1944. ;-----------------------------------------------------------------------------
  1945. %macro MBEDGE_LOOPFILTER 2
  1946. %define stack_size 0
  1947. %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
  1948. %if mmsize == 16 ; [3]=hev() result
  1949. ; [4]=filter tmp result
  1950. ; [5]/[6] = p2/q2 backup
  1951. ; [7]=lim_res sign result
  1952. %define stack_size mmsize * -7
  1953. %else ; 8 ; extra storage space for transposes
  1954. %define stack_size mmsize * -8
  1955. %endif
  1956. %endif
  1957. %if %2 == 8 ; chroma
  1958. cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
  1959. %else ; luma
  1960. cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
  1961. %endif
  1962. %if cpuflag(ssse3)
  1963. pxor m7, m7
  1964. %endif
  1965. %ifndef m8
  1966. ; splat function arguments
  1967. SPLATB_REG m0, flimEq, m7 ; E
  1968. SPLATB_REG m1, flimIq, m7 ; I
  1969. SPLATB_REG m2, hevthrq, m7 ; hev_thresh
  1970. %define m_flimE [rsp]
  1971. %define m_flimI [rsp+mmsize]
  1972. %define m_hevthr [rsp+mmsize*2]
  1973. %define m_maskres [rsp+mmsize*3]
  1974. %define m_limres [rsp+mmsize*4]
  1975. %define m_p0backup [rsp+mmsize*3]
  1976. %define m_q0backup [rsp+mmsize*4]
  1977. %define m_p2backup [rsp+mmsize*5]
  1978. %define m_q2backup [rsp+mmsize*6]
  1979. %if mmsize == 16
  1980. %define m_limsign [rsp]
  1981. %else
  1982. %define m_limsign [rsp+mmsize*7]
  1983. %endif
  1984. mova m_flimE, m0
  1985. mova m_flimI, m1
  1986. mova m_hevthr, m2
  1987. %else ; sse2 on x86-64
  1988. %define m_flimE m9
  1989. %define m_flimI m10
  1990. %define m_hevthr m11
  1991. %define m_maskres m12
  1992. %define m_limres m8
  1993. %define m_p0backup m12
  1994. %define m_q0backup m8
  1995. %define m_p2backup m13
  1996. %define m_q2backup m14
  1997. %define m_limsign m9
  1998. ; splat function arguments
  1999. SPLATB_REG m_flimE, flimEq, m7 ; E
  2000. SPLATB_REG m_flimI, flimIq, m7 ; I
  2001. SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
  2002. %endif
  2003. %if %2 == 8 ; chroma
  2004. DEFINE_ARGS dst1, dst8, mstride, stride, dst2
  2005. %elif mmsize == 8
  2006. DEFINE_ARGS dst1, mstride, stride, dst2, cntr
  2007. mov cntrq, 2
  2008. %else
  2009. DEFINE_ARGS dst1, mstride, stride, dst2, dst8
  2010. %endif
  2011. mov strideq, mstrideq
  2012. neg mstrideq
  2013. %ifidn %1, h
  2014. lea dst1q, [dst1q+strideq*4-4]
  2015. %if %2 == 8 ; chroma
  2016. lea dst8q, [dst8q+strideq*4-4]
  2017. %endif
  2018. %endif
  2019. %if mmsize == 8
  2020. .next8px:
  2021. %endif
  2022. ; read
  2023. lea dst2q, [dst1q+ strideq ]
  2024. %ifidn %1, v
  2025. %if %2 == 8 && mmsize == 16
  2026. %define movrow movh
  2027. %else
  2028. %define movrow mova
  2029. %endif
  2030. movrow m0, [dst1q+mstrideq*4] ; p3
  2031. movrow m1, [dst2q+mstrideq*4] ; p2
  2032. movrow m2, [dst1q+mstrideq*2] ; p1
  2033. movrow m5, [dst2q] ; q1
  2034. movrow m6, [dst2q+ strideq ] ; q2
  2035. movrow m7, [dst2q+ strideq*2] ; q3
  2036. %if mmsize == 16 && %2 == 8
  2037. movhps m0, [dst8q+mstrideq*4]
  2038. movhps m2, [dst8q+mstrideq*2]
  2039. add dst8q, strideq
  2040. movhps m1, [dst8q+mstrideq*4]
  2041. movhps m5, [dst8q]
  2042. movhps m6, [dst8q+ strideq ]
  2043. movhps m7, [dst8q+ strideq*2]
  2044. add dst8q, mstrideq
  2045. %endif
  2046. %elif mmsize == 8 ; mmx/mmxext (h)
  2047. ; read 8 rows of 8px each
  2048. movu m0, [dst1q+mstrideq*4]
  2049. movu m1, [dst2q+mstrideq*4]
  2050. movu m2, [dst1q+mstrideq*2]
  2051. movu m3, [dst1q+mstrideq ]
  2052. movu m4, [dst1q]
  2053. movu m5, [dst2q]
  2054. movu m6, [dst2q+ strideq ]
  2055. ; 8x8 transpose
  2056. TRANSPOSE4x4B 0, 1, 2, 3, 7
  2057. mova m_q0backup, m1
  2058. movu m7, [dst2q+ strideq*2]
  2059. TRANSPOSE4x4B 4, 5, 6, 7, 1
  2060. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  2061. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  2062. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  2063. mova m1, m_q0backup
  2064. mova m_q0backup, m2 ; store q0
  2065. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  2066. mova m_p0backup, m5 ; store p0
  2067. SWAP 1, 4
  2068. SWAP 2, 4
  2069. SWAP 6, 3
  2070. SWAP 5, 3
  2071. %else ; sse2 (h)
  2072. %if %2 == 16
  2073. lea dst8q, [dst1q+ strideq*8 ]
  2074. %endif
  2075. ; read 16 rows of 8px each, interleave
  2076. movh m0, [dst1q+mstrideq*4]
  2077. movh m1, [dst8q+mstrideq*4]
  2078. movh m2, [dst1q+mstrideq*2]
  2079. movh m5, [dst8q+mstrideq*2]
  2080. movh m3, [dst1q+mstrideq ]
  2081. movh m6, [dst8q+mstrideq ]
  2082. movh m4, [dst1q]
  2083. movh m7, [dst8q]
  2084. punpcklbw m0, m1 ; A/I
  2085. punpcklbw m2, m5 ; C/K
  2086. punpcklbw m3, m6 ; D/L
  2087. punpcklbw m4, m7 ; E/M
  2088. add dst8q, strideq
  2089. movh m1, [dst2q+mstrideq*4]
  2090. movh m6, [dst8q+mstrideq*4]
  2091. movh m5, [dst2q]
  2092. movh m7, [dst8q]
  2093. punpcklbw m1, m6 ; B/J
  2094. punpcklbw m5, m7 ; F/N
  2095. movh m6, [dst2q+ strideq ]
  2096. movh m7, [dst8q+ strideq ]
  2097. punpcklbw m6, m7 ; G/O
  2098. ; 8x16 transpose
  2099. TRANSPOSE4x4B 0, 1, 2, 3, 7
  2100. %ifdef m8
  2101. SWAP 1, 8
  2102. %else
  2103. mova m_q0backup, m1
  2104. %endif
  2105. movh m7, [dst2q+ strideq*2]
  2106. movh m1, [dst8q+ strideq*2]
  2107. punpcklbw m7, m1 ; H/P
  2108. TRANSPOSE4x4B 4, 5, 6, 7, 1
  2109. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  2110. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  2111. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  2112. %ifdef m8
  2113. SWAP 1, 8
  2114. SWAP 2, 8
  2115. %else
  2116. mova m1, m_q0backup
  2117. mova m_q0backup, m2 ; store q0
  2118. %endif
  2119. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  2120. %ifdef m12
  2121. SWAP 5, 12
  2122. %else
  2123. mova m_p0backup, m5 ; store p0
  2124. %endif
  2125. SWAP 1, 4
  2126. SWAP 2, 4
  2127. SWAP 6, 3
  2128. SWAP 5, 3
  2129. %endif
  2130. ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
  2131. mova m4, m1
  2132. SWAP 4, 1
  2133. psubusb m4, m0 ; p2-p3
  2134. psubusb m0, m1 ; p3-p2
  2135. por m0, m4 ; abs(p3-p2)
  2136. mova m4, m2
  2137. SWAP 4, 2
  2138. psubusb m4, m1 ; p1-p2
  2139. mova m_p2backup, m1
  2140. psubusb m1, m2 ; p2-p1
  2141. por m1, m4 ; abs(p2-p1)
  2142. mova m4, m6
  2143. SWAP 4, 6
  2144. psubusb m4, m7 ; q2-q3
  2145. psubusb m7, m6 ; q3-q2
  2146. por m7, m4 ; abs(q3-q2)
  2147. mova m4, m5
  2148. SWAP 4, 5
  2149. psubusb m4, m6 ; q1-q2
  2150. mova m_q2backup, m6
  2151. psubusb m6, m5 ; q2-q1
  2152. por m6, m4 ; abs(q2-q1)
  2153. %if notcpuflag(mmxext)
  2154. mova m4, m_flimI
  2155. pxor m3, m3
  2156. psubusb m0, m4
  2157. psubusb m1, m4
  2158. psubusb m7, m4
  2159. psubusb m6, m4
  2160. pcmpeqb m0, m3 ; abs(p3-p2) <= I
  2161. pcmpeqb m1, m3 ; abs(p2-p1) <= I
  2162. pcmpeqb m7, m3 ; abs(q3-q2) <= I
  2163. pcmpeqb m6, m3 ; abs(q2-q1) <= I
  2164. pand m0, m1
  2165. pand m7, m6
  2166. pand m0, m7
  2167. %else ; mmxext/sse2
  2168. pmaxub m0, m1
  2169. pmaxub m6, m7
  2170. pmaxub m0, m6
  2171. %endif
  2172. ; normal_limit and high_edge_variance for p1-p0, q1-q0
  2173. SWAP 7, 3 ; now m7 is zero
  2174. %ifidn %1, v
  2175. movrow m3, [dst1q+mstrideq ] ; p0
  2176. %if mmsize == 16 && %2 == 8
  2177. movhps m3, [dst8q+mstrideq ]
  2178. %endif
  2179. %elifdef m12
  2180. SWAP 3, 12
  2181. %else
  2182. mova m3, m_p0backup
  2183. %endif
  2184. mova m1, m2
  2185. SWAP 1, 2
  2186. mova m6, m3
  2187. SWAP 3, 6
  2188. psubusb m1, m3 ; p1-p0
  2189. psubusb m6, m2 ; p0-p1
  2190. por m1, m6 ; abs(p1-p0)
  2191. %if notcpuflag(mmxext)
  2192. mova m6, m1
  2193. psubusb m1, m4
  2194. psubusb m6, m_hevthr
  2195. pcmpeqb m1, m7 ; abs(p1-p0) <= I
  2196. pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
  2197. pand m0, m1
  2198. mova m_maskres, m6
  2199. %else ; mmxext/sse2
  2200. pmaxub m0, m1 ; max_I
  2201. SWAP 1, 4 ; max_hev_thresh
  2202. %endif
  2203. SWAP 6, 4 ; now m6 is I
  2204. %ifidn %1, v
  2205. movrow m4, [dst1q] ; q0
  2206. %if mmsize == 16 && %2 == 8
  2207. movhps m4, [dst8q]
  2208. %endif
  2209. %elifdef m8
  2210. SWAP 4, 8
  2211. %else
  2212. mova m4, m_q0backup
  2213. %endif
  2214. mova m1, m4
  2215. SWAP 1, 4
  2216. mova m7, m5
  2217. SWAP 7, 5
  2218. psubusb m1, m5 ; q0-q1
  2219. psubusb m7, m4 ; q1-q0
  2220. por m1, m7 ; abs(q1-q0)
  2221. %if notcpuflag(mmxext)
  2222. mova m7, m1
  2223. psubusb m1, m6
  2224. psubusb m7, m_hevthr
  2225. pxor m6, m6
  2226. pcmpeqb m1, m6 ; abs(q1-q0) <= I
  2227. pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
  2228. mova m6, m_maskres
  2229. pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
  2230. pand m6, m7
  2231. %else ; mmxext/sse2
  2232. pxor m7, m7
  2233. pmaxub m0, m1
  2234. pmaxub m6, m1
  2235. psubusb m0, m_flimI
  2236. psubusb m6, m_hevthr
  2237. pcmpeqb m0, m7 ; max(abs(..)) <= I
  2238. pcmpeqb m6, m7 ; !(max(abs..) > thresh)
  2239. %endif
  2240. %ifdef m12
  2241. SWAP 6, 12
  2242. %else
  2243. mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
  2244. %endif
  2245. ; simple_limit
  2246. mova m1, m3
  2247. SWAP 1, 3
  2248. mova m6, m4 ; keep copies of p0/q0 around for later use
  2249. SWAP 6, 4
  2250. psubusb m1, m4 ; p0-q0
  2251. psubusb m6, m3 ; q0-p0
  2252. por m1, m6 ; abs(q0-p0)
  2253. paddusb m1, m1 ; m1=2*abs(q0-p0)
  2254. mova m7, m2
  2255. SWAP 7, 2
  2256. mova m6, m5
  2257. SWAP 6, 5
  2258. psubusb m7, m5 ; p1-q1
  2259. psubusb m6, m2 ; q1-p1
  2260. por m7, m6 ; abs(q1-p1)
  2261. pxor m6, m6
  2262. pand m7, [pb_FE]
  2263. psrlq m7, 1 ; abs(q1-p1)/2
  2264. paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
  2265. psubusb m7, m_flimE
  2266. pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
  2267. pand m0, m7 ; normal_limit result
  2268. ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
  2269. %ifdef m8 ; x86-64 && sse2
  2270. mova m8, [pb_80]
  2271. %define m_pb_80 m8
  2272. %else ; x86-32 or mmx/mmxext
  2273. %define m_pb_80 [pb_80]
  2274. %endif
  2275. mova m1, m4
  2276. mova m7, m3
  2277. pxor m1, m_pb_80
  2278. pxor m7, m_pb_80
  2279. psubsb m1, m7 ; (signed) q0-p0
  2280. mova m6, m2
  2281. mova m7, m5
  2282. pxor m6, m_pb_80
  2283. pxor m7, m_pb_80
  2284. psubsb m6, m7 ; (signed) p1-q1
  2285. mova m7, m_maskres
  2286. paddsb m6, m1
  2287. paddsb m6, m1
  2288. paddsb m6, m1
  2289. pand m6, m0
  2290. %ifdef m8
  2291. mova m_limres, m6 ; 3*(qp-p0)+(p1-q1) masked for filter_mbedge
  2292. pand m_limres, m7
  2293. %else
  2294. mova m0, m6
  2295. pand m0, m7
  2296. mova m_limres, m0
  2297. %endif
  2298. pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
  2299. mova m1, [pb_F8]
  2300. mova m6, m7
  2301. paddsb m7, [pb_3]
  2302. paddsb m6, [pb_4]
  2303. pand m7, m1
  2304. pand m6, m1
  2305. pxor m1, m1
  2306. pxor m0, m0
  2307. pcmpgtb m1, m7
  2308. psubb m0, m7
  2309. psrlq m7, 3 ; +f2
  2310. psrlq m0, 3 ; -f2
  2311. pand m0, m1
  2312. pandn m1, m7
  2313. psubusb m3, m0
  2314. paddusb m3, m1 ; p0+f2
  2315. pxor m1, m1
  2316. pxor m0, m0
  2317. pcmpgtb m0, m6
  2318. psubb m1, m6
  2319. psrlq m6, 3 ; +f1
  2320. psrlq m1, 3 ; -f1
  2321. pand m1, m0
  2322. pandn m0, m6
  2323. psubusb m4, m0
  2324. paddusb m4, m1 ; q0-f1
  2325. ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
  2326. %if cpuflag(ssse3)
  2327. mova m7, [pb_1]
  2328. %else
  2329. mova m7, [pw_63]
  2330. %endif
  2331. %ifdef m8
  2332. SWAP 1, 8
  2333. %else
  2334. mova m1, m_limres
  2335. %endif
  2336. pxor m0, m0
  2337. mova m6, m1
  2338. pcmpgtb m0, m1 ; which are negative
  2339. %if cpuflag(ssse3)
  2340. punpcklbw m6, m7 ; interleave with "1" for rounding
  2341. punpckhbw m1, m7
  2342. %else
  2343. punpcklbw m6, m0 ; signed byte->word
  2344. punpckhbw m1, m0
  2345. %endif
  2346. mova m_limsign, m0
  2347. %if cpuflag(ssse3)
  2348. mova m7, [pb_27_63]
  2349. %ifndef m8
  2350. mova m_limres, m1
  2351. %endif
  2352. %ifdef m10
  2353. SWAP 0, 10 ; don't lose lim_sign copy
  2354. %endif
  2355. mova m0, m7
  2356. pmaddubsw m7, m6
  2357. SWAP 6, 7
  2358. pmaddubsw m0, m1
  2359. SWAP 1, 0
  2360. %ifdef m10
  2361. SWAP 0, 10
  2362. %else
  2363. mova m0, m_limsign
  2364. %endif
  2365. %else
  2366. mova m_maskres, m6 ; backup for later in filter
  2367. mova m_limres, m1
  2368. pmullw m6, [pw_27]
  2369. pmullw m1, [pw_27]
  2370. paddw m6, m7
  2371. paddw m1, m7
  2372. %endif
  2373. psraw m6, 7
  2374. psraw m1, 7
  2375. packsswb m6, m1 ; a0
  2376. pxor m1, m1
  2377. psubb m1, m6
  2378. pand m1, m0 ; -a0
  2379. pandn m0, m6 ; +a0
  2380. %if cpuflag(ssse3)
  2381. mova m6, [pb_18_63] ; pipelining
  2382. %endif
  2383. psubusb m3, m1
  2384. paddusb m4, m1
  2385. paddusb m3, m0 ; p0+a0
  2386. psubusb m4, m0 ; q0-a0
  2387. %if cpuflag(ssse3)
  2388. SWAP 6, 7
  2389. %ifdef m10
  2390. SWAP 1, 10
  2391. %else
  2392. mova m1, m_limres
  2393. %endif
  2394. mova m0, m7
  2395. pmaddubsw m7, m6
  2396. SWAP 6, 7
  2397. pmaddubsw m0, m1
  2398. SWAP 1, 0
  2399. %ifdef m10
  2400. SWAP 0, 10
  2401. %endif
  2402. mova m0, m_limsign
  2403. %else
  2404. mova m6, m_maskres
  2405. mova m1, m_limres
  2406. pmullw m6, [pw_18]
  2407. pmullw m1, [pw_18]
  2408. paddw m6, m7
  2409. paddw m1, m7
  2410. %endif
  2411. mova m0, m_limsign
  2412. psraw m6, 7
  2413. psraw m1, 7
  2414. packsswb m6, m1 ; a1
  2415. pxor m1, m1
  2416. psubb m1, m6
  2417. pand m1, m0 ; -a1
  2418. pandn m0, m6 ; +a1
  2419. %if cpuflag(ssse3)
  2420. mova m6, [pb_9_63]
  2421. %endif
  2422. psubusb m2, m1
  2423. paddusb m5, m1
  2424. paddusb m2, m0 ; p1+a1
  2425. psubusb m5, m0 ; q1-a1
  2426. %if cpuflag(ssse3)
  2427. SWAP 6, 7
  2428. %ifdef m10
  2429. SWAP 1, 10
  2430. %else
  2431. mova m1, m_limres
  2432. %endif
  2433. mova m0, m7
  2434. pmaddubsw m7, m6
  2435. SWAP 6, 7
  2436. pmaddubsw m0, m1
  2437. SWAP 1, 0
  2438. %else
  2439. %ifdef m8
  2440. SWAP 6, 12
  2441. SWAP 1, 8
  2442. %else
  2443. mova m6, m_maskres
  2444. mova m1, m_limres
  2445. %endif
  2446. pmullw m6, [pw_9]
  2447. pmullw m1, [pw_9]
  2448. paddw m6, m7
  2449. paddw m1, m7
  2450. %endif
  2451. %ifdef m9
  2452. SWAP 7, 9
  2453. %else
  2454. mova m7, m_limsign
  2455. %endif
  2456. psraw m6, 7
  2457. psraw m1, 7
  2458. packsswb m6, m1 ; a1
  2459. pxor m0, m0
  2460. psubb m0, m6
  2461. pand m0, m7 ; -a1
  2462. pandn m7, m6 ; +a1
  2463. %ifdef m8
  2464. SWAP 1, 13
  2465. SWAP 6, 14
  2466. %else
  2467. mova m1, m_p2backup
  2468. mova m6, m_q2backup
  2469. %endif
  2470. psubusb m1, m0
  2471. paddusb m6, m0
  2472. paddusb m1, m7 ; p1+a1
  2473. psubusb m6, m7 ; q1-a1
  2474. ; store
  2475. %ifidn %1, v
  2476. movrow [dst2q+mstrideq*4], m1
  2477. movrow [dst1q+mstrideq*2], m2
  2478. movrow [dst1q+mstrideq ], m3
  2479. movrow [dst1q], m4
  2480. movrow [dst2q], m5
  2481. movrow [dst2q+ strideq ], m6
  2482. %if mmsize == 16 && %2 == 8
  2483. add dst8q, mstrideq
  2484. movhps [dst8q+mstrideq*2], m1
  2485. movhps [dst8q+mstrideq ], m2
  2486. movhps [dst8q], m3
  2487. add dst8q, strideq
  2488. movhps [dst8q], m4
  2489. movhps [dst8q+ strideq ], m5
  2490. movhps [dst8q+ strideq*2], m6
  2491. %endif
  2492. %else ; h
  2493. inc dst1q
  2494. inc dst2q
  2495. ; 4x8/16 transpose
  2496. TRANSPOSE4x4B 1, 2, 3, 4, 0
  2497. SBUTTERFLY bw, 5, 6, 0
  2498. %if mmsize == 8 ; mmx/mmxext (h)
  2499. WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
  2500. add dst1q, 4
  2501. WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
  2502. %else ; sse2 (h)
  2503. lea dst8q, [dst8q+mstrideq+1]
  2504. WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
  2505. lea dst1q, [dst2q+mstrideq+4]
  2506. lea dst8q, [dst8q+mstrideq+4]
  2507. %if cpuflag(sse4)
  2508. add dst2q, 4
  2509. %endif
  2510. WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
  2511. %if cpuflag(sse4)
  2512. lea dst2q, [dst8q+ strideq ]
  2513. %endif
  2514. WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
  2515. %endif
  2516. %endif
  2517. %if mmsize == 8
  2518. %if %2 == 8 ; chroma
  2519. %ifidn %1, h
  2520. sub dst1q, 5
  2521. %endif
  2522. cmp dst1q, dst8q
  2523. mov dst1q, dst8q
  2524. jnz .next8px
  2525. %else
  2526. %ifidn %1, h
  2527. lea dst1q, [dst1q+ strideq*8-5]
  2528. %else ; v
  2529. add dst1q, 8
  2530. %endif
  2531. dec cntrq
  2532. jg .next8px
  2533. %endif
  2534. REP_RET
  2535. %else ; mmsize == 16
  2536. RET
  2537. %endif
  2538. %endmacro
  2539. %if ARCH_X86_32
; x86-32 only: emit MMX and MMXEXT builds of the macroblock-edge loop filter.
; Each MBEDGE_LOOPFILTER invocation expands to one function; the first
; argument is the filter direction (v = vertical edge rows, h = horizontal
; edge columns), the second is the block width in pixels (16 = luma
; macroblock edge, 8 = chroma edge).
  2540. INIT_MMX mmx
  2541. MBEDGE_LOOPFILTER v, 16
  2542. MBEDGE_LOOPFILTER h, 16
  2543. MBEDGE_LOOPFILTER v, 8
  2544. MBEDGE_LOOPFILTER h, 8
  2545. INIT_MMX mmxext
  2546. MBEDGE_LOOPFILTER v, 16
  2547. MBEDGE_LOOPFILTER h, 16
  2548. MBEDGE_LOOPFILTER v, 8
  2549. MBEDGE_LOOPFILTER h, 8
  2550. %endif
; All architectures: full v/h x 16/8 sets for SSE2 and SSSE3
; (128-bit XMM registers, so 16px edges are filtered in one pass).
  2551. INIT_XMM sse2
  2552. MBEDGE_LOOPFILTER v, 16
  2553. MBEDGE_LOOPFILTER h, 16
  2554. MBEDGE_LOOPFILTER v, 8
  2555. MBEDGE_LOOPFILTER h, 8
  2556. INIT_XMM ssse3
  2557. MBEDGE_LOOPFILTER v, 16
  2558. MBEDGE_LOOPFILTER h, 16
  2559. MBEDGE_LOOPFILTER v, 8
  2560. MBEDGE_LOOPFILTER h, 8
; SSE4 is only instantiated for h: the macro's cpuflag(sse4) conditionals
; appear solely in the horizontal store path (around WRITE_8W), so the v
; variants would be identical to the SSSE3 ones and are not rebuilt.
  2561. INIT_XMM sse4
  2562. MBEDGE_LOOPFILTER h, 16
  2563. MBEDGE_LOOPFILTER h, 8