;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA
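
; The tables below hold the VP8 subpel filter coefficients for the seven
; fractional positions mx/my = 1..7, as defined by the VP8 bitstream, and are
; pre-interleaved so that pmaddwd (word tables) or pmaddubsw (byte tables)
; can apply two taps per multiply.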
fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7
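
; In position-independent builds the tables cannot be addressed directly, so
; a scratch register (picregq) holds their base address at runtime; otherwise
; the _m labels above are used as immediate memory operands.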
%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
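
; pw_256 feeds pmulhrsw: x*256 is shifted right by 14 with rounding, then by
; 1 more, which amounts to exactly the rounded (x + 64) >> 7 the filters need
; after summing their taps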
pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------
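; mx/my select one of seven fractional sample positions (0 means no filtering
; in that direction); <size> is the block width and <htap>/<vtap> the number
; of filter taps, 4 or 6
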
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea mxd, [mxq*3]
    mova m3, [filter_h6_shuf2]
    mova m4, [filter_h6_shuf3]
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif
    mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova m6, [sixtap_filter_hb+mxq*8-32]
    mova m7, [sixtap_filter_hb+mxq*8-16]

.nextrow
    movu m0, [srcq-2]
    mova m1, m0
    mova m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb m0, [filter_h6_shuf1]
%endif
    pshufb m1, m3
    pshufb m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw m0, m1
    paddsw m0, m2
    pmulhrsw m0, [pw_256]
    packuswb m0, m0
    movh [dstq], m0 ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
    mova m2, [pw_256]
    mova m3, [filter_h2_shuf]
    mova m4, [filter_h4_shuf]
%ifdef PIC
    lea picregq, [fourtap_filter_hb_m]
%endif
    mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova m6, [fourtap_filter_hb+mxq]

.nextrow
    movu m0, [srcq-1]
    mova m1, m0
    pshufb m0, m3
    pshufb m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw m0, m1
    pmulhrsw m0, m2
    packuswb m0, m0
    movh [dstq], m0 ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
%ifdef PIC
    lea picregq, [fourtap_filter_hb_m]
%endif
    mova m5, [fourtap_filter_hb+myq-16]
    mova m6, [fourtap_filter_hb+myq]
    mova m7, [pw_256]

    ; read 3 lines
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+ srcstrideq]
    movh m2, [srcq+2*srcstrideq]
    add srcq, srcstrideq

.nextrow
    movh m3, [srcq+2*srcstrideq] ; read new row
    mova m4, m0
    mova m0, m1
    punpcklbw m4, m1
    mova m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw m4, m2
    mova m2, m3
    pmulhrsw m4, m7
    packuswb m4, m4
    movh [dstq], m4

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea myd, [myq*3]
%ifdef PIC
    lea picregq, [sixtap_filter_hb_m]
%endif
    lea myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub srcq, srcstrideq
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+srcstrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    movh m3, [srcq]
    movh m4, [srcq+srcstrideq]

.nextrow
    movh m5, [srcq+2*srcstrideq] ; read new row
    mova m6, m0
    punpcklbw m6, m5
    mova m0, m1
    punpcklbw m1, m2
    mova m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw m6, m1
    paddsw m6, m7
    mova m1, m2
    mova m2, m3
    pmulhrsw m6, [pw_256]
    mova m3, m4
    packuswb m6, m6
    mova m4, m5
    movh [dstq], m6

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET
%endmacro
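
; instantiate twice: MMX regs (8 bytes) cover the 4px-wide functions, XMM
; regs (16 bytes) the 8px-wide ones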
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
INIT_MMX mmx2
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
%ifdef PIC
    lea picregq, [fourtap_filter_hw_m]
%endif
    movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq mm5, [fourtap_filter_hw+mxq]
    movq mm7, [pw_64]
    pxor mm6, mm6

.nextrow
    movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq mm2, mm1 ; byte ABCD..
    punpcklbw mm1, mm6 ; byte->word ABCD
    pshufw mm0, mm2, 9 ; byte CDEF..
    punpcklbw mm0, mm6 ; byte->word CDEF
    pshufw mm3, mm1, 0x94 ; word ABBC
    pshufw mm1, mm0, 0x94 ; word CDDE
    pmaddwd mm3, mm4 ; multiply 2px with F0/F1
    movq mm0, mm1 ; backup for second set of pixels
    pmaddwd mm1, mm5 ; multiply 2px with F2/F3
    paddd mm3, mm1 ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6 ; byte->word EFGH
    pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
    pshufw mm1, mm2, 0x94 ; word EFFG
    pmaddwd mm1, mm5 ; multiply 2px with F2/F3
    paddd mm0, mm1 ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw mm3, mm0 ; merge dword->word (4px)
    paddsw mm3, mm7 ; rounding
    psraw mm3, 7
    packuswb mm3, mm6 ; clip and word->bytes
    movd [dstq], mm3 ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

; 4x4 block, H-only 6-tap filter
INIT_MMX mmx2
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea mxd, [mxq*3]
%ifdef PIC
    lea picregq, [sixtap_filter_hw_m]
%endif
    movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq mm5, [sixtap_filter_hw+mxq*8-32]
    movq mm6, [sixtap_filter_hw+mxq*8-16]
    movq mm7, [pw_64]
    pxor mm3, mm3

.nextrow
    movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq mm2, mm1 ; byte ABCD..
    punpcklbw mm1, mm3 ; byte->word ABCD
    pshufw mm0, mm2, 0x9 ; byte CDEF..
    punpckhbw mm2, mm3 ; byte->word EFGH
    punpcklbw mm0, mm3 ; byte->word CDEF
    pshufw mm1, mm1, 0x94 ; word ABBC
    pshufw mm2, mm2, 0x94 ; word EFFG
    pmaddwd mm1, mm4 ; multiply 2px with F0/F1
    pshufw mm3, mm0, 0x94 ; word CDDE
    movq mm0, mm3 ; backup for second set of pixels
    pmaddwd mm3, mm5 ; multiply 2px with F2/F3
    paddd mm1, mm3 ; add to 1st 2px cache
    movq mm3, mm2 ; backup for second set of pixels
    pmaddwd mm2, mm6 ; multiply 2px with F4/F5
    paddd mm1, mm2 ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
    pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
    pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
    paddd mm0, mm3 ; add to 2nd 2px cache
    pxor mm3, mm3
    punpcklbw mm2, mm3 ; byte->word FGHI
    pshufw mm2, mm2, 0xE9 ; word GHHI
    pmaddwd mm2, mm6 ; multiply 2px with F4/F5
    paddd mm0, mm2 ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw mm1, mm0 ; merge dword->word (4px)
    paddsw mm1, mm7 ; rounding
    psraw mm1, 7
    packuswb mm1, mm3 ; clip and word->bytes
    movd [dstq], mm1 ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 5
%ifdef PIC
    lea picregq, [fourtap_filter_v_m]
%endif
    lea mxq, [fourtap_filter_v+mxq-32]
    pxor m7, m7
    mova m4, [pw_64]
    mova m5, [mxq+ 0]
    mova m6, [mxq+16]
%ifdef m8
    mova m8, [mxq+32]
    mova m9, [mxq+48]
%endif

.nextrow
    movq m0, [srcq-1]
    movq m1, [srcq-0]
    movq m2, [srcq+1]
    movq m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw m0, m5
    pmullw m1, m6
%ifdef m8
    pmullw m2, m8
    pmullw m3, m9
%else
    pmullw m2, [mxq+32]
    pmullw m3, [mxq+48]
%endif
    paddsw m0, m1
    paddsw m2, m3
    paddsw m0, m2
    paddsw m0, m4
    psraw m0, 7
    packuswb m0, m7
    movh [dstq], m0 ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea mxd, [mxq*3]
    shl mxd, 4
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    lea mxq, [sixtap_filter_v+mxq-96]
    pxor m7, m7
    mova m6, [pw_64]
%ifdef m8
    mova m8, [mxq+ 0]
    mova m9, [mxq+16]
    mova m10, [mxq+32]
    mova m11, [mxq+48]
    mova m12, [mxq+64]
    mova m13, [mxq+80]
%endif

.nextrow
    movq m0, [srcq-2]
    movq m1, [srcq-1]
    movq m2, [srcq-0]
    movq m3, [srcq+1]
    movq m4, [srcq+2]
    movq m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw m0, m8
    pmullw m1, m9
    pmullw m2, m10
    pmullw m3, m11
    pmullw m4, m12
    pmullw m5, m13
%else
    pmullw m0, [mxq+ 0]
    pmullw m1, [mxq+16]
    pmullw m2, [mxq+32]
    pmullw m3, [mxq+48]
    pmullw m4, [mxq+64]
    pmullw m5, [mxq+80]
%endif
    paddsw m1, m4
    paddsw m0, m5
    paddsw m1, m2
    paddsw m0, m3
    paddsw m0, m1
    paddsw m0, m6
    psraw m0, 7
    packuswb m0, m7
    movh [dstq], m0 ; store

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET
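
; The vertical filters keep a sliding window of source rows in registers:
; each iteration loads only the one new row, multiplies the window by the
; tap weights, and rotates the registers for the next output row.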
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 5
%ifdef PIC
    lea picregq, [fourtap_filter_v_m]
%endif
    lea myq, [fourtap_filter_v+myq-32]
    mova m6, [pw_64]
    pxor m7, m7
    mova m5, [myq+48]

    ; read 3 lines
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+ srcstrideq]
    movh m2, [srcq+2*srcstrideq]
    add srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    movh m4, [srcq+2*srcstrideq] ; read new row
    punpcklbw m4, m7
    mova m3, m4
    pmullw m0, [myq+0]
    pmullw m4, m5
    paddsw m4, m0

    ; then calculate positive taps
    mova m0, m1
    pmullw m1, [myq+16]
    paddsw m4, m1
    mova m1, m2
    pmullw m2, [myq+32]
    paddsw m4, m2
    mova m2, m3

    ; round/clip/store
    paddsw m4, m6
    psraw m4, 7
    packuswb m4, m7
    movh [dstq], m4

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET

; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
    lea myq, [myq*3]
%ifdef PIC
    lea picregq, [sixtap_filter_v_m]
%endif
    lea myq, [sixtap_filter_v+myq-96]
    pxor m7, m7

    ; read 5 lines
    sub srcq, srcstrideq
    sub srcq, srcstrideq
    movh m0, [srcq]
    movh m1, [srcq+srcstrideq]
    movh m2, [srcq+srcstrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    add srcq, srcstrideq
    movh m3, [srcq]
    movh m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow
    ; first calculate negative taps (to prevent losing positive overflows)
    mova m5, m1
    pmullw m5, [myq+16]
    mova m6, m4
    pmullw m6, [myq+64]
    paddsw m6, m5

    ; then calculate positive taps
    movh m5, [srcq+2*srcstrideq] ; read new row
    punpcklbw m5, m7
    pmullw m0, [myq+0]
    paddsw m6, m0
    mova m0, m1
    mova m1, m2
    pmullw m2, [myq+32]
    paddsw m6, m2
    mova m2, m3
    pmullw m3, [myq+48]
    paddsw m6, m3
    mova m3, m4
    mova m4, m5
    pmullw m5, [myq+80]
    paddsw m6, m5

    ; round/clip/store
    paddsw m6, [pw_64]
    psraw m6, 7
    packuswb m6, m7
    movh [dstq], m6

    ; go to next line
    add dstq, dststrideq
    add srcq, srcstrideq
    dec heightd ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmx2
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
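
; Bilinear MC computes (a*(8-frac) + b*frac + 4) >> 3: the pmullw pair
; supplies the weighted sum, psraw 2 drops two bits, and pavgw against zero
; performs the final rounded >> 1.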
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vw_m]
%endif
    pxor m6, m6
    mova m5, [bilinear_filter_vw+myq-1*16]
    neg myq
    mova m4, [bilinear_filter_vw+myq+7*16]

.nextrow
    movh m0, [srcq+srcstrideq*0]
    movh m1, [srcq+srcstrideq*1]
    movh m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova m2, m1
    pmullw m0, m4
    pmullw m1, m5
    pmullw m2, m4
    pmullw m3, m5
    paddsw m0, m1
    paddsw m2, m3
    psraw m0, 2
    psraw m2, 2
    pavgw m0, m6
    pavgw m2, m6
%if mmsize == 8
    packuswb m0, m0
    packuswb m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb m0, m2
    movh [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vw_m]
%endif
    pxor m6, m6
    mova m5, [bilinear_filter_vw+mxq-1*16]
    neg mxq
    mova m4, [bilinear_filter_vw+mxq+7*16]

.nextrow
    movh m0, [srcq+srcstrideq*0+0]
    movh m1, [srcq+srcstrideq*0+1]
    movh m2, [srcq+srcstrideq*1+0]
    movh m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw m0, m4
    pmullw m1, m5
    pmullw m2, m4
    pmullw m3, m5
    paddsw m0, m1
    paddsw m2, m3
    psraw m0, 2
    psraw m2, 2
    pavgw m0, m6
    pavgw m2, m6
%if mmsize == 8
    packuswb m0, m0
    packuswb m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb m0, m2
    movh [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmx2
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
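
; The SSSE3 variant packs both weights as bytes (bilinear_filter_vb) so a
; single pmaddubsw replaces the pmullw/pmullw/paddsw sequence above.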
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl myd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vb_m]
%endif
    pxor m4, m4
    mova m3, [bilinear_filter_vb+myq-16]

.nextrow
    movh m0, [srcq+srcstrideq*0]
    movh m1, [srcq+srcstrideq*1]
    movh m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw m0, 2
    psraw m1, 2
    pavgw m0, m4
    pavgw m1, m4
%if mmsize == 8
    packuswb m0, m0
    packuswb m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb m0, m1
    movh [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl mxd, 4
%ifdef PIC
    lea picregq, [bilinear_filter_vb_m]
%endif
    pxor m4, m4
    mova m2, [filter_h2_shuf]
    mova m3, [bilinear_filter_vb+mxq-16]

.nextrow
    movu m0, [srcq+srcstrideq*0]
    movu m1, [srcq+srcstrideq*1]
    pshufb m0, m2
    pshufb m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw m0, 2
    psraw m1, 2
    pavgw m0, m4
    pavgw m1, m4
%if mmsize == 8
    packuswb m0, m0
    packuswb m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb m0, m1
    movh [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea dstq, [dstq+dststrideq*2]
    lea srcq, [srcq+srcstrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8

INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq mm0, [srcq+srcstrideq*0]
    movq mm1, [srcq+srcstrideq*1]
    lea srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq mm0, [srcq+srcstrideq*0+0]
    movq mm1, [srcq+srcstrideq*0+8]
    movq mm2, [srcq+srcstrideq*1+0]
    movq mm3, [srcq+srcstrideq*1+8]
    lea srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
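; DC = (block[0] + 4) >> 3 is added to all 16 pixels of a 4x4 block; since
; the pixels are unsigned bytes, the positive and negative parts are applied
; separately with paddusb/psubusb (see ADD_DC) rather than a signed add.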
%macro ADD_DC 4
    %4 m2, [dst1q+%3]
    %4 m3, [dst1q+strideq+%3]
    %4 m4, [dst2q+%3]
    %4 m5, [dst2q+strideq+%3]
    paddusb m2, %1
    paddusb m3, %1
    paddusb m4, %1
    paddusb m5, %1
    psubusb m2, %2
    psubusb m3, %2
    psubusb m4, %2
    psubusb m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd m0, [blockq]

    ; calculate DC
    paddw m0, [pw_4]
    pxor m1, m1
    psraw m0, 3
    movd [blockq], m1
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklwd m0, m0
    punpcklwd m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, movh
    RET

INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd m0, [blockq]
    pxor m1, m1

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    movd m2, [dst1q]
    movd m3, [dst1q+strideq]
    movd m4, [dst2q]
    movd m5, [dst2q+strideq]
    psraw m0, 3
    pshuflw m0, m0, 0
    punpcklqdq m0, m0
    punpckldq m2, m3
    punpckldq m4, m5
    punpcklbw m2, m1
    punpcklbw m4, m1
    paddw m2, m0
    paddw m4, m0
    packuswb m2, m4
    movd [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd m0, [blockq+32*0] ; A
    movd m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1 ; A B C D
    pxor m6, m6

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw m0, 3
    psubw m6, m0
    packuswb m0, m0
    packuswb m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq m1, m0
    movq m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    ADD_DC m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd m0, [blockq+32*0] ; A
    movd m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1 ; A B C D
    pxor m1, m1

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw m0, 3
    psubw m1, m0
    packuswb m0, m0
    packuswb m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd m0, [blockq+32*0] ; A
    movd m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1 ; A B C D
    pxor m6, m6

    ; calculate DC
    paddw m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw m0, 3
    psubw m6, m0
    packuswb m0, m0
    packuswb m6, m6
    punpcklbw m0, m0 ; AABBCCDD
    punpcklbw m6, m6 ; AABBCCDD
    movq m1, m0
    movq m7, m6
    punpcklbw m0, m0 ; AAAABBBB
    punpckhbw m1, m1 ; CCCCDDDD
    punpcklbw m6, m6 ; AAAABBBB
    punpckhbw m7, m7 ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    lea dst1q, [dst1q+strideq*4]
    lea dst2q, [dst2q+strideq*4]
    ADD_DC m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
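; note: pw_17734 is 35468/2; the inputs are doubled first (paddw %1, %1), so
; pmulhw against 17734 still yields x*35468 >> 16 while keeping the constant
; within signed 16-bit range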
%macro VP8_MULTIPLY_SUMSUB 4
    mova %3, %1
    mova %4, %2
    pmulhw %3, m6 ; 20091(1)
    pmulhw %4, m6 ; 20091(2)
    paddw %3, %1
    paddw %4, %2
    paddw %1, %1
    paddw %2, %2
    pmulhw %1, m7 ; 35468(1)
    pmulhw %2, m7 ; 35468(2)
    psubw %1, %4
    paddw %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA w, %3, %1, %5 ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ; t2, t3
    SUMSUB_BA w, %4, %3, %5 ; tmp0, tmp3
    SUMSUB_BA w, %2, %1, %5 ; tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq m0, [blockq+ 0]
    movq m1, [blockq+ 8]
    movq m2, [blockq+16]
    movq m3, [blockq+24]
    movq m6, [pw_20091]
    movq m7, [pw_17734]
%if cpuflag(sse)
    xorps xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    ; store
    pxor m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
%macro SCATTER_WHT 3
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr dc1d, 16
    shr dc2d, 16
    psrlq m%1, 32
    psrlq m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd dc1d, m%1
    movd dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr dc1d, 16
    shr dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro
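
; inverse WHT for the luma DC block: two Hadamard passes with a transpose in
; between and (x + 3) >> 3 rounding; SCATTER_WHT then stores one DC word per
; 16-coefficient block (hence the 2*16 stride)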
%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq m0, [dc1q]
    movq m1, [dc1q+8]
    movq m2, [dc1q+16]
    movq m3, [dc1q+24]
%if cpuflag(sse)
    xorps xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor m4, m4
    movq [dc1q+ 0], m4
    movq [dc1q+ 8], m4
    movq [dc1q+16], m4
    movq [dc1q+24], m4
%endif
    HADAMARD4_1D 0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw m0, [pw_3]
    HADAMARD4_1D 0, 1, 2, 3
    psraw m0, 3
    psraw m1, 3
    psraw m2, 3
    psraw m3, 3
    SCATTER_WHT 0, 1, 0
    SCATTER_WHT 2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
; macro called with 7 mm register indexes as arguments, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd m%1, [%8+%10*4] ; A0-3
    movd m%5, [%9+%10*4] ; B0-3
    movd m%2, [%8+%10*2] ; C0-3
    movd m%6, [%8+%10] ; D0-3
    movd m%3, [%8] ; E0-3
    movd m%7, [%9] ; F0-3
    movd m%4, [%9+%11] ; G0-3
    punpcklbw m%1, m%5 ; A/B interleaved
    movd m%5, [%9+%11*2] ; H0-3
    punpcklbw m%2, m%6 ; C/D interleaved
    punpcklbw m%3, m%7 ; E/F interleaved
    punpcklbw m%4, m%5 ; G/H interleaved
%endmacro
; macro called with 7 mm register indexes as arguments, and 5 regular registers
; the first 11 arguments mean the same as for READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd m%1, [%8+%10*4] ; A0-3
    movd m%3, [%12+%10*4] ; I0-3
    movd m%2, [%8+%10*2] ; C0-3
    movd m%4, [%12+%10*2] ; K0-3
    movd m%6, [%8+%10] ; D0-3
    movd m%5, [%12+%10] ; L0-3
    movd m%7, [%12] ; M0-3
    add %12, %11
    punpcklbw m%1, m%3 ; A/I
    movd m%3, [%8] ; E0-3
    punpcklbw m%2, m%4 ; C/K
    punpcklbw m%6, m%5 ; D/L
    punpcklbw m%3, m%7 ; E/M
    punpcklbw m%2, m%6 ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd m%5, [%9+%10*4] ; B0-3
    movd m%4, [%12+%10*4] ; J0-3
    movd m%7, [%9] ; F0-3
    movd m%6, [%12] ; N0-3
    punpcklbw m%5, m%4 ; B/J
    punpcklbw m%7, m%6 ; F/N
    punpcklbw m%1, m%5 ; A/B/I/J interleaved
    punpcklbw m%3, m%7 ; E/F/M/N interleaved
    movd m%4, [%9+%11] ; G0-3
    movd m%6, [%12+%11] ; O0-3
    movd m%5, [%9+%11*2] ; H0-3
    movd m%7, [%12+%11*2] ; P0-3
    punpcklbw m%4, m%6 ; G/O
    punpcklbw m%5, m%7 ; H/P
    punpcklbw m%4, m%5 ; G/H/O/P interleaved
%endmacro

; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd [%5+%7*4], m%1
    movd [%5+%7*2], m%2
    movd [%5], m%3
    movd [%6+%8], m%4
    punpckhdq m%1, m%1
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd [%6+%7*4], m%1
    movd [%5+%7], m%2
    movd [%6], m%3
    movd [%6+%8*2], m%4
%endmacro
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd [%5+%8*4], m%1
    movd [%5], m%2
    movd [%7+%8*4], m%3
    movd [%7], m%4

    ; store dwords 1
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
    movd [%6+%8*4], m%1
    movd [%6], m%2
%if %10 == 16
    movd [%6+%9*4], m%3
%endif
    movd [%7+%9], m%4

    ; write dwords 2
    psrldq m%1, 4
    psrldq m%2, 4
%if %10 == 8
    movd [%5+%8*2], m%1
    movd %5d, m%3
%endif
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 16
    movd [%5+%8*2], m%1
%endif
    movd [%6+%9], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
    add %7, %9

    ; store dwords 3
    psrldq m%1, 4
    psrldq m%2, 4
    psrldq m%3, 4
    psrldq m%4, 4
%if %10 == 8
    mov [%7+%8*4], %5d
    movd [%6+%8*2], m%1
%else
    movd [%5+%8], m%1
%endif
    movd [%6+%9*2], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd %3d, %1
    punpckhdq %1, %1
    mov [%4+%5*4], %3w
    shr %3, 16
    add %4, %6
    mov [%4+%5*4], %3w
    movd %3d, %1
    add %4, %5
    mov [%4+%5*2], %3w
    shr %3, 16
    mov [%4+%5 ], %3w
    movd %3d, %2
    punpckhdq %2, %2
    mov [%4 ], %3w
    shr %3, 16
    mov [%4+%6 ], %3w
    movd %3d, %2
    add %4, %6
    mov [%4+%6 ], %3w
    shr %3, 16
    mov [%4+%6*2], %3w
    add %4, %5
%endmacro

%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw [%3+%4 ], %1, 3
    pextrw [%3 ], %1, 4
    pextrw [%2 ], %1, 5
    pextrw [%2+%5 ], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    movd %2d, %1
    psrldq %1, 4
    mov [%3+%4*4], %2w
    shr %2, 16
    add %3, %5
    mov [%3+%4*4], %2w
    movd %2d, %1
    psrldq %1, 4
    add %3, %4
    mov [%3+%4*2], %2w
    shr %2, 16
    mov [%3+%4 ], %2w
    movd %2d, %1
    psrldq %1, 4
    mov [%3 ], %2w
    shr %2, 16
    mov [%3+%5 ], %2w
    movd %2d, %1
    add %3, %5
    mov [%3+%5 ], %2w
    shr %2, 16
    mov [%3+%5*2], %2w
%endif
%endmacro
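
; The simple filter touches p0/q0 when 2*|p0-q0| + |p1-q1|/2 <= flim: it
; forms a = clip(3*(q0-p0) + (p1-q1)), then applies f1 = (a+4)>>3 to q0 and
; f2 = (a+3)>>3 to p0, as implemented below.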
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor m0, m0
%endif
    SPLATB_REG m7, flim, m0 ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova m0, [dst1q+mstrideq*2] ; p1
    mova m1, [dst1q+mstrideq] ; p0
    mova m2, [dst1q] ; q0
    mova m3, [dst1q+ strideq] ; q1
%else ; h
    lea dst2q, [dst1q+ strideq]
%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova m5, m2 ; m5=backup of q0
    mova m6, m1 ; m6=backup of p0
    psubusb m1, m2 ; p0-q0
    psubusb m2, m6 ; q0-p0
    por m1, m2 ; FFABS(p0-q0)
    paddusb m1, m1 ; m1=FFABS(p0-q0)*2

    mova m4, m3
    mova m2, m0
    psubusb m3, m0 ; q1-p1
    psubusb m0, m4 ; p1-q1
    por m3, m0 ; FFABS(p1-q1)
    mova m0, [pb_80]
    pxor m2, m0
    pxor m4, m0
    psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
    pand m3, [pb_FE]
    psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb m3, m1
    psubusb m3, m7
    pxor m1, m1
    pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova m4, m5
    pxor m5, m0
    pxor m0, m6
    psubsb m5, m0 ; q0-p0 (signed)
    paddsb m2, m5
    paddsb m2, m5
    paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
    pand m2, m3 ; apply filter mask (m3)
    mova m3, [pb_F8]
    mova m1, m2
    paddsb m2, [pb_4] ; f1<<3=a+4
    paddsb m1, [pb_3] ; f2<<3=a+3
    pand m2, m3
    pand m1, m3 ; cache f2<<3

    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m2 ; which values are <0?
    psubb m3, m2 ; -f1<<3
    psrlq m2, 3 ; +f1
    psrlq m3, 3 ; -f1
    pand m3, m0
    pandn m0, m2
    psubusb m4, m0
    paddusb m4, m3 ; q0-f1

    pxor m0, m0
    pxor m3, m3
    pcmpgtb m0, m1 ; which values are <0?
    psubb m3, m1 ; -f2<<3
    psrlq m1, 3 ; +f2
    psrlq m3, 3 ; -f2
    pand m3, m0
    pandn m0, m1
    paddusb m6, m0
    psubusb m6, m3 ; p0+f2

    ; store
%ifidn %1, v
    mova [dst1q], m4
    mova [dst1q+mstrideq], m6
%else ; h
    inc dst1q
    SBUTTERFLY bw, 6, 4, 0
%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc dst2q
%endif
    WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
    lea dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc dst3q
%endif
    WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add dst1q, 8 ; advance 8 cols = pixels
%else ; h
    lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
    dec cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmx2
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif
INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
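; The inner (normal) filter extends the simple filter's edge test with an
; interior threshold I over the p3..q3 differences and a high-edge-variance
; (hev) test; when hev is false, p1/q1 are additionally adjusted by (f1+1)>>1.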
  1491. %macro INNER_LOOPFILTER 2
  1492. %if %2 == 8 ; chroma
  1493. cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
  1494. %else ; luma
  1495. cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
  1496. %endif
  1497. %if cpuflag(ssse3)
  1498. pxor m7, m7
  1499. %endif
  1500. %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
  1501. %ifidn %1, v ; [3]=hev() result
  1502. %assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
  1503. %else ; h ; extra storage space for transposes
  1504. %assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
  1505. %endif
  1506. ; splat function arguments
  1507. SPLATB_REG m0, flimEq, m7 ; E
  1508. SPLATB_REG m1, flimIq, m7 ; I
  1509. SPLATB_REG m2, hevthrq, m7 ; hev_thresh
  1510. SUB rsp, pad
  1511. %define m_flimE [rsp]
  1512. %define m_flimI [rsp+mmsize]
  1513. %define m_hevthr [rsp+mmsize*2]
  1514. %define m_maskres [rsp+mmsize*3]
  1515. %define m_p0backup [rsp+mmsize*3]
  1516. %define m_q0backup [rsp+mmsize*4]
  1517. mova m_flimE, m0
  1518. mova m_flimI, m1
  1519. mova m_hevthr, m2
  1520. %else
  1521. %define m_flimE m9
  1522. %define m_flimI m10
  1523. %define m_hevthr m11
  1524. %define m_maskres m12
  1525. %define m_p0backup m12
  1526. %define m_q0backup m8
  1527. ; splat function arguments
  1528. SPLATB_REG m_flimE, flimEq, m7 ; E
  1529. SPLATB_REG m_flimI, flimIq, m7 ; I
  1530. SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
  1531. %endif
  1532. %if %2 == 8 ; chroma
  1533. DEFINE_ARGS dst1, dst8, mstride, stride, dst2
  1534. %elif mmsize == 8
  1535. DEFINE_ARGS dst1, mstride, stride, dst2, cntr
  1536. mov cntrq, 2
  1537. %else
  1538. DEFINE_ARGS dst1, mstride, stride, dst2, dst8
  1539. %endif
  1540. mov strideq, mstrideq
  1541. neg mstrideq
  1542. %ifidn %1, h
  1543. lea dst1q, [dst1q+strideq*4-4]
  1544. %if %2 == 8 ; chroma
  1545. lea dst8q, [dst8q+strideq*4-4]
  1546. %endif
  1547. %endif
  1548. %if mmsize == 8
  1549. .next8px:
  1550. %endif
  1551. ; read
  1552. lea dst2q, [dst1q+strideq]
  1553. %ifidn %1, v
  1554. %if %2 == 8 && mmsize == 16
  1555. %define movrow movh
  1556. %else
  1557. %define movrow mova
  1558. %endif
  1559. movrow m0, [dst1q+mstrideq*4] ; p3
  1560. movrow m1, [dst2q+mstrideq*4] ; p2
  1561. movrow m2, [dst1q+mstrideq*2] ; p1
  1562. movrow m5, [dst2q] ; q1
  1563. movrow m6, [dst2q+ strideq*1] ; q2
  1564. movrow m7, [dst2q+ strideq*2] ; q3
  1565. %if mmsize == 16 && %2 == 8
  1566. movhps m0, [dst8q+mstrideq*4]
  1567. movhps m2, [dst8q+mstrideq*2]
  1568. add dst8q, strideq
  1569. movhps m1, [dst8q+mstrideq*4]
  1570. movhps m5, [dst8q]
  1571. movhps m6, [dst8q+ strideq ]
  1572. movhps m7, [dst8q+ strideq*2]
  1573. add dst8q, mstrideq
  1574. %endif
  1575. %elif mmsize == 8 ; mmx/mmxext (h)
  1576. ; read 8 rows of 8px each
  1577. movu m0, [dst1q+mstrideq*4]
  1578. movu m1, [dst2q+mstrideq*4]
  1579. movu m2, [dst1q+mstrideq*2]
  1580. movu m3, [dst1q+mstrideq ]
  1581. movu m4, [dst1q]
  1582. movu m5, [dst2q]
  1583. movu m6, [dst2q+ strideq ]
  1584. ; 8x8 transpose
  1585. TRANSPOSE4x4B 0, 1, 2, 3, 7
  1586. mova m_q0backup, m1
  1587. movu m7, [dst2q+ strideq*2]
  1588. TRANSPOSE4x4B 4, 5, 6, 7, 1
  1589. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  1590. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  1591. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  1592. mova m1, m_q0backup
  1593. mova m_q0backup, m2 ; store q0
  1594. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  1595. mova m_p0backup, m5 ; store p0
  1596. SWAP 1, 4
  1597. SWAP 2, 4
  1598. SWAP 6, 3
  1599. SWAP 5, 3
  1600. %else ; sse2 (h)
  1601. %if %2 == 16
  1602. lea dst8q, [dst1q+ strideq*8]
  1603. %endif
  1604. ; read 16 rows of 8px each, interleave
  1605. movh m0, [dst1q+mstrideq*4]
  1606. movh m1, [dst8q+mstrideq*4]
  1607. movh m2, [dst1q+mstrideq*2]
  1608. movh m5, [dst8q+mstrideq*2]
  1609. movh m3, [dst1q+mstrideq ]
  1610. movh m6, [dst8q+mstrideq ]
  1611. movh m4, [dst1q]
  1612. movh m7, [dst8q]
  1613. punpcklbw m0, m1 ; A/I
  1614. punpcklbw m2, m5 ; C/K
  1615. punpcklbw m3, m6 ; D/L
  1616. punpcklbw m4, m7 ; E/M
  1617. add dst8q, strideq
  1618. movh m1, [dst2q+mstrideq*4]
  1619. movh m6, [dst8q+mstrideq*4]
  1620. movh m5, [dst2q]
  1621. movh m7, [dst8q]
  1622. punpcklbw m1, m6 ; B/J
  1623. punpcklbw m5, m7 ; F/N
  1624. movh m6, [dst2q+ strideq ]
  1625. movh m7, [dst8q+ strideq ]
  1626. punpcklbw m6, m7 ; G/O
  1627. ; 8x16 transpose
  1628. TRANSPOSE4x4B 0, 1, 2, 3, 7
  1629. %ifdef m8
  1630. SWAP 1, 8
  1631. %else
  1632. mova m_q0backup, m1
  1633. %endif
  1634. movh m7, [dst2q+ strideq*2]
  1635. movh m1, [dst8q+ strideq*2]
  1636. punpcklbw m7, m1 ; H/P
  1637. TRANSPOSE4x4B 4, 5, 6, 7, 1
  1638. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  1639. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  1640. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  1641. %ifdef m8
  1642. SWAP 1, 8
  1643. SWAP 2, 8
  1644. %else
  1645. mova m1, m_q0backup
  1646. mova m_q0backup, m2 ; store q0
  1647. %endif
  1648. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  1649. %ifdef m12
  1650. SWAP 5, 12
  1651. %else
  1652. mova m_p0backup, m5 ; store p0
  1653. %endif
  1654. SWAP 1, 4
  1655. SWAP 2, 4
  1656. SWAP 6, 3
  1657. SWAP 5, 3
  1658. %endif
  1659. ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
  1660. mova m4, m1
  1661. SWAP 4, 1
  1662. psubusb m4, m0 ; p2-p3
  1663. psubusb m0, m1 ; p3-p2
  1664. por m0, m4 ; abs(p3-p2)
  1665. mova m4, m2
  1666. SWAP 4, 2
  1667. psubusb m4, m1 ; p1-p2
  1668. psubusb m1, m2 ; p2-p1
  1669. por m1, m4 ; abs(p2-p1)
  1670. mova m4, m6
  1671. SWAP 4, 6
  1672. psubusb m4, m7 ; q2-q3
  1673. psubusb m7, m6 ; q3-q2
  1674. por m7, m4 ; abs(q3-q2)
  1675. mova m4, m5
  1676. SWAP 4, 5
  1677. psubusb m4, m6 ; q1-q2
  1678. psubusb m6, m5 ; q2-q1
  1679. por m6, m4 ; abs(q2-q1)
  1680. %if notcpuflag(mmx2)
  1681. mova m4, m_flimI
  1682. pxor m3, m3
  1683. psubusb m0, m4
  1684. psubusb m1, m4
  1685. psubusb m7, m4
  1686. psubusb m6, m4
  1687. pcmpeqb m0, m3 ; abs(p3-p2) <= I
  1688. pcmpeqb m1, m3 ; abs(p2-p1) <= I
  1689. pcmpeqb m7, m3 ; abs(q3-q2) <= I
  1690. pcmpeqb m6, m3 ; abs(q2-q1) <= I
  1691. pand m0, m1
  1692. pand m7, m6
  1693. pand m0, m7
  1694. %else ; mmxext/sse2
  1695. pmaxub m0, m1
  1696. pmaxub m6, m7
  1697. pmaxub m0, m6
  1698. %endif
  1699. ; normal_limit and high_edge_variance for p1-p0, q1-q0
  1700. SWAP 7, 3 ; now m7 is zero
  1701. %ifidn %1, v
  1702. movrow m3, [dst1q+mstrideq ] ; p0
  1703. %if mmsize == 16 && %2 == 8
  1704. movhps m3, [dst8q+mstrideq ]
  1705. %endif
  1706. %elifdef m12
  1707. SWAP 3, 12
  1708. %else
  1709. mova m3, m_p0backup
  1710. %endif
  1711. mova m1, m2
  1712. SWAP 1, 2
  1713. mova m6, m3
  1714. SWAP 3, 6
  1715. psubusb m1, m3 ; p1-p0
  1716. psubusb m6, m2 ; p0-p1
  1717. por m1, m6 ; abs(p1-p0)
  1718. %if notcpuflag(mmx2)
  1719. mova m6, m1
  1720. psubusb m1, m4
  1721. psubusb m6, m_hevthr
  1722. pcmpeqb m1, m7 ; abs(p1-p0) <= I
  1723. pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
  1724. pand m0, m1
  1725. mova m_maskres, m6
  1726. %else ; mmxext/sse2
  1727. pmaxub m0, m1 ; max_I
  1728. SWAP 1, 4 ; max_hev_thresh
  1729. %endif
  1730. SWAP 6, 4 ; now m6 is I
  1731. %ifidn %1, v
  1732. movrow m4, [dst1q] ; q0
  1733. %if mmsize == 16 && %2 == 8
  1734. movhps m4, [dst8q]
  1735. %endif
  1736. %elifdef m8
  1737. SWAP 4, 8
  1738. %else
  1739. mova m4, m_q0backup
  1740. %endif
  1741. mova m1, m4
  1742. SWAP 1, 4
  1743. mova m7, m5
  1744. SWAP 7, 5
  1745. psubusb m1, m5 ; q0-q1
  1746. psubusb m7, m4 ; q1-q0
  1747. por m1, m7 ; abs(q1-q0)
  1748. %if notcpuflag(mmx2)
  1749. mova m7, m1
  1750. psubusb m1, m6
  1751. psubusb m7, m_hevthr
  1752. pxor m6, m6
  1753. pcmpeqb m1, m6 ; abs(q1-q0) <= I
  1754. pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
  1755. mova m6, m_maskres
  1756. pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
  1757. pand m6, m7
  1758. %else ; mmxext/sse2
  1759. pxor m7, m7
  1760. pmaxub m0, m1
  1761. pmaxub m6, m1
  1762. psubusb m0, m_flimI
  1763. psubusb m6, m_hevthr
  1764. pcmpeqb m0, m7 ; max(abs(..)) <= I
  1765. pcmpeqb m6, m7 ; !(max(abs..) > thresh)
  1766. %endif
  1767. %ifdef m12
  1768. SWAP 6, 12
  1769. %else
  1770. mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
  1771. %endif
  1772. ; simple_limit
  1773. mova m1, m3
  1774. SWAP 1, 3
  1775. mova m6, m4 ; keep copies of p0/q0 around for later use
  1776. SWAP 6, 4
  1777. psubusb m1, m4 ; p0-q0
  1778. psubusb m6, m3 ; q0-p0
  1779. por m1, m6 ; abs(q0-p0)
  1780. paddusb m1, m1 ; m1=2*abs(q0-p0)
  1781. mova m7, m2
  1782. SWAP 7, 2
  1783. mova m6, m5
  1784. SWAP 6, 5
  1785. psubusb m7, m5 ; p1-q1
  1786. psubusb m6, m2 ; q1-p1
  1787. por m7, m6 ; abs(q1-p1)
  1788. pxor m6, m6
  1789. pand m7, [pb_FE]
  1790. psrlq m7, 1 ; abs(q1-p1)/2
  1791. paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
  1792. psubusb m7, m_flimE
  1793. pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
  1794. pand m0, m7 ; normal_limit result
  1795. ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
  1796. %ifdef m8 ; x86-64 && sse2
  1797. mova m8, [pb_80]
  1798. %define m_pb_80 m8
  1799. %else ; x86-32 or mmx/mmxext
  1800. %define m_pb_80 [pb_80]
  1801. %endif
  1802. mova m1, m4
  1803. mova m7, m3
  1804. pxor m1, m_pb_80
  1805. pxor m7, m_pb_80
  1806. psubsb m1, m7 ; (signed) q0-p0
  1807. mova m6, m2
  1808. mova m7, m5
  1809. pxor m6, m_pb_80
  1810. pxor m7, m_pb_80
  1811. psubsb m6, m7 ; (signed) p1-q1
  1812. mova m7, m_maskres
  1813. pandn m7, m6
  1814. paddsb m7, m1
  1815. paddsb m7, m1
  1816. paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
  1817. pand m7, m0
  1818. mova m1, [pb_F8]
  1819. mova m6, m7
  1820. paddsb m7, [pb_3]
  1821. paddsb m6, [pb_4]
  1822. pand m7, m1
  1823. pand m6, m1
  1824. pxor m1, m1
  1825. pxor m0, m0
  1826. pcmpgtb m1, m7
  1827. psubb m0, m7
  1828. psrlq m7, 3 ; +f2
  1829. psrlq m0, 3 ; -f2
  1830. pand m0, m1
  1831. pandn m1, m7
  1832. psubusb m3, m0
  1833. paddusb m3, m1 ; p0+f2
  1834. pxor m1, m1
  1835. pxor m0, m0
  1836. pcmpgtb m0, m6
  1837. psubb m1, m6
  1838. psrlq m6, 3 ; +f1
  1839. psrlq m1, 3 ; -f1
  1840. pand m1, m0
  1841. pandn m0, m6
  1842. psubusb m4, m0
  1843. paddusb m4, m1 ; q0-f1
%ifdef m12
    SWAP 6, 12
%else
    mova m6, m_maskres
%endif
%if notcpuflag(mmx2)
    mova m7, [pb_1]
%else ; mmxext/sse2
    pxor m7, m7
%endif
    pand m0, m6
    pand m1, m6
%if notcpuflag(mmx2)
    paddusb m0, m7
    pand m1, [pb_FE]
    pandn m7, m0
    psrlq m1, 1
    psrlq m7, 1
    SWAP 0, 7
%else ; mmxext/sse2
    psubusb m1, [pb_1]
    pavgb m0, m7 ; a
    pavgb m1, m7 ; -a
%endif
    psubusb m5, m0
    psubusb m2, m1
    paddusb m5, m1 ; q1-a
    paddusb m2, m0 ; p1+a
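    ; (Editor's note) Outer-tap update for non-hev pixels: a = (f1+1) >> 1,
    ; then p1 += a and q1 -= a.  On mmxext+ the rounded halving is a single
    ; pavgb against zero; the plain-mmx path emulates it with paddusb/pandn
    ; and psrlq, clearing low bits via [pb_FE] so the 64-bit shift cannot
    ; leak across byte lanes.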
    ; store
%ifidn %1, v
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq ], m3
    movrow [dst1q], m4
    movrow [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
    movhps [dst8q+mstrideq*2], m2
    movhps [dst8q+mstrideq ], m3
    movhps [dst8q], m4
    movhps [dst8q+ strideq ], m5
%endif
%else ; h
    add dst1q, 2
    add dst2q, 2
    ; 4x8/16 transpose
    TRANSPOSE4x4B 2, 3, 4, 5, 6
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
    lea dst8q, [dst8q+mstrideq+2]
    WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif
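    ; (Editor's note) With 8-byte mmx registers each pass covers 8 pixels:
    ; the 16-px luma edge loops twice via cntrq, while the chroma version
    ; runs once on the U pointer and once more after the cmp/mov/jnz below
    ; swaps in the V pointer (dst8q).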
%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub dst1q, 2
%endif
    cmp dst1q, dst8q
    mov dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea dst1q, [dst1q+ strideq*8-2]
%else ; v
    add dst1q, 8
%endif
    dec cntrq
    jg .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    ADD rsp, pad
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
INIT_MMX mmx2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
%endif

INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                             int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
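; (Editor's note) For reference, the edge filter implemented below, as a
; hedged C-style sketch (identifiers are the editor's, not Libav's API);
; non-hev pixels take these three taps, hev pixels the common 4-tap filter:
;   w  = clamp_s8(clamp_s8(p1 - q1) + 3*(q0 - p0));  // masked by normal_limit
;   a0 = (27*w + 63) >> 7;  p0 += a0;  q0 -= a0;
;   a1 = (18*w + 63) >> 7;  p1 += a1;  q1 -= a1;
;   a2 = ( 9*w + 63) >> 7;  p2 += a2;  q2 -= a2;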
%macro MBEDGE_LOOPFILTER 2
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
    pxor m7, m7
%endif
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ; [3]=hev() result
                 ; [4]=filter tmp result
                 ; [5]/[6] = p2/q2 backup
                 ; [7]=lim_res sign result
%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
%else ; 8 ; extra storage space for transposes
%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
%endif
    ; splat function arguments
    SPLATB_REG m0, flimEq, m7 ; E
    SPLATB_REG m1, flimIq, m7 ; I
    SPLATB_REG m2, hevthrq, m7 ; hev_thresh
    SUB rsp, pad
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
%define m_maskres [rsp+mmsize*3]
%define m_limres [rsp+mmsize*4]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
%define m_p2backup [rsp+mmsize*5]
%define m_q2backup [rsp+mmsize*6]
%if mmsize == 16
%define m_limsign [rsp]
%else
%define m_limsign [rsp+mmsize*7]
%endif
    mova m_flimE, m0
    mova m_flimI, m1
    mova m_hevthr, m2
%else ; sse2 on x86-64
%define m_flimE m9
%define m_flimI m10
%define m_hevthr m11
%define m_maskres m12
%define m_limres m8
%define m_p0backup m12
%define m_q0backup m8
%define m_p2backup m13
%define m_q2backup m14
%define m_limsign m9
    ; splat function arguments
    SPLATB_REG m_flimE, flimEq, m7 ; E
    SPLATB_REG m_flimI, flimIq, m7 ; I
    SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif
%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea dst8q, [dst8q+strideq*4-4]
%endif
%endif
%if mmsize == 8
.next8px:
%endif
    ; read
    lea dst2q, [dst1q+ strideq ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow m0, [dst1q+mstrideq*4] ; p3
    movrow m1, [dst2q+mstrideq*4] ; p2
    movrow m2, [dst1q+mstrideq*2] ; p1
    movrow m5, [dst2q] ; q1
    movrow m6, [dst2q+ strideq ] ; q2
    movrow m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps m0, [dst8q+mstrideq*4]
    movhps m2, [dst8q+mstrideq*2]
    add dst8q, strideq
    movhps m1, [dst8q+mstrideq*4]
    movhps m5, [dst8q]
    movhps m6, [dst8q+ strideq ]
    movhps m7, [dst8q+ strideq*2]
    add dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu m0, [dst1q+mstrideq*4]
    movu m1, [dst2q+mstrideq*4]
    movu m2, [dst1q+mstrideq*2]
    movu m3, [dst1q+mstrideq ]
    movu m4, [dst1q]
    movu m5, [dst2q]
    movu m6, [dst2q+ strideq ]
    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova m_q0backup, m1
    movu m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
    mova m1, m_q0backup
    mova m_q0backup, m2 ; store q0
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
    mova m_p0backup, m5 ; store p0
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%else ; sse2 (h)
%if %2 == 16
    lea dst8q, [dst1q+ strideq*8 ]
%endif
    ; read 16 rows of 8px each, interleave
    movh m0, [dst1q+mstrideq*4]
    movh m1, [dst8q+mstrideq*4]
    movh m2, [dst1q+mstrideq*2]
    movh m5, [dst8q+mstrideq*2]
    movh m3, [dst1q+mstrideq ]
    movh m6, [dst8q+mstrideq ]
    movh m4, [dst1q]
    movh m7, [dst8q]
    punpcklbw m0, m1 ; A/I
    punpcklbw m2, m5 ; C/K
    punpcklbw m3, m6 ; D/L
    punpcklbw m4, m7 ; E/M
    add dst8q, strideq
    movh m1, [dst2q+mstrideq*4]
    movh m6, [dst8q+mstrideq*4]
    movh m5, [dst2q]
    movh m7, [dst8q]
    punpcklbw m1, m6 ; B/J
    punpcklbw m5, m7 ; F/N
    movh m6, [dst2q+ strideq ]
    movh m7, [dst8q+ strideq ]
    punpcklbw m6, m7 ; G/O
    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP 1, 8
%else
    mova m_q0backup, m1
%endif
    movh m7, [dst2q+ strideq*2]
    movh m1, [dst8q+ strideq*2]
    punpcklbw m7, m1 ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1 ; p3/p2
    SBUTTERFLY dq, 2, 6, 1 ; q0/q1
    SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
    SWAP 1, 8
    SWAP 2, 8
%else
    mova m1, m_q0backup
    mova m_q0backup, m2 ; store q0
%endif
    SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
    SWAP 5, 12
%else
    mova m_p0backup, m5 ; store p0
%endif
    SWAP 1, 4
    SWAP 2, 4
    SWAP 6, 3
    SWAP 5, 3
%endif
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova m4, m1
    SWAP 4, 1
    psubusb m4, m0 ; p2-p3
    psubusb m0, m1 ; p3-p2
    por m0, m4 ; abs(p3-p2)
    mova m4, m2
    SWAP 4, 2
    psubusb m4, m1 ; p1-p2
    mova m_p2backup, m1
    psubusb m1, m2 ; p2-p1
    por m1, m4 ; abs(p2-p1)
    mova m4, m6
    SWAP 4, 6
    psubusb m4, m7 ; q2-q3
    psubusb m7, m6 ; q3-q2
    por m7, m4 ; abs(q3-q2)
    mova m4, m5
    SWAP 4, 5
    psubusb m4, m6 ; q1-q2
    mova m_q2backup, m6
    psubusb m6, m5 ; q2-q1
    por m6, m4 ; abs(q2-q1)
%if notcpuflag(mmx2)
    mova m4, m_flimI
    pxor m3, m3
    psubusb m0, m4
    psubusb m1, m4
    psubusb m7, m4
    psubusb m6, m4
    pcmpeqb m0, m3 ; abs(p3-p2) <= I
    pcmpeqb m1, m3 ; abs(p2-p1) <= I
    pcmpeqb m7, m3 ; abs(q3-q2) <= I
    pcmpeqb m6, m3 ; abs(q2-q1) <= I
    pand m0, m1
    pand m7, m6
    pand m0, m7
%else ; mmxext/sse2
    pmaxub m0, m1
    pmaxub m6, m7
    pmaxub m0, m6
%endif
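    ; (Editor's note) For unsigned bytes, (a <= I && b <= I) is equivalent
    ; to max(a, b) <= I, so the mmxext+ path folds the four edge differences
    ; with pmaxub and defers the compare; plain mmx, lacking pmaxub, tests
    ; each difference against I separately as above.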
    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP 7, 3 ; now m7 is zero
%ifidn %1, v
    movrow m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP 3, 12
%else
    mova m3, m_p0backup
%endif
    mova m1, m2
    SWAP 1, 2
    mova m6, m3
    SWAP 3, 6
    psubusb m1, m3 ; p1-p0
    psubusb m6, m2 ; p0-p1
    por m1, m6 ; abs(p1-p0)
%if notcpuflag(mmx2)
    mova m6, m1
    psubusb m1, m4
    psubusb m6, m_hevthr
    pcmpeqb m1, m7 ; abs(p1-p0) <= I
    pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
    pand m0, m1
    mova m_maskres, m6
%else ; mmxext/sse2
    pmaxub m0, m1 ; max_I
    SWAP 1, 4 ; max_hev_thresh
%endif
    SWAP 6, 4 ; now m6 is I
%ifidn %1, v
    movrow m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
    movhps m4, [dst8q]
%endif
%elifdef m8
    SWAP 4, 8
%else
    mova m4, m_q0backup
%endif
    mova m1, m4
    SWAP 1, 4
    mova m7, m5
    SWAP 7, 5
    psubusb m1, m5 ; q0-q1
    psubusb m7, m4 ; q1-q0
    por m1, m7 ; abs(q1-q0)
%if notcpuflag(mmx2)
    mova m7, m1
    psubusb m1, m6
    psubusb m7, m_hevthr
    pxor m6, m6
    pcmpeqb m1, m6 ; abs(q1-q0) <= I
    pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
    mova m6, m_maskres
    pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
    pand m6, m7
%else ; mmxext/sse2
    pxor m7, m7
    pmaxub m0, m1
    pmaxub m6, m1
    psubusb m0, m_flimI
    psubusb m6, m_hevthr
    pcmpeqb m0, m7 ; max(abs(..)) <= I
    pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP 6, 12
%else
    mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
    ; simple_limit
    mova m1, m3
    SWAP 1, 3
    mova m6, m4 ; keep copies of p0/q0 around for later use
    SWAP 6, 4
    psubusb m1, m4 ; p0-q0
    psubusb m6, m3 ; q0-p0
    por m1, m6 ; abs(q0-p0)
    paddusb m1, m1 ; m1=2*abs(q0-p0)
    mova m7, m2
    SWAP 7, 2
    mova m6, m5
    SWAP 6, 5
    psubusb m7, m5 ; p1-q1
    psubusb m6, m2 ; q1-p1
    por m7, m6 ; abs(q1-p1)
    pxor m6, m6
    pand m7, [pb_FE]
    psrlq m7, 1 ; abs(q1-p1)/2
    paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb m7, m_flimE
    pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand m0, m7 ; normal_limit result
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova m1, m4
    mova m7, m3
    pxor m1, m_pb_80
    pxor m7, m_pb_80
    psubsb m1, m7 ; (signed) q0-p0
    mova m6, m2
    mova m7, m5
    pxor m6, m_pb_80
    pxor m7, m_pb_80
    psubsb m6, m7 ; (signed) p1-q1
    mova m7, m_maskres
    paddsb m6, m1
    paddsb m6, m1
    paddsb m6, m1
    pand m6, m0
%ifdef m8
    mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand m_limres, m7
%else
    mova m0, m6
    pand m0, m7
    mova m_limres, m0
%endif
    pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
    mova m1, [pb_F8]
    mova m6, m7
    paddsb m7, [pb_3]
    paddsb m6, [pb_4]
    pand m7, m1
    pand m6, m1
    pxor m1, m1
    pxor m0, m0
    pcmpgtb m1, m7
    psubb m0, m7
    psrlq m7, 3 ; +f2
    psrlq m0, 3 ; -f2
    pand m0, m1
    pandn m1, m7
    psubusb m3, m0
    paddusb m3, m1 ; p0+f2
    pxor m1, m1
    pxor m0, m0
    pcmpgtb m0, m6
    psubb m1, m6
    psrlq m6, 3 ; +f1
    psrlq m1, 3 ; -f1
    pand m1, m0
    pandn m0, m6
    psubusb m4, m0
    paddusb m4, m1 ; q0-f1
    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
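    ; (Editor's note) The three rounds below evaluate the a0/a1/a2 taps from
    ; the sketch near the prototype, with w held in lim_res: ssse3 forms each
    ; (k*w + 63) >> 7 via pmaddubsw against packed {k,63} constants, while
    ; older CPUs widen to words and use pmullw [pw_27/18/9] plus paddw [pw_63].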
%if cpuflag(ssse3)
    mova m7, [pb_1]
%else
    mova m7, [pw_63]
%endif
%ifdef m8
    SWAP 1, 8
%else
    mova m1, m_limres
%endif
    pxor m0, m0
    mova m6, m1
    pcmpgtb m0, m1 ; which are negative
%if cpuflag(ssse3)
    punpcklbw m6, m7 ; interleave with "1" for rounding
    punpckhbw m1, m7
%else
    punpcklbw m6, m0 ; signed byte->word
    punpckhbw m1, m0
%endif
    mova m_limsign, m0
%if cpuflag(ssse3)
    mova m7, [pb_27_63]
%ifndef m8
    mova m_limres, m1
%endif
%ifdef m10
    SWAP 0, 10 ; don't lose lim_sign copy
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%ifdef m10
    SWAP 0, 10
%else
    mova m0, m_limsign
%endif
%else
    mova m_maskres, m6 ; backup for later in filter
    mova m_limres, m1
    pmullw m6, [pw_27]
    pmullw m1, [pw_27]
    paddw m6, m7
    paddw m1, m7
%endif
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a0
    pxor m1, m1
    psubb m1, m6
    pand m1, m0 ; -a0
    pandn m0, m6 ; +a0
%if cpuflag(ssse3)
    mova m6, [pb_18_63] ; pipelining
%endif
    psubusb m3, m1
    paddusb m4, m1
    paddusb m3, m0 ; p0+a0
    psubusb m4, m0 ; q0-a0
%if cpuflag(ssse3)
    SWAP 6, 7
%ifdef m10
    SWAP 1, 10
%else
    mova m1, m_limres
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%ifdef m10
    SWAP 0, 10
%endif
    mova m0, m_limsign
%else
    mova m6, m_maskres
    mova m1, m_limres
    pmullw m6, [pw_18]
    pmullw m1, [pw_18]
    paddw m6, m7
    paddw m1, m7
%endif
    mova m0, m_limsign
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a1
    pxor m1, m1
    psubb m1, m6
    pand m1, m0 ; -a1
    pandn m0, m6 ; +a1
%if cpuflag(ssse3)
    mova m6, [pb_9_63]
%endif
    psubusb m2, m1
    paddusb m5, m1
    paddusb m2, m0 ; p1+a1
    psubusb m5, m0 ; q1-a1
%if cpuflag(ssse3)
    SWAP 6, 7
%ifdef m10
    SWAP 1, 10
%else
    mova m1, m_limres
%endif
    mova m0, m7
    pmaddubsw m7, m6
    SWAP 6, 7
    pmaddubsw m0, m1
    SWAP 1, 0
%else
%ifdef m8
    SWAP 6, 12
    SWAP 1, 8
%else
    mova m6, m_maskres
    mova m1, m_limres
%endif
    pmullw m6, [pw_9]
    pmullw m1, [pw_9]
    paddw m6, m7
    paddw m1, m7
%endif
%ifdef m9
    SWAP 7, 9
%else
    mova m7, m_limsign
%endif
    psraw m6, 7
    psraw m1, 7
    packsswb m6, m1 ; a2
    pxor m0, m0
    psubb m0, m6
    pand m0, m7 ; -a2
    pandn m7, m6 ; +a2
%ifdef m8
    SWAP 1, 13
    SWAP 6, 14
%else
    mova m1, m_p2backup
    mova m6, m_q2backup
%endif
    psubusb m1, m0
    paddusb m6, m0
    paddusb m1, m7 ; p2+a2
    psubusb m6, m7 ; q2-a2
    ; store
%ifidn %1, v
    movrow [dst2q+mstrideq*4], m1
    movrow [dst1q+mstrideq*2], m2
    movrow [dst1q+mstrideq ], m3
    movrow [dst1q], m4
    movrow [dst2q], m5
    movrow [dst2q+ strideq ], m6
%if mmsize == 16 && %2 == 8
    add dst8q, mstrideq
    movhps [dst8q+mstrideq*2], m1
    movhps [dst8q+mstrideq ], m2
    movhps [dst8q], m3
    add dst8q, strideq
    movhps [dst8q], m4
    movhps [dst8q+ strideq ], m5
    movhps [dst8q+ strideq*2], m6
%endif
%else ; h
    inc dst1q
    inc dst2q
    ; 4x8/16 transpose
    TRANSPOSE4x4B 1, 2, 3, 4, 0
    SBUTTERFLY bw, 5, 6, 0
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
    add dst1q, 4
    WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
    lea dst8q, [dst8q+mstrideq+1]
    WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
    lea dst1q, [dst2q+mstrideq+4]
    lea dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
    add dst2q, 4
%endif
    WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
%if cpuflag(sse4)
    lea dst2q, [dst8q+ strideq ]
%endif
    WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif
%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub dst1q, 5
%endif
    cmp dst1q, dst8q
    mov dst1q, dst8q
    jnz .next8px
%else
%ifidn %1, h
    lea dst1q, [dst1q+ strideq*8-5]
%else ; v
    add dst1q, 8
%endif
    dec cntrq
    jg .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    ADD rsp, pad
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
INIT_MMX mmx2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
%endif

INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8

INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8