;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

fourtap_filter_hw_m: times 4 dw  -6, 123
                     times 4 dw  12,  -1
                     times 4 dw  -9,  93
                     times 4 dw  50,  -6
                     times 4 dw  -6,  50
                     times 4 dw  93,  -9
                     times 4 dw  -1,  12
                     times 4 dw 123,  -6

sixtap_filter_hw_m:  times 4 dw   2, -11
                     times 4 dw 108,  36
                     times 4 dw  -8,   1
                     times 4 dw   3, -16
                     times 4 dw  77,  77
                     times 4 dw -16,   3
                     times 4 dw   1,  -8
                     times 4 dw  36, 108
                     times 4 dw -11,   2

fourtap_filter_hb_m: times 8 db  -6, 123
                     times 8 db  12,  -1
                     times 8 db  -9,  93
                     times 8 db  50,  -6
                     times 8 db  -6,  50
                     times 8 db  93,  -9
                     times 8 db  -1,  12
                     times 8 db 123,  -6

sixtap_filter_hb_m:  times 8 db   2,   1
                     times 8 db -11, 108
                     times 8 db  36,  -8
                     times 8 db   3,   3
                     times 8 db -16,  77
                     times 8 db  77, -16
                     times 8 db   1,   2
                     times 8 db  -8,  36
                     times 8 db 108, -11

fourtap_filter_v_m:  times 8 dw  -6
                     times 8 dw 123
                     times 8 dw  12
                     times 8 dw  -1
                     times 8 dw  -9
                     times 8 dw  93
                     times 8 dw  50
                     times 8 dw  -6
                     times 8 dw  -6
                     times 8 dw  50
                     times 8 dw  93
                     times 8 dw  -9
                     times 8 dw  -1
                     times 8 dw  12
                     times 8 dw 123
                     times 8 dw  -6

sixtap_filter_v_m:   times 8 dw   2
                     times 8 dw -11
                     times 8 dw 108
                     times 8 dw  36
                     times 8 dw  -8
                     times 8 dw   1
                     times 8 dw   3
                     times 8 dw -16
                     times 8 dw  77
                     times 8 dw  77
                     times 8 dw -16
                     times 8 dw   3
                     times 8 dw   1
                     times 8 dw  -8
                     times 8 dw  36
                     times 8 dw 108
                     times 8 dw -11
                     times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif
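; on PIC targets the tables above cannot be addressed as absolute memory
; operands, so an extra GPR (picregq, accounted for via npicregs) is loaded
; with the table address at runtime and the *_m symbols are reached through it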
filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE

SECTION .text
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------
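; The tables above hold the VP8 subpel coefficients: odd fractional positions
; (1/3/5/7) reduce to the 4-tap rows, even ones (2/4/6) need the 6-tap rows.
; All taps sum to 128, so the filtered sum is rounded back to 8 bits as
; (x + 64) >> 7. The SSSE3 versions do that rounding with a single
; "pmulhrsw m0, [pw_256]", since pmulhrsw computes (x*256 + 0x4000) >> 15,
; which equals (x + 64) >> 7; the mmxext/sse2 versions add pw_64 and shift
; right by 7 instead.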
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
    ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
    ; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]
    packuswb  m0, m0
    movh  [dstq], m0            ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea   picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0            ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl       myd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub      srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add      srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq] ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea       myd, [myq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hb_m]
%endif
    lea       myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub      srcq, srcstrideq
    sub      srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea      srcq, [srcq+srcstrideq*2]
    add      srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq] ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET
%endmacro
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
; 4x4 block, H-only 4-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]     ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1          ; byte ABCD..
    punpcklbw mm1, mm6          ; byte->word ABCD
    pshufw    mm0, mm2, 9       ; byte CDEF..
    punpcklbw mm0, mm6          ; byte->word CDEF
    pshufw    mm3, mm1, 0x94    ; word ABBC
    pshufw    mm1, mm0, 0x94    ; word CDDE
    pmaddwd   mm3, mm4          ; multiply 2px with F0/F1
    movq      mm0, mm1          ; backup for second set of pixels
    pmaddwd   mm1, mm5          ; multiply 2px with F2/F3
    paddd     mm3, mm1          ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6          ; byte->word EFGH
    pmaddwd   mm0, mm4          ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94    ; word EFFG
    pmaddwd   mm1, mm5          ; multiply 2px with F2/F3
    paddd     mm0, mm1          ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0          ; merge dword->word (4px)
    paddsw    mm3, mm7          ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6          ; clip and word->bytes
    movd    [dstq], mm3         ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET
; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]     ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1          ; byte ABCD..
    punpcklbw mm1, mm3          ; byte->word ABCD
    pshufw    mm0, mm2, 0x9     ; byte CDEF..
    punpckhbw mm2, mm3          ; byte->word EFGH
    punpcklbw mm0, mm3          ; byte->word CDEF
    pshufw    mm1, mm1, 0x94    ; word ABBC
    pshufw    mm2, mm2, 0x94    ; word EFFG
    pmaddwd   mm1, mm4          ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94    ; word CDDE
    movq      mm0, mm3          ; backup for second set of pixels
    pmaddwd   mm3, mm5          ; multiply 2px with F2/F3
    paddd     mm1, mm3          ; add to 1st 2px cache
    movq      mm3, mm2          ; backup for second set of pixels
    pmaddwd   mm2, mm6          ; multiply 2px with F4/F5
    paddd     mm1, mm2          ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]     ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4          ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5          ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3          ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3          ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9    ; word GHHI
    pmaddwd   mm2, mm6          ; multiply 2px with F4/F5
    paddd     mm0, mm2          ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0          ; merge dword->word (4px)
    paddsw    mm1, mm7          ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3          ; clip and word->bytes
    movd    [dstq], mm1         ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0            ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0            ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub      srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add      srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq] ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET

; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub      srcq, srcstrideq
    sub      srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea      srcq, [srcq+srcstrideq*2]
    add      srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq] ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd               ; next row
    jg .nextrow
    REP_RET
%endmacro
INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
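; Bilinear MC: for a fractional offset f in 1..7 the output is
;   dst = (a * (8 - f) + b * f + 4) >> 3
; The code below evaluates the ">> 3 with rounding" as psraw by 2 followed by
; pavgw against a zero register, since ((x >> 2) + 1) >> 1 == (x + 4) >> 3 for
; non-negative x. bilinear_filter_vw_m stores the word weights f = 1..7 (the
; matching 8 - f weight is fetched by indexing the table backwards after
; "neg"), and bilinear_filter_vb_m stores the interleaved byte pairs
; (8 - f, f) used by the pmaddubsw versions further down.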
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16]
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea      dstq, [dstq+dststrideq*2]
    lea      srcq, [srcq+srcstrideq*2]
    sub   heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16]
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea      dstq, [dstq+dststrideq*2]
    lea      srcq, [srcq+srcstrideq*2]
    sub   heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea      dstq, [dstq+dststrideq*2]
    lea      srcq, [srcq+srcstrideq*2]
    sub   heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh   [dstq+dststrideq*0], m0
    movh   [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea      dstq, [dstq+dststrideq*2]
    lea      srcq, [srcq+srcstrideq*2]
    sub   heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq     mm0, [srcq+srcstrideq*0]
    movq     mm1, [srcq+srcstrideq*1]
    lea     srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq     mm0, [srcq+srcstrideq*0+0]
    movq     mm1, [srcq+srcstrideq*0+8]
    movq     mm2, [srcq+srcstrideq*1+0]
    movq     mm3, [srcq+srcstrideq*1+8]
    lea     srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups  xmm0, [srcq+srcstrideq*0]
    movups  xmm1, [srcq+srcstrideq*1]
    lea     srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea     dstq, [dstq+dststrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
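; The DC value is dc = (block[0] + 4) >> 3, broadcast to all 16 bytes of the
; 4x4 block. Since the pixels are unsigned bytes, the signed add is split in
; two: one register holds max(dc, 0) and the other max(-dc, 0) (both
; byte-saturated via packuswb), so "paddusb +dc; psubusb -dc" applies dc with
; unsigned saturation for either sign.
; ADD_DC arguments: %1 = +dc bytes, %2 = -dc bytes, %3 = column offset,
; %4 = load/store mov variant (movh or mova).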
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq]

    ; calculate DC
    paddw     m0, [pw_4]
    pxor      m1, m1
    psraw     m0, 3
    movd [blockq], m1
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklwd m0, m0
    punpcklwd m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, movh
    RET

INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq]
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    movd      m2, [dst1q]
    movd      m3, [dst1q+strideq]
    movd      m4, [dst2q]
    movd      m5, [dst2q+strideq]
    psraw     m0, 3
    pshuflw   m0, m0, 0
    punpcklqdq m0, m0
    punpckldq m2, m3
    punpckldq m4, m5
    punpcklbw m2, m1
    punpcklbw m4, m1
    paddw     m2, m0
    paddw     m4, m0
    packuswb  m2, m4
    movd   [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0            ; AABBCCDD
    punpcklbw m6, m6            ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    ADD_DC m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0            ; AABBCCDD
    punpcklbw m6, m6            ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6            ; 20091(1)
    pmulhw    %4, m6            ; 20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7            ; 35468(1)
    pmulhw    %2, m7            ; 35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro
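; Note on the constants: mul_20091(x) = x + ((x * 20091) >> 16), i.e. x scaled
; by (65536 + 20091) / 65536 ~= sqrt(2)*cos(pi/8). The second factor,
; 35468 / 65536 ~= sqrt(2)*sin(pi/8), does not fit in a signed word, so the
; macro doubles the input (paddw %1, %1) and multiplies by 17734 = 35468/2.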
; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA            w, %3, %1, %5   ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ; t2, t3
    SUMSUB_BA            w, %4, %3, %5   ; tmp0, tmp3
    SUMSUB_BA            w, %2, %1, %5   ; tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro
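; vp8_idct_add below runs the 1D transform twice (columns, then rows, with a
; transpose in between). The bias for the final >> 3 is folded in by adding
; pw_4 to the DC row between the two passes; the shift itself happens inside
; STORE_DIFFx2 (its sixth argument).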
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq      m0, [blockq+ 0]
    movq      m1, [blockq+ 8]
    movq      m2, [blockq+16]
    movq      m3, [blockq+24]
    movq      m6, [pw_20091]
    movq      m7, [pw_17734]
%if cpuflag(sse)
    xorps   xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor      m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4
    paddw     m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W            0, 1, 2, 3, 4

    ; store
    pxor      m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
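; The inverse WHT turns the 16 luma DC coefficients back into per-subblock DC
; values: two Hadamard passes with a transpose in between, then (x + 3) >> 3
; (the pw_3 bias is added to one row between the passes, like in the IDCT).
; SCATTER_WHT writes each result into word 0 of the corresponding 4x4 block,
; i.e. at byte offset 2*16*n for block n.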
%macro SCATTER_WHT 3
    movd     dc1d, m%1
    movd     dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr      dc1d, 16
    shr      dc2d, 16
    psrlq     m%1, 32
    psrlq     m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd     dc1d, m%1
    movd     dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr      dc1d, 16
    shr      dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq      m0, [dc1q]
    movq      m1, [dc1q+8]
    movq      m2, [dc1q+16]
    movq      m3, [dc1q+24]
%if cpuflag(sse)
    xorps   xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor      m4, m4
    movq [dc1q+ 0], m4
    movq [dc1q+ 8], m4
    movq [dc1q+16], m4
    movq [dc1q+24], m4
%endif
    HADAMARD4_1D  0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw     m0, [pw_3]
    HADAMARD4_1D  0, 1, 2, 3
    psraw     m0, 3
    psraw     m1, 3
    psraw     m2, 3
    psraw     m3, 3
    SCATTER_WHT   0, 1, 0
    SCATTER_WHT   2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
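; The simple loop filter touches only p1/p0/q0/q1 around the edge. A pixel
; column is filtered when abs(p0-q0)*2 + abs(p1-q1)/2 <= flim; then
; a = clamp((p1-q1) + 3*(q0-p0)), f1 = (a+4)>>3, f2 = (a+3)>>3, and the edge
; pixels become q0-f1 and p0+f2. All of this is done on packed unsigned
; bytes, hence the pb_80 sign-flips and the split of f1/f2 into separate
; positive and negative halves in the code below.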
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on OOE CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%3, [%8]         ; E0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%4, [%9+%11]     ; G0-3
    punpcklbw m%1, m%5          ; A/B interleaved
    movd      m%5, [%9+%11*2]   ; H0-3
    punpcklbw m%2, m%6          ; C/D interleaved
    punpcklbw m%3, m%7          ; E/F interleaved
    punpcklbw m%4, m%5          ; G/H interleaved
%endmacro
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea       %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%3, [%12+%10*4]  ; I0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%4, [%12+%10*2]  ; K0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%5, [%12+%10]    ; L0-3
    movd      m%7, [%12]        ; M0-3
    add       %12, %11
    punpcklbw m%1, m%3          ; A/I
    movd      m%3, [%8]         ; E0-3
    punpcklbw m%2, m%4          ; C/K
    punpcklbw m%6, m%5          ; D/L
    punpcklbw m%3, m%7          ; E/M
    punpcklbw m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%4, [%12+%10*4]  ; J0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%6, [%12]        ; N0-3
    punpcklbw m%5, m%4          ; B/J
    punpcklbw m%7, m%6          ; F/N
    punpcklbw m%1, m%5          ; A/B/I/J interleaved
    punpcklbw m%3, m%7          ; E/F/M/N interleaved
    movd      m%4, [%9+%11]     ; G0-3
    movd      m%6, [%12+%11]    ; O0-3
    movd      m%5, [%9+%11*2]   ; H0-3
    movd      m%7, [%12+%11*2]  ; P0-3
    punpcklbw m%4, m%6          ; G/O
    punpcklbw m%5, m%7          ; H/P
    punpcklbw m%4, m%5          ; G/H/O/P interleaved
%endmacro
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd [%5+%7*4], m%1
    movd [%5+%7*2], m%2
    movd      [%5], m%3
    movd   [%6+%8], m%4
    punpckhdq  m%1, m%1
    punpckhdq  m%2, m%2
    punpckhdq  m%3, m%3
    punpckhdq  m%4, m%4
    movd [%6+%7*4], m%1
    movd   [%5+%7], m%2
    movd      [%6], m%3
    movd [%6+%8*2], m%4
%endmacro
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd [%5+%8*4], m%1
    movd      [%5], m%2
    movd [%7+%8*4], m%3
    movd      [%7], m%4

    ; store dwords 1
    psrldq     m%1, 4
    psrldq     m%2, 4
    psrldq     m%3, 4
    psrldq     m%4, 4
    movd [%6+%8*4], m%1
    movd      [%6], m%2
%if %10 == 16
    movd [%6+%9*4], m%3
%endif
    movd   [%7+%9], m%4

    ; write dwords 2
    psrldq     m%1, 4
    psrldq     m%2, 4
%if %10 == 8
    movd [%5+%8*2], m%1
    movd       %5d, m%3
%endif
    psrldq     m%3, 4
    psrldq     m%4, 4
%if %10 == 16
    movd [%5+%8*2], m%1
%endif
    movd   [%6+%9], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
    add         %7, %9

    ; store dwords 3
    psrldq     m%1, 4
    psrldq     m%2, 4
    psrldq     m%3, 4
    psrldq     m%4, 4
%if %10 == 8
    mov  [%7+%8*4], %5d
    movd [%6+%8*2], m%1
%else
    movd   [%5+%8], m%1
%endif
    movd [%6+%9*2], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
%endmacro

; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd       %3d, %1
    punpckhdq   %1, %1
    mov  [%4+%5*4], %3w
    shr         %3, 16
    add         %4, %6
    mov  [%4+%5*4], %3w
    movd       %3d, %1
    add         %4, %5
    mov  [%4+%5*2], %3w
    shr         %3, 16
    mov  [%4+%5  ], %3w
    movd       %3d, %2
    punpckhdq   %2, %2
    mov  [%4     ], %3w
    shr         %3, 16
    mov  [%4+%6  ], %3w
    movd       %3d, %2
    add         %4, %6
    mov  [%4+%6  ], %3w
    shr         %3, 16
    mov  [%4+%6*2], %3w
    add         %4, %5
%endmacro

%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw [%3+%4  ], %1, 3
    pextrw [%3     ], %1, 4
    pextrw [%2     ], %1, 5
    pextrw [%2+%5  ], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    movd       %2d, %1
    psrldq      %1, 4
    mov  [%3+%4*4], %2w
    shr         %2, 16
    add         %3, %5
    mov  [%3+%4*4], %2w
    movd       %2d, %1
    psrldq      %1, 4
    add         %3, %4
    mov  [%3+%4*2], %2w
    shr         %2, 16
    mov  [%3+%4  ], %2w
    movd       %2d, %1
    psrldq      %1, 4
    mov  [%3     ], %2w
    shr         %2, 16
    mov  [%3+%5  ], %2w
    movd       %2d, %1
    add         %3, %5
    mov  [%3+%5  ], %2w
    shr         %2, 16
    mov  [%3+%5*2], %2w
%endif
%endmacro
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov     cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor       m0, m0
%endif
    SPLATB_REG m7, flim, m0     ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov   strideq, mstrideq
    neg  mstrideq
%ifidn %1, h
    lea     dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova       m0, [dst1q+mstrideq*2] ; p1
    mova       m1, [dst1q+mstrideq]   ; p0
    mova       m2, [dst1q]            ; q0
    mova       m3, [dst1q+ strideq]   ; q1
%else ; h
    lea     dst2q, [dst1q+ strideq]
%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W         0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova       m5, m2           ; m5=backup of q0
    mova       m6, m1           ; m6=backup of p0
    psubusb    m1, m2           ; p0-q0
    psubusb    m2, m6           ; q0-p0
    por        m1, m2           ; FFABS(p0-q0)
    paddusb    m1, m1           ; m1=FFABS(p0-q0)*2

    mova       m4, m3
    mova       m2, m0
    psubusb    m3, m0           ; q1-p1
    psubusb    m0, m4           ; p1-q1
    por        m3, m0           ; FFABS(p1-q1)
    mova       m0, [pb_80]
    pxor       m2, m0
    pxor       m4, m0
    psubsb     m2, m4           ; m2=p1-q1 (signed) backup for below
    pand       m3, [pb_FE]
    psrlq      m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb    m3, m1
    psubusb    m3, m7
    pxor       m1, m1
    pcmpeqb    m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova       m4, m5
    pxor       m5, m0
    pxor       m0, m6
    psubsb     m5, m0           ; q0-p0 (signed)
    paddsb     m2, m5
    paddsb     m2, m5
    paddsb     m2, m5           ; a=(p1-q1) + 3*(q0-p0)
    pand       m2, m3           ; apply filter mask (m3)

    mova       m3, [pb_F8]
    mova       m1, m2
    paddsb     m2, [pb_4]       ; f1<<3=a+4
    paddsb     m1, [pb_3]       ; f2<<3=a+3
    pand       m2, m3
    pand       m1, m3           ; cache f2<<3

    pxor       m0, m0
    pxor       m3, m3
    pcmpgtb    m0, m2           ; which values are <0?
    psubb      m3, m2           ; -f1<<3
    psrlq      m2, 3            ; +f1
    psrlq      m3, 3            ; -f1
    pand       m3, m0
    pandn      m0, m2
    psubusb    m4, m0
    paddusb    m4, m3           ; q0-f1

    pxor       m0, m0
    pxor       m3, m3
    pcmpgtb    m0, m1           ; which values are <0?
    psubb      m3, m1           ; -f2<<3
    psrlq      m1, 3            ; +f2
    psrlq      m3, 3            ; -f2
    pand       m3, m0
    pandn      m0, m1
    paddusb    m6, m0
    psubusb    m6, m3           ; p0+f2

    ; store
%ifidn %1, v
    mova   [dst1q], m4
    mova   [dst1q+mstrideq], m6
%else ; h
    inc     dst1q
    SBUTTERFLY bw, 6, 4, 0

%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc     dst2q
%endif
    WRITE_8W   m6, dst2q, dst1q, mstrideq, strideq
    lea     dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc     dst3q
%endif
    WRITE_8W   m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add     dst1q, 8            ; advance 8 cols = pixels
%else ; h
    lea     dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
    dec     cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif
INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
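; The inner filter works on p3..q3 around the edge. The filter mask requires
; every neighbouring difference (p3-p2, p2-p1, p1-p0, q1-q0, q2-q1, q3-q2) to
; be <= flimI and the simple-filter edge test against flimE to pass; the
; high-edge-variance (hev) mask additionally compares abs(p1-p0)/abs(q1-q0)
; against hev_thr to decide whether p1/q1 are adjusted as well.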
%macro INNER_LOOPFILTER 2
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor       m7, m7
%endif

%ifndef m8   ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ;               [3]=hev() result
%assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
%else ; h    ; extra storage space for transposes
%assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
%endif

    ; splat function arguments
    SPLATB_REG m0, flimEq, m7   ; E
    SPLATB_REG m1, flimIq, m7   ; I
    SPLATB_REG m2, hevthrq, m7  ; hev_thresh

    SUB       rsp, pad

%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]

    mova  m_flimE, m0
    mova  m_flimI, m1
    mova m_hevthr, m2
%else
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_p0backup m12
%define m_q0backup m8

    ; splat function arguments
    SPLATB_REG m_flimE, flimEq, m7   ; E
    SPLATB_REG m_flimI, flimIq, m7   ; I
    SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif

%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov     cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov   strideq, mstrideq
    neg  mstrideq
%ifidn %1, h
    lea     dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea     dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea     dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow     m0, [dst1q+mstrideq*4] ; p3
    movrow     m1, [dst2q+mstrideq*4] ; p2
    movrow     m2, [dst1q+mstrideq*2] ; p1
    movrow     m5, [dst2q]            ; q1
    movrow     m6, [dst2q+ strideq*1] ; q2
    movrow     m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps     m0, [dst8q+mstrideq*4]
    movhps     m2, [dst8q+mstrideq*2]
    add     dst8q, strideq
    movhps     m1, [dst8q+mstrideq*4]
    movhps     m5, [dst8q]
    movhps     m6, [dst8q+ strideq  ]
    movhps     m7, [dst8q+ strideq*2]
    add     dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu       m0, [dst1q+mstrideq*4]
    movu       m1, [dst2q+mstrideq*4]
    movu       m2, [dst1q+mstrideq*2]
    movu       m3, [dst1q+mstrideq  ]
    movu       m4, [dst1q]
    movu       m5, [dst2q]
    movu       m6, [dst2q+ strideq  ]

    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova m_q0backup, m1
    movu       m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1      ; p3/p2
    SBUTTERFLY dq, 2, 6, 1      ; q0/q1
    SBUTTERFLY dq, 3, 7, 1      ; q2/q3
    mova       m1, m_q0backup
    mova m_q0backup, m2         ; store q0
    SBUTTERFLY dq, 1, 5, 2      ; p1/p0
    mova m_p0backup, m5         ; store p0
    SWAP        1, 4
    SWAP        2, 4
    SWAP        6, 3
    SWAP        5, 3
%else ; sse2 (h)
%if %2 == 16
    lea     dst8q, [dst1q+ strideq*8]
%endif

    ; read 16 rows of 8px each, interleave
    movh       m0, [dst1q+mstrideq*4]
    movh       m1, [dst8q+mstrideq*4]
    movh       m2, [dst1q+mstrideq*2]
    movh       m5, [dst8q+mstrideq*2]
    movh       m3, [dst1q+mstrideq  ]
    movh       m6, [dst8q+mstrideq  ]
    movh       m4, [dst1q]
    movh       m7, [dst8q]
    punpcklbw  m0, m1           ; A/I
    punpcklbw  m2, m5           ; C/K
    punpcklbw  m3, m6           ; D/L
    punpcklbw  m4, m7           ; E/M

    add     dst8q, strideq
    movh       m1, [dst2q+mstrideq*4]
    movh       m6, [dst8q+mstrideq*4]
    movh       m5, [dst2q]
    movh       m7, [dst8q]
    punpcklbw  m1, m6           ; B/J
    punpcklbw  m5, m7           ; F/N
    movh       m6, [dst2q+ strideq  ]
    movh       m7, [dst8q+ strideq  ]
    punpcklbw  m6, m7           ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP        1, 8
%else
    mova m_q0backup, m1
%endif
    movh       m7, [dst2q+ strideq*2]
    movh       m1, [dst8q+ strideq*2]
    punpcklbw  m7, m1           ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY dq, 0, 4, 1      ; p3/p2
    SBUTTERFLY dq, 2, 6, 1      ; q0/q1
    SBUTTERFLY dq, 3, 7, 1      ; q2/q3
%ifdef m8
    SWAP        1, 8
    SWAP        2, 8
%else
    mova       m1, m_q0backup
    mova m_q0backup, m2         ; store q0
%endif
    SBUTTERFLY dq, 1, 5, 2      ; p1/p0
%ifdef m12
    SWAP        5, 12
%else
    mova m_p0backup, m5         ; store p0
%endif
    SWAP        1, 4
    SWAP        2, 4
    SWAP        6, 3
    SWAP        5, 3
%endif
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova       m4, m1
    SWAP        4, 1
    psubusb    m4, m0           ; p2-p3
    psubusb    m0, m1           ; p3-p2
    por        m0, m4           ; abs(p3-p2)

    mova       m4, m2
    SWAP        4, 2
    psubusb    m4, m1           ; p1-p2
    psubusb    m1, m2           ; p2-p1
    por        m1, m4           ; abs(p2-p1)

    mova       m4, m6
    SWAP        4, 6
    psubusb    m4, m7           ; q2-q3
    psubusb    m7, m6           ; q3-q2
    por        m7, m4           ; abs(q3-q2)

    mova       m4, m5
    SWAP        4, 5
    psubusb    m4, m6           ; q1-q2
    psubusb    m6, m5           ; q2-q1
    por        m6, m4           ; abs(q2-q1)

%if notcpuflag(mmxext)
    mova       m4, m_flimI
    pxor       m3, m3
    psubusb    m0, m4
    psubusb    m1, m4
    psubusb    m7, m4
    psubusb    m6, m4
    pcmpeqb    m0, m3           ; abs(p3-p2) <= I
    pcmpeqb    m1, m3           ; abs(p2-p1) <= I
    pcmpeqb    m7, m3           ; abs(q3-q2) <= I
    pcmpeqb    m6, m3           ; abs(q2-q1) <= I
    pand       m0, m1
    pand       m7, m6
    pand       m0, m7
%else ; mmxext/sse2
    pmaxub     m0, m1
    pmaxub     m6, m7
    pmaxub     m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP        7, 3            ; now m7 is zero
%ifidn %1, v
    movrow     m3, [dst1q+mstrideq  ] ; p0
%if mmsize == 16 && %2 == 8
    movhps     m3, [dst8q+mstrideq  ]
%endif
%elifdef m12
    SWAP        3, 12
%else
    mova       m3, m_p0backup
%endif

    mova       m1, m2
    SWAP        1, 2
    mova       m6, m3
    SWAP        3, 6
    psubusb    m1, m3           ; p1-p0
    psubusb    m6, m2           ; p0-p1
    por        m1, m6           ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova       m6, m1
    psubusb    m1, m4
    psubusb    m6, m_hevthr
    pcmpeqb    m1, m7           ; abs(p1-p0) <= I
    pcmpeqb    m6, m7           ; abs(p1-p0) <= hev_thresh
    pand       m0, m1
    mova m_maskres, m6
%else ; mmxext/sse2
    pmaxub     m0, m1           ; max_I
    SWAP        1, 4            ; max_hev_thresh
%endif

    SWAP        6, 4            ; now m6 is I
%ifidn %1, v
    movrow     m4, [dst1q]      ; q0
%if mmsize == 16 && %2 == 8
    movhps     m4, [dst8q]
%endif
%elifdef m8
    SWAP        4, 8
%else
    mova       m4, m_q0backup
%endif
    mova       m1, m4
    SWAP        1, 4
    mova       m7, m5
    SWAP        7, 5
    psubusb    m1, m5           ; q0-q1
    psubusb    m7, m4           ; q1-q0
    por        m1, m7           ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova       m7, m1
    psubusb    m1, m6
    psubusb    m7, m_hevthr
    pxor       m6, m6
    pcmpeqb    m1, m6           ; abs(q1-q0) <= I
    pcmpeqb    m7, m6           ; abs(q1-q0) <= hev_thresh
    mova       m6, m_maskres
    pand       m0, m1           ; abs([pq][321]-[pq][210]) <= I
    pand       m6, m7
%else ; mmxext/sse2
    pxor       m7, m7
    pmaxub     m0, m1
    pmaxub     m6, m1
    psubusb    m0, m_flimI
    psubusb    m6, m_hevthr
    pcmpeqb    m0, m7           ; max(abs(..)) <= I
    pcmpeqb    m6, m7           ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP        6, 12
%else
    mova m_maskres, m6          ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova       m1, m3
    SWAP        1, 3
    mova       m6, m4           ; keep copies of p0/q0 around for later use
    SWAP        6, 4
    psubusb    m1, m4           ; p0-q0
    psubusb    m6, m3           ; q0-p0
    por        m1, m6           ; abs(q0-p0)
    paddusb    m1, m1           ; m1=2*abs(q0-p0)

    mova       m7, m2
    SWAP        7, 2
    mova       m6, m5
    SWAP        6, 5
    psubusb    m7, m5           ; p1-q1
    psubusb    m6, m2           ; q1-p1
    por        m7, m6           ; abs(q1-p1)
    pxor       m6, m6
    pand       m7, [pb_FE]
    psrlq      m7, 1            ; abs(q1-p1)/2
    paddusb    m7, m1           ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb    m7, m_flimE
    pcmpeqb    m7, m6           ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand       m0, m7           ; normal_limit result
  1794. ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova         m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova         m1, m4
    mova         m7, m3
    pxor         m1, m_pb_80
    pxor         m7, m_pb_80
    psubsb       m1, m7              ; (signed) q0-p0
    mova         m6, m2
    mova         m7, m5
    pxor         m6, m_pb_80
    pxor         m7, m_pb_80
    psubsb       m6, m7              ; (signed) p1-q1
    mova         m7, m_maskres
    pandn        m7, m6
    paddsb       m7, m1
    paddsb       m7, m1
    paddsb       m7, m1              ; 3*(q0-p0)+is4tap?(p1-q1)
    pand         m7, m0
    mova         m1, [pb_F8]
    mova         m6, m7
    paddsb       m7, [pb_3]
    paddsb       m6, [pb_4]
    pand         m7, m1
    pand         m6, m1
    pxor         m1, m1
    pxor         m0, m0
    pcmpgtb      m1, m7
    psubb        m0, m7
    psrlq        m7, 3               ; +f2
    psrlq        m0, 3               ; -f2
    pand         m0, m1
    pandn        m1, m7
    psubusb      m3, m0
    paddusb      m3, m1              ; p0+f2
    pxor         m1, m1
    pxor         m0, m0
    pcmpgtb      m0, m6
    psubb        m1, m6
    psrlq        m6, 3               ; +f1
    psrlq        m1, 3               ; -f1
    pand         m1, m0
    pandn        m0, m6
    psubusb      m4, m0
    paddusb      m4, m1              ; q0-f1
%ifdef m12
    SWAP          6, 12
%else
    mova         m6, m_maskres
%endif
%if notcpuflag(mmxext)
    mova         m7, [pb_1]
%else ; mmxext/sse2
    pxor         m7, m7
%endif
    pand         m0, m6
    pand         m1, m6
%if notcpuflag(mmxext)
    paddusb      m0, m7
    pand         m1, [pb_FE]
    pandn        m7, m0
    psrlq        m1, 1
    psrlq        m7, 1
    SWAP          0, 7
%else ; mmxext/sse2
    psubusb      m1, [pb_1]
    pavgb        m0, m7              ; a
    pavgb        m1, m7              ; -a
%endif
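    ; outer-tap rounding a = (f1+1)>>1: the mmx path adds pb_1 and halves
    ; the separated +/- magnitudes with psrlq (pb_FE stops the shift leaking
    ; across bytes); mmxext gets it in one op, since pavgb against a zeroed
    ; register computes exactly (x+1)>>1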
    psubusb      m5, m0
    psubusb      m2, m1
    paddusb      m5, m1              ; q1-a
    paddusb      m2, m0              ; p1+a

    ; store
%ifidn %1, v
    movrow       [dst1q+mstrideq*2], m2
    movrow       [dst1q+mstrideq ], m3
    movrow       [dst1q], m4
    movrow       [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
    movhps       [dst8q+mstrideq*2], m2
    movhps       [dst8q+mstrideq ], m3
    movhps       [dst8q], m4
    movhps       [dst8q+ strideq ], m5
%endif
%else ; h
    add          dst1q, 2
    add          dst2q, 2

    ; 4x8/16 transpose
    TRANSPOSE4x4B 2, 3, 4, 5, 6
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D    2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
    lea          dst8q, [dst8q+mstrideq+2]
    WRITE_4x4D    2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub          dst1q, 2
%endif
    cmp          dst1q, dst8q
    mov          dst1q, dst8q
    jnz          .next8px
%else
%ifidn %1, h
    lea          dst1q, [dst1q+ strideq*8-2]
%else ; v
    add          dst1q, 8
%endif
    dec          cntrq
    jg           .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    ADD          rsp, pad
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8

INIT_MMX mmxext
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8
%endif

INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8

INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v,  8
INNER_LOOPFILTER h,  8

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                             int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
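; the mbedge filter differs from the inner filter above in that it also
; adjusts the second pixel away from the edge: p2/q2 are backed up, filtered
; with their own tap weight and stored, so six rows/columns are written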
%macro MBEDGE_LOOPFILTER 2
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
%endif

%if cpuflag(ssse3)
    pxor         m7, m7
%endif

%ifndef m8       ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ;               [3]=hev() result
                 ;               [4]=filter tmp result
                 ;               [5]/[6] = p2/q2 backup
                 ;               [7]=lim_res sign result
%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
%else ; 8        ; extra storage space for transposes
%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
%endif
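; pad reserves the scratch slots listed above (one more in the mmx version
; for transposes) plus enough slack to keep the stack pointer 16-byte
; aligned, accounting for the return address (gprsize) and stack_offset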
    ; splat function arguments
    SPLATB_REG   m0, flimEq, m7      ; E
    SPLATB_REG   m1, flimIq, m7      ; I
    SPLATB_REG   m2, hevthrq, m7     ; hev_thresh

    SUB          rsp, pad

%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_limres   [rsp+mmsize*4]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
%define m_p2backup [rsp+mmsize*5]
%define m_q2backup [rsp+mmsize*6]
%if mmsize == 16
%define m_limsign  [rsp]
%else
%define m_limsign  [rsp+mmsize*7]
%endif

    mova         m_flimE, m0
    mova         m_flimI, m1
    mova         m_hevthr, m2
%else ; sse2 on x86-64
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_limres   m8
%define m_p0backup m12
%define m_q0backup m8
%define m_p2backup m13
%define m_q2backup m14
%define m_limsign  m9

    ; splat function arguments
    SPLATB_REG   m_flimE, flimEq, m7   ; E
    SPLATB_REG   m_flimI, flimIq, m7   ; I
    SPLATB_REG   m_hevthr, hevthrq, m7 ; hev_thresh
%endif

%if %2 == 8 ; chroma
DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov          cntrq, 2
%else
DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
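    ; with 8-byte mmx registers only 8 pixels are filtered per pass: luma
    ; loops .next8px twice via cntrq, while chroma reruns it with dst1
    ; switched to the second plane pointer (dst8)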
    mov          strideq, mstrideq
    neg          mstrideq
%ifidn %1, h
    lea          dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea          dst8q, [dst8q+strideq*4-4]
%endif
%endif

%if mmsize == 8
.next8px:
%endif
    ; read
    lea          dst2q, [dst1q+ strideq ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow       m0, [dst1q+mstrideq*4] ; p3
    movrow       m1, [dst2q+mstrideq*4] ; p2
    movrow       m2, [dst1q+mstrideq*2] ; p1
    movrow       m5, [dst2q]            ; q1
    movrow       m6, [dst2q+ strideq ]  ; q2
    movrow       m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
    movhps       m0, [dst8q+mstrideq*4]
    movhps       m2, [dst8q+mstrideq*2]
    add          dst8q, strideq
    movhps       m1, [dst8q+mstrideq*4]
    movhps       m5, [dst8q]
    movhps       m6, [dst8q+ strideq ]
    movhps       m7, [dst8q+ strideq*2]
    add          dst8q, mstrideq
%endif
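    ; sse2 filters both chroma planes at once: movrow (movh here) loads the
    ; U row into the low half of each register and movhps pairs it with the
    ; matching V row in the high half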
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu         m0, [dst1q+mstrideq*4]
    movu         m1, [dst2q+mstrideq*4]
    movu         m2, [dst1q+mstrideq*2]
    movu         m3, [dst1q+mstrideq ]
    movu         m4, [dst1q]
    movu         m5, [dst2q]
    movu         m6, [dst2q+ strideq ]

    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova         m_q0backup, m1
    movu         m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY   dq, 0, 4, 1         ; p3/p2
    SBUTTERFLY   dq, 2, 6, 1         ; q0/q1
    SBUTTERFLY   dq, 3, 7, 1         ; q2/q3
    mova         m1, m_q0backup
    mova         m_q0backup, m2      ; store q0
    SBUTTERFLY   dq, 1, 5, 2         ; p1/p0
    mova         m_p0backup, m5      ; store p0
    SWAP          1, 4
    SWAP          2, 4
    SWAP          6, 3
    SWAP          5, 3
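    ; the SBUTTERFLYs leave the transposed rows in scrambled registers; the
    ; SWAP renaming above restores the vertical-path layout (m0=p3, m1=p2,
    ; m2=p1, m5=q1, m6=q2, m7=q3) with p0/q0 parked in their stack backups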
%else ; sse2 (h)
%if %2 == 16
    lea          dst8q, [dst1q+ strideq*8 ]
%endif

    ; read 16 rows of 8px each, interleave
    movh         m0, [dst1q+mstrideq*4]
    movh         m1, [dst8q+mstrideq*4]
    movh         m2, [dst1q+mstrideq*2]
    movh         m5, [dst8q+mstrideq*2]
    movh         m3, [dst1q+mstrideq ]
    movh         m6, [dst8q+mstrideq ]
    movh         m4, [dst1q]
    movh         m7, [dst8q]
    punpcklbw    m0, m1              ; A/I
    punpcklbw    m2, m5              ; C/K
    punpcklbw    m3, m6              ; D/L
    punpcklbw    m4, m7              ; E/M
    add          dst8q, strideq
    movh         m1, [dst2q+mstrideq*4]
    movh         m6, [dst8q+mstrideq*4]
    movh         m5, [dst2q]
    movh         m7, [dst8q]
    punpcklbw    m1, m6              ; B/J
    punpcklbw    m5, m7              ; F/N
    movh         m6, [dst2q+ strideq ]
    movh         m7, [dst8q+ strideq ]
    punpcklbw    m6, m7              ; G/O

    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP          1, 8
%else
    mova         m_q0backup, m1
%endif
    movh         m7, [dst2q+ strideq*2]
    movh         m1, [dst8q+ strideq*2]
    punpcklbw    m7, m1              ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY   dq, 0, 4, 1         ; p3/p2
    SBUTTERFLY   dq, 2, 6, 1         ; q0/q1
    SBUTTERFLY   dq, 3, 7, 1         ; q2/q3
%ifdef m8
    SWAP          1, 8
    SWAP          2, 8
%else
    mova         m1, m_q0backup
    mova         m_q0backup, m2      ; store q0
%endif
    SBUTTERFLY   dq, 1, 5, 2         ; p1/p0
%ifdef m12
    SWAP          5, 12
%else
    mova         m_p0backup, m5      ; store p0
%endif
    SWAP          1, 4
    SWAP          2, 4
    SWAP          6, 3
    SWAP          5, 3
%endif
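    ; rows A-H come from dst1/dst2 and rows I-P from the second 8-row block
    ; at dst8; pairing matching rows byte-wise before the 4x4 transposes
    ; handles the 16-row block with the same register budget as the mmx path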
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova         m4, m1
    SWAP          4, 1
    psubusb      m4, m0              ; p2-p3
    psubusb      m0, m1              ; p3-p2
    por          m0, m4              ; abs(p3-p2)

    mova         m4, m2
    SWAP          4, 2
    psubusb      m4, m1              ; p1-p2
    mova         m_p2backup, m1
    psubusb      m1, m2              ; p2-p1
    por          m1, m4              ; abs(p2-p1)

    mova         m4, m6
    SWAP          4, 6
    psubusb      m4, m7              ; q2-q3
    psubusb      m7, m6              ; q3-q2
    por          m7, m4              ; abs(q3-q2)

    mova         m4, m5
    SWAP          4, 5
    psubusb      m4, m6              ; q1-q2
    mova         m_q2backup, m6
    psubusb      m6, m5              ; q2-q1
    por          m6, m4              ; abs(q2-q1)

%if notcpuflag(mmxext)
    mova         m4, m_flimI
    pxor         m3, m3
    psubusb      m0, m4
    psubusb      m1, m4
    psubusb      m7, m4
    psubusb      m6, m4
    pcmpeqb      m0, m3              ; abs(p3-p2) <= I
    pcmpeqb      m1, m3              ; abs(p2-p1) <= I
    pcmpeqb      m7, m3              ; abs(q3-q2) <= I
    pcmpeqb      m6, m3              ; abs(q2-q1) <= I
    pand         m0, m1
    pand         m7, m6
    pand         m0, m7
%else ; mmxext/sse2
    pmaxub       m0, m1
    pmaxub       m6, m7
    pmaxub       m0, m6
%endif

    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP          7, 3               ; now m7 is zero
%ifidn %1, v
    movrow       m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
    movhps       m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP          3, 12
%else
    mova         m3, m_p0backup
%endif

    mova         m1, m2
    SWAP          1, 2
    mova         m6, m3
    SWAP          3, 6
    psubusb      m1, m3              ; p1-p0
    psubusb      m6, m2              ; p0-p1
    por          m1, m6              ; abs(p1-p0)
%if notcpuflag(mmxext)
    mova         m6, m1
    psubusb      m1, m4
    psubusb      m6, m_hevthr
    pcmpeqb      m1, m7              ; abs(p1-p0) <= I
    pcmpeqb      m6, m7              ; abs(p1-p0) <= hev_thresh
    pand         m0, m1
    mova         m_maskres, m6
%else ; mmxext/sse2
    pmaxub       m0, m1              ; max_I
    SWAP          1, 4               ; max_hev_thresh
%endif

    SWAP          6, 4               ; now m6 is I
%ifidn %1, v
    movrow       m4, [dst1q]         ; q0
%if mmsize == 16 && %2 == 8
    movhps       m4, [dst8q]
%endif
%elifdef m8
    SWAP          4, 8
%else
    mova         m4, m_q0backup
%endif
    mova         m1, m4
    SWAP          1, 4
    mova         m7, m5
    SWAP          7, 5
    psubusb      m1, m5              ; q0-q1
    psubusb      m7, m4              ; q1-q0
    por          m1, m7              ; abs(q1-q0)
%if notcpuflag(mmxext)
    mova         m7, m1
    psubusb      m1, m6
    psubusb      m7, m_hevthr
    pxor         m6, m6
    pcmpeqb      m1, m6              ; abs(q1-q0) <= I
    pcmpeqb      m7, m6              ; abs(q1-q0) <= hev_thresh
    mova         m6, m_maskres
    pand         m0, m1              ; abs([pq][321]-[pq][210]) <= I
    pand         m6, m7
%else ; mmxext/sse2
    pxor         m7, m7
    pmaxub       m0, m1
    pmaxub       m6, m1
    psubusb      m0, m_flimI
    psubusb      m6, m_hevthr
    pcmpeqb      m0, m7              ; max(abs(..)) <= I
    pcmpeqb      m6, m7              ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP          6, 12
%else
    mova         m_maskres, m6       ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif

    ; simple_limit
    mova         m1, m3
    SWAP          1, 3
    mova         m6, m4              ; keep copies of p0/q0 around for later use
    SWAP          6, 4
    psubusb      m1, m4              ; p0-q0
    psubusb      m6, m3              ; q0-p0
    por          m1, m6              ; abs(q0-p0)
    paddusb      m1, m1              ; m1=2*abs(q0-p0)

    mova         m7, m2
    SWAP          7, 2
    mova         m6, m5
    SWAP          6, 5
    psubusb      m7, m5              ; p1-q1
    psubusb      m6, m2              ; q1-p1
    por          m7, m6              ; abs(q1-p1)
    pxor         m6, m6
    pand         m7, [pb_FE]
    psrlq        m7, 1               ; abs(q1-p1)/2
    paddusb      m7, m1              ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb      m7, m_flimE
    pcmpeqb      m7, m6              ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand         m0, m7              ; normal_limit result

    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova         m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova         m1, m4
    mova         m7, m3
    pxor         m1, m_pb_80
    pxor         m7, m_pb_80
    psubsb       m1, m7              ; (signed) q0-p0
    mova         m6, m2
    mova         m7, m5
    pxor         m6, m_pb_80
    pxor         m7, m_pb_80
    psubsb       m6, m7              ; (signed) p1-q1
    mova         m7, m_maskres
    paddsb       m6, m1
    paddsb       m6, m1
    paddsb       m6, m1
    pand         m6, m0
%ifdef m8
    mova         m_limres, m6        ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand         m_limres, m7
%else
    mova         m0, m6
    pand         m0, m7
    mova         m_limres, m0
%endif
    pandn        m7, m6              ; 3*(q0-p0)+(p1-q1) masked for filter_common
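    ; m_maskres holds the !hev mask, so the masked difference w is split
    ; here: w & !hev is kept in m_limres for the wide taps below, while
    ; w & hev takes the same f1/f2 step as filter_common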
    mova         m1, [pb_F8]
    mova         m6, m7
    paddsb       m7, [pb_3]
    paddsb       m6, [pb_4]
    pand         m7, m1
    pand         m6, m1
    pxor         m1, m1
    pxor         m0, m0
    pcmpgtb      m1, m7
    psubb        m0, m7
    psrlq        m7, 3               ; +f2
    psrlq        m0, 3               ; -f2
    pand         m0, m1
    pandn        m1, m7
    psubusb      m3, m0
    paddusb      m3, m1              ; p0+f2
    pxor         m1, m1
    pxor         m0, m0
    pcmpgtb      m0, m6
    psubb        m1, m6
    psrlq        m6, 3               ; +f1
    psrlq        m1, 3               ; -f1
    pand         m1, m0
    pandn        m0, m6
    psubusb      m4, m0
    paddusb      m4, m1              ; q0-f1

    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
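    ; roughly (RFC 6386), applied where !hev, with the words widened to
    ; 16 bits for the multiply and packed back with signed saturation:
    ;   w  = clamp(clamp(p1-q1) + 3*(q0-p0)) & filter_mask & !hev
    ;   a0 = (27*w + 63) >> 7;  p0 += a0;  q0 -= a0
    ;   a1 = (18*w + 63) >> 7;  p1 += a1;  q1 -= a1
    ;   a2 = ( 9*w + 63) >> 7;  p2 += a2;  q2 -= a2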
%if cpuflag(ssse3)
    mova         m7, [pb_1]
%else
    mova         m7, [pw_63]
%endif
%ifdef m8
    SWAP          1, 8
%else
    mova         m1, m_limres
%endif
    pxor         m0, m0
    mova         m6, m1
    pcmpgtb      m0, m1              ; which are negative
%if cpuflag(ssse3)
    punpcklbw    m6, m7              ; interleave with "1" for rounding
    punpckhbw    m1, m7
%else
    punpcklbw    m6, m0              ; signed byte->word
    punpckhbw    m1, m0
%endif
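    ; ssse3 computes weight*w + 63 in one pmaddubsw per half: w interleaved
    ; with the constant 1 is multiplied pairwise against [weight, 63] byte
    ; pairs; older cpus sign-extend w to words and use pmullw + paddw pw_63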
    mova         m_limsign, m0
%if cpuflag(ssse3)
    mova         m7, [pb_27_63]
%ifndef m8
    mova         m_limres, m1
%endif
%ifdef m10
    SWAP          0, 10              ; don't lose lim_sign copy
%endif
    mova         m0, m7
    pmaddubsw    m7, m6
    SWAP          6, 7
    pmaddubsw    m0, m1
    SWAP          1, 0
%ifdef m10
    SWAP          0, 10
%else
    mova         m0, m_limsign
%endif
%else
    mova         m_maskres, m6       ; backup for later in filter
    mova         m_limres, m1
    pmullw       m6, [pw_27]
    pmullw       m1, [pw_27]
    paddw        m6, m7
    paddw        m1, m7
%endif
    psraw        m6, 7
    psraw        m1, 7
    packsswb     m6, m1              ; a0
    pxor         m1, m1
    psubb        m1, m6
    pand         m1, m0              ; -a0
    pandn        m0, m6              ; +a0
%if cpuflag(ssse3)
    mova         m6, [pb_18_63]      ; pipelining
%endif
    psubusb      m3, m1
    paddusb      m4, m1
    paddusb      m3, m0              ; p0+a0
    psubusb      m4, m0              ; q0-a0
%if cpuflag(ssse3)
    SWAP          6, 7
%ifdef m10
    SWAP          1, 10
%else
    mova         m1, m_limres
%endif
    mova         m0, m7
    pmaddubsw    m7, m6
    SWAP          6, 7
    pmaddubsw    m0, m1
    SWAP          1, 0
%ifdef m10
    SWAP          0, 10
%endif
    mova         m0, m_limsign
%else
    mova         m6, m_maskres
    mova         m1, m_limres
    pmullw       m6, [pw_18]
    pmullw       m1, [pw_18]
    paddw        m6, m7
    paddw        m1, m7
%endif
    mova         m0, m_limsign
    psraw        m6, 7
    psraw        m1, 7
    packsswb     m6, m1              ; a1
    pxor         m1, m1
    psubb        m1, m6
    pand         m1, m0              ; -a1
    pandn        m0, m6              ; +a1
%if cpuflag(ssse3)
    mova         m6, [pb_9_63]
%endif
    psubusb      m2, m1
    paddusb      m5, m1
    paddusb      m2, m0              ; p1+a1
    psubusb      m5, m0              ; q1-a1
%if cpuflag(ssse3)
    SWAP          6, 7
%ifdef m10
    SWAP          1, 10
%else
    mova         m1, m_limres
%endif
    mova         m0, m7
    pmaddubsw    m7, m6
    SWAP          6, 7
    pmaddubsw    m0, m1
    SWAP          1, 0
%else
%ifdef m8
    SWAP          6, 12
    SWAP          1, 8
%else
    mova         m6, m_maskres
    mova         m1, m_limres
%endif
    pmullw       m6, [pw_9]
    pmullw       m1, [pw_9]
    paddw        m6, m7
    paddw        m1, m7
%endif
%ifdef m9
    SWAP          7, 9
%else
    mova         m7, m_limsign
%endif
    psraw        m6, 7
    psraw        m1, 7
    packsswb     m6, m1              ; a2
    pxor         m0, m0
    psubb        m0, m6
    pand         m0, m7              ; -a2
    pandn        m7, m6              ; +a2
%ifdef m8
    SWAP          1, 13
    SWAP          6, 14
%else
    mova         m1, m_p2backup
    mova         m6, m_q2backup
%endif
    psubusb      m1, m0
    paddusb      m6, m0
    paddusb      m1, m7              ; p2+a2
    psubusb      m6, m7              ; q2-a2
    ; store
%ifidn %1, v
    movrow       [dst2q+mstrideq*4], m1
    movrow       [dst1q+mstrideq*2], m2
    movrow       [dst1q+mstrideq ], m3
    movrow       [dst1q], m4
    movrow       [dst2q], m5
    movrow       [dst2q+ strideq ], m6
%if mmsize == 16 && %2 == 8
    add          dst8q, mstrideq
    movhps       [dst8q+mstrideq*2], m1
    movhps       [dst8q+mstrideq ], m2
    movhps       [dst8q], m3
    add          dst8q, strideq
    movhps       [dst8q], m4
    movhps       [dst8q+ strideq ], m5
    movhps       [dst8q+ strideq*2], m6
%endif
%else ; h
    inc          dst1q
    inc          dst2q

    ; 4x8/16 transpose
    TRANSPOSE4x4B 1, 2, 3, 4, 0
    SBUTTERFLY   bw, 5, 6, 0
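    ; six columns go back: p2..q0 are transposed into four byte columns for
    ; WRITE_4x2D/WRITE_4x4D, while the q1/q2 bytes interleaved by the
    ; SBUTTERFLY above are written two columns at a time (WRITE_2x4W/WRITE_8W)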
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D    1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
    add          dst1q, 4
    WRITE_2x4W   m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
    lea          dst8q, [dst8q+mstrideq+1]
    WRITE_4x4D    1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
    lea          dst1q, [dst2q+mstrideq+4]
    lea          dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
    add          dst2q, 4
%endif
    WRITE_8W     m5, dst2q, dst1q, mstrideq, strideq
%if cpuflag(sse4)
    lea          dst2q, [dst8q+ strideq ]
%endif
    WRITE_8W     m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif

%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub          dst1q, 5
%endif
    cmp          dst1q, dst8q
    mov          dst1q, dst8q
    jnz          .next8px
%else
%ifidn %1, h
    lea          dst1q, [dst1q+ strideq*8-5]
%else ; v
    add          dst1q, 8
%endif
    dec          cntrq
    jg           .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    ADD          rsp, pad
%endif
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8

INIT_MMX mmxext
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8
%endif

INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8

INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v,  8
MBEDGE_LOOPFILTER h,  8

INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h,  8