;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86inc.asm"
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
fourtap_filter_hw_m:  times 4 dw  -6, 123
                      times 4 dw  12,  -1
                      times 4 dw  -9,  93
                      times 4 dw  50,  -6
                      times 4 dw  -6,  50
                      times 4 dw  93,  -9
                      times 4 dw  -1,  12
                      times 4 dw 123,  -6

sixtap_filter_hw_m:   times 4 dw   2, -11
                      times 4 dw 108,  36
                      times 4 dw  -8,   1
                      times 4 dw   3, -16
                      times 4 dw  77,  77
                      times 4 dw -16,   3
                      times 4 dw   1,  -8
                      times 4 dw  36, 108
                      times 4 dw -11,   2

fourtap_filter_hb_m:  times 8 db  -6, 123
                      times 8 db  12,  -1
                      times 8 db  -9,  93
                      times 8 db  50,  -6
                      times 8 db  -6,  50
                      times 8 db  93,  -9
                      times 8 db  -1,  12
                      times 8 db 123,  -6

sixtap_filter_hb_m:   times 8 db   2,   1
                      times 8 db -11, 108
                      times 8 db  36,  -8
                      times 8 db   3,   3
                      times 8 db -16,  77
                      times 8 db  77, -16
                      times 8 db   1,   2
                      times 8 db  -8,  36
                      times 8 db 108, -11

fourtap_filter_v_m:   times 8 dw  -6
                      times 8 dw 123
                      times 8 dw  12
                      times 8 dw  -1
                      times 8 dw  -9
                      times 8 dw  93
                      times 8 dw  50
                      times 8 dw  -6
                      times 8 dw  -6
                      times 8 dw  50
                      times 8 dw  93
                      times 8 dw  -9
                      times 8 dw  -1
                      times 8 dw  12
                      times 8 dw 123
                      times 8 dw  -6

sixtap_filter_v_m:    times 8 dw   2
                      times 8 dw -11
                      times 8 dw 108
                      times 8 dw  36
                      times 8 dw  -8
                      times 8 dw   1
                      times 8 dw   3
                      times 8 dw -16
                      times 8 dw  77
                      times 8 dw  77
                      times 8 dw -16
                      times 8 dw   3
                      times 8 dw   1
                      times 8 dw  -8
                      times 8 dw  36
                      times 8 dw 108
                      times 8 dw -11
                      times 8 dw   2

bilinear_filter_vw_m: times 8 dw 1
                      times 8 dw 2
                      times 8 dw 3
                      times 8 dw 4
                      times 8 dw 5
                      times 8 dw 6
                      times 8 dw 7

bilinear_filter_vb_m: times 8 db 7, 1
                      times 8 db 6, 2
                      times 8 db 5, 3
                      times 8 db 4, 4
                      times 8 db 3, 5
                      times 8 db 2, 6
                      times 8 db 1, 7

%ifdef PIC
%define fourtap_filter_hw  picregq
%define sixtap_filter_hw   picregq
%define fourtap_filter_hb  picregq
%define sixtap_filter_hb   picregq
%define fourtap_filter_v   picregq
%define sixtap_filter_v    picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw  fourtap_filter_hw_m
%define sixtap_filter_hw   sixtap_filter_hw_m
%define fourtap_filter_hb  fourtap_filter_hb_m
%define sixtap_filter_hb   sixtap_filter_hb_m
%define fourtap_filter_v   fourtap_filter_v_m
%define sixtap_filter_v    sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif

filter_h2_shuf:  db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf:  db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10

filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11

pw_256:   times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734

pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63:  times 8 db  9, 63

cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pb_4
cextern pw_9
cextern pw_18
cextern pw_27
cextern pw_63
cextern pw_64
cextern pb_80
cextern pb_F8
cextern pb_FE
SECTION .text

;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
;                                              uint8_t *src, int srcstride,
;                                              int height, int mx, int my);
;-----------------------------------------------------------------------------
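;
; For reference, each output pixel of the 6-tap case corresponds to this
; scalar computation (illustrative C sketch, not part of this file; filter[]
; is the VP8 subpel coefficient row selected by mx/my):
;
;     int v = filter[0] * src[x-2] + filter[1] * src[x-1]
;           + filter[2] * src[x  ] + filter[3] * src[x+1]
;           + filter[4] * src[x+2] + filter[5] * src[x+3];
;     dst[x] = av_clip_uint8((v + 64) >> 7);  // round, then clip to 8 bits
;
; The 4-tap case drops the two outermost taps.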
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    mova      m3, [filter_h6_shuf2]
    mova      m4, [filter_h6_shuf3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    mova      m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
    mova      m6, [sixtap_filter_hb+mxq*8-32]
    mova      m7, [sixtap_filter_hb+mxq*8-16]

.nextrow:
    movu      m0, [srcq-2]
    mova      m1, m0
    mova      m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
    punpcklbw m0, [srcq+3]
%else
    pshufb    m0, [filter_h6_shuf1]
%endif
    pshufb    m1, m3
    pshufb    m2, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    pmaddubsw m2, m7
    paddsw    m0, m1
    paddsw    m0, m2
    pmulhrsw  m0, [pw_256]
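    ; the pmulhrsw above computes (x * 256 + (1 << 14)) >> 15, i.e. the
    ; rounded (x + 64) >> 7 required by the VP8 filters, in one instruction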
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
    mova      m2, [pw_256]
    mova      m3, [filter_h2_shuf]
    mova      m4, [filter_h4_shuf]
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
    mova      m6, [fourtap_filter_hb+mxq]

.nextrow:
    movu      m0, [srcq-1]
    mova      m1, m0
    pshufb    m0, m3
    pshufb    m1, m4
    pmaddubsw m0, m5
    pmaddubsw m1, m6
    paddsw    m0, m1
    pmulhrsw  m0, m2
    packuswb  m0, m0
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [fourtap_filter_hb_m]
%endif
    mova      m5, [fourtap_filter_hb+myq-16]
    mova      m6, [fourtap_filter_hb+myq]
    mova      m7, [pw_256]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq

.nextrow:
    movh      m3, [srcq+2*srcstrideq] ; read new row
    mova      m4, m0
    mova      m0, m1
    punpcklbw m4, m1
    mova      m1, m2
    punpcklbw m2, m3
    pmaddubsw m4, m5
    pmaddubsw m2, m6
    paddsw    m4, m2
    mova      m2, m3
    pmulhrsw  m4, m7
    packuswb  m4, m4
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    lea      myd, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_hb_m]
%endif
    lea      myq, [sixtap_filter_hb+myq*8]

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]

.nextrow:
    movh      m5, [srcq+2*srcstrideq] ; read new row
    mova      m6, m0
    punpcklbw m6, m5
    mova      m0, m1
    punpcklbw m1, m2
    mova      m7, m3
    punpcklbw m7, m4
    pmaddubsw m6, [myq-48]
    pmaddubsw m1, [myq-32]
    pmaddubsw m7, [myq-16]
    paddsw    m6, m1
    paddsw    m6, m7
    mova      m1, m2
    mova      m2, m3
    pmulhrsw  m6, [pw_256]
    mova      m3, m4
    packuswb  m6, m6
    mova      m4, m5
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET
%endmacro
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8

; 4x4 block, H-only 4-tap filter
INIT_MMX mmx2
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    shl       mxd, 4
%ifdef PIC
    lea   picregq, [fourtap_filter_hw_m]
%endif
    movq      mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
    movq      mm5, [fourtap_filter_hw+mxq]
    movq      mm7, [pw_64]
    pxor      mm6, mm6

.nextrow:
    movq      mm1, [srcq-1]  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1       ; byte ABCD..
    punpcklbw mm1, mm6       ; byte->word ABCD
    pshufw    mm0, mm2, 0x9  ; byte CDEF..
    punpcklbw mm0, mm6       ; byte->word CDEF
    pshufw    mm3, mm1, 0x94 ; word ABBC
    pshufw    mm1, mm0, 0x94 ; word CDDE
    pmaddwd   mm3, mm4       ; multiply 2px with F0/F1
    movq      mm0, mm1       ; backup for second set of pixels
    pmaddwd   mm1, mm5       ; multiply 2px with F2/F3
    paddd     mm3, mm1       ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    punpckhbw mm2, mm6       ; byte->word EFGH
    pmaddwd   mm0, mm4       ; multiply backed up 2px with F0/F1
    pshufw    mm1, mm2, 0x94 ; word EFFG
    pmaddwd   mm1, mm5       ; multiply 2px with F2/F3
    paddd     mm0, mm1       ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm3, mm0       ; merge dword->word (4px)
    paddsw    mm3, mm7       ; rounding
    psraw     mm3, 7
    packuswb  mm3, mm6       ; clip and word->bytes
    movd   [dstq], mm3       ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd            ; next row
    jg .nextrow
    REP_RET
; 4x4 block, H-only 6-tap filter
INIT_MMX mmx2
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
    lea       mxd, [mxq*3]
%ifdef PIC
    lea   picregq, [sixtap_filter_hw_m]
%endif
    movq      mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
    movq      mm5, [sixtap_filter_hw+mxq*8-32]
    movq      mm6, [sixtap_filter_hw+mxq*8-16]
    movq      mm7, [pw_64]
    pxor      mm3, mm3

.nextrow:
    movq      mm1, [srcq-2]  ; (ABCDEFGH) load 8 horizontal pixels

    ; first set of 2 pixels
    movq      mm2, mm1       ; byte ABCD..
    punpcklbw mm1, mm3       ; byte->word ABCD
    pshufw    mm0, mm2, 0x9  ; byte CDEF..
    punpckhbw mm2, mm3       ; byte->word EFGH
    punpcklbw mm0, mm3       ; byte->word CDEF
    pshufw    mm1, mm1, 0x94 ; word ABBC
    pshufw    mm2, mm2, 0x94 ; word EFFG
    pmaddwd   mm1, mm4       ; multiply 2px with F0/F1
    pshufw    mm3, mm0, 0x94 ; word CDDE
    movq      mm0, mm3       ; backup for second set of pixels
    pmaddwd   mm3, mm5       ; multiply 2px with F2/F3
    paddd     mm1, mm3       ; add to 1st 2px cache
    movq      mm3, mm2       ; backup for second set of pixels
    pmaddwd   mm2, mm6       ; multiply 2px with F4/F5
    paddd     mm1, mm2       ; finish 1st 2px

    ; second set of 2 pixels, use backup of above
    movd      mm2, [srcq+3]  ; byte FGHI (prevent overreads)
    pmaddwd   mm0, mm4       ; multiply 1st backed up 2px with F0/F1
    pmaddwd   mm3, mm5       ; multiply 2nd backed up 2px with F2/F3
    paddd     mm0, mm3       ; add to 2nd 2px cache
    pxor      mm3, mm3
    punpcklbw mm2, mm3       ; byte->word FGHI
    pshufw    mm2, mm2, 0xE9 ; word GHHI
    pmaddwd   mm2, mm6       ; multiply 2px with F4/F5
    paddd     mm0, mm2       ; finish 2nd 2px

    ; merge two sets of 2 pixels into one set of 4, round/clip/store
    packssdw  mm1, mm0       ; merge dword->word (4px)
    paddsw    mm1, mm7       ; rounding
    psraw     mm1, 7
    packuswb  mm1, mm3       ; clip and word->bytes
    movd   [dstq], mm1       ; store

    ; go to next line
    add      dstq, dststrideq
    add      srcq, srcstrideq
    dec   heightd            ; next row
    jg .nextrow
    REP_RET
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      mxq, [fourtap_filter_v+mxq-32]
    pxor      m7, m7
    mova      m4, [pw_64]
    mova      m5, [mxq+ 0]
    mova      m6, [mxq+16]
%ifdef m8
    mova      m8, [mxq+32]
    mova      m9, [mxq+48]
%endif
.nextrow:
    movq      m0, [srcq-1]
    movq      m1, [srcq-0]
    movq      m2, [srcq+1]
    movq      m3, [srcq+2]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    pmullw    m0, m5
    pmullw    m1, m6
%ifdef m8
    pmullw    m2, m8
    pmullw    m3, m9
%else
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
%endif
    paddsw    m0, m1
    paddsw    m2, m3
    paddsw    m0, m2
    paddsw    m0, m4
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
    lea      mxd, [mxq*3]
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      mxq, [sixtap_filter_v+mxq-96]
    pxor      m7, m7
    mova      m6, [pw_64]
%ifdef m8
    mova      m8, [mxq+ 0]
    mova      m9, [mxq+16]
    mova     m10, [mxq+32]
    mova     m11, [mxq+48]
    mova     m12, [mxq+64]
    mova     m13, [mxq+80]
%endif
.nextrow:
    movq      m0, [srcq-2]
    movq      m1, [srcq-1]
    movq      m2, [srcq-0]
    movq      m3, [srcq+1]
    movq      m4, [srcq+2]
    movq      m5, [srcq+3]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7
    punpcklbw m5, m7
%ifdef m8
    pmullw    m0, m8
    pmullw    m1, m9
    pmullw    m2, m10
    pmullw    m3, m11
    pmullw    m4, m12
    pmullw    m5, m13
%else
    pmullw    m0, [mxq+ 0]
    pmullw    m1, [mxq+16]
    pmullw    m2, [mxq+32]
    pmullw    m3, [mxq+48]
    pmullw    m4, [mxq+64]
    pmullw    m5, [mxq+80]
%endif
    paddsw    m1, m4
    paddsw    m0, m5
    paddsw    m1, m2
    paddsw    m0, m3
    paddsw    m0, m1
    paddsw    m0, m6
    psraw     m0, 7
    packuswb  m0, m7
    movh  [dstq], m0        ; store

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 5
%ifdef PIC
    lea  picregq, [fourtap_filter_v_m]
%endif
    lea      myq, [fourtap_filter_v+myq-32]
    mova      m6, [pw_64]
    pxor      m7, m7
    mova      m5, [myq+48]

    ; read 3 lines
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+  srcstrideq]
    movh      m2, [srcq+2*srcstrideq]
    add     srcq, srcstrideq
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    movh      m4, [srcq+2*srcstrideq] ; read new row
    punpcklbw m4, m7
    mova      m3, m4
    pmullw    m0, [myq+0]
    pmullw    m4, m5
    paddsw    m4, m0

    ; then calculate positive taps
    mova      m0, m1
    pmullw    m1, [myq+16]
    paddsw    m4, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m4, m2
    mova      m2, m3

    ; round/clip/store
    paddsw    m4, m6
    psraw     m4, 7
    packuswb  m4, m7
    movh  [dstq], m4

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET

; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
    lea      myq, [myq*3]
%ifdef PIC
    lea  picregq, [sixtap_filter_v_m]
%endif
    lea      myq, [sixtap_filter_v+myq-96]
    pxor      m7, m7

    ; read 5 lines
    sub     srcq, srcstrideq
    sub     srcq, srcstrideq
    movh      m0, [srcq]
    movh      m1, [srcq+srcstrideq]
    movh      m2, [srcq+srcstrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    add     srcq, srcstrideq
    movh      m3, [srcq]
    movh      m4, [srcq+srcstrideq]
    punpcklbw m0, m7
    punpcklbw m1, m7
    punpcklbw m2, m7
    punpcklbw m3, m7
    punpcklbw m4, m7

.nextrow:
    ; first calculate negative taps (to prevent losing positive overflows)
    mova      m5, m1
    pmullw    m5, [myq+16]
    mova      m6, m4
    pmullw    m6, [myq+64]
    paddsw    m6, m5

    ; then calculate positive taps
    movh      m5, [srcq+2*srcstrideq] ; read new row
    punpcklbw m5, m7
    pmullw    m0, [myq+0]
    paddsw    m6, m0
    mova      m0, m1
    mova      m1, m2
    pmullw    m2, [myq+32]
    paddsw    m6, m2
    mova      m2, m3
    pmullw    m3, [myq+48]
    paddsw    m6, m3
    mova      m3, m4
    mova      m4, m5
    pmullw    m5, [myq+80]
    paddsw    m6, m5

    ; round/clip/store
    paddsw    m6, [pw_64]
    psraw     m6, 7
    packuswb  m6, m7
    movh  [dstq], m6

    ; go to next line
    add     dstq, dststrideq
    add     srcq, srcstrideq
    dec  heightd            ; next row
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmx2
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+myq-1*16]
    neg      myq
    mova      m4, [bilinear_filter_vw+myq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m3, [srcq+srcstrideq*2]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m3, m6
    mova      m2, m1
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
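    ; the psraw/pavgw pairs above implement the rounded (x + 4) >> 3 of the
    ; VP8 bilinear filter: pavgw against zero supplies the rounding bit
    ; after the first shift, saving a constant load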
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vw_m]
%endif
    pxor      m6, m6
    mova      m5, [bilinear_filter_vw+mxq-1*16]
    neg      mxq
    mova      m4, [bilinear_filter_vw+mxq+7*16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0+0]
    movh      m1, [srcq+srcstrideq*0+1]
    movh      m2, [srcq+srcstrideq*1+0]
    movh      m3, [srcq+srcstrideq*1+1]
    punpcklbw m0, m6
    punpcklbw m1, m6
    punpcklbw m2, m6
    punpcklbw m3, m6
    pmullw    m0, m4
    pmullw    m1, m5
    pmullw    m2, m4
    pmullw    m3, m5
    paddsw    m0, m1
    paddsw    m2, m3
    psraw     m0, 2
    psraw     m2, 2
    pavgw     m0, m6
    pavgw     m2, m6
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m2, m2
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m2
%else
    packuswb  m0, m2
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX mmx2
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
    shl      myd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m3, [bilinear_filter_vb+myq-16]
.nextrow:
    movh      m0, [srcq+srcstrideq*0]
    movh      m1, [srcq+srcstrideq*1]
    movh      m2, [srcq+srcstrideq*2]
    punpcklbw m0, m1
    punpcklbw m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET

cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
    shl      mxd, 4
%ifdef PIC
    lea  picregq, [bilinear_filter_vb_m]
%endif
    pxor      m4, m4
    mova      m2, [filter_h2_shuf]
    mova      m3, [bilinear_filter_vb+mxq-16]
.nextrow:
    movu      m0, [srcq+srcstrideq*0]
    movu      m1, [srcq+srcstrideq*1]
    pshufb    m0, m2
    pshufb    m1, m2
    pmaddubsw m0, m3
    pmaddubsw m1, m3
    psraw     m0, 2
    psraw     m1, 2
    pavgw     m0, m4
    pavgw     m1, m4
%if mmsize == 8
    packuswb  m0, m0
    packuswb  m1, m1
    movh [dstq+dststrideq*0], m0
    movh [dstq+dststrideq*1], m1
%else
    packuswb  m0, m1
    movh   [dstq+dststrideq*0], m0
    movhps [dstq+dststrideq*1], m0
%endif

    lea     dstq, [dstq+dststrideq*2]
    lea     srcq, [srcq+srcstrideq*2]
    sub  heightd, 2
    jg .nextrow
    REP_RET
%endmacro

INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0]
    movq    mm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0], mm0
    movq [dstq+dststrideq*1], mm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
    movq    mm0, [srcq+srcstrideq*0+0]
    movq    mm1, [srcq+srcstrideq*0+8]
    movq    mm2, [srcq+srcstrideq*1+0]
    movq    mm3, [srcq+srcstrideq*1+8]
    lea    srcq, [srcq+srcstrideq*2]
    movq [dstq+dststrideq*0+0], mm0
    movq [dstq+dststrideq*0+8], mm1
    movq [dstq+dststrideq*1+0], mm2
    movq [dstq+dststrideq*1+8], mm3
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET
%endif

INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
    movups xmm0, [srcq+srcstrideq*0]
    movups xmm1, [srcq+srcstrideq*1]
    lea    srcq, [srcq+srcstrideq*2]
    movaps [dstq+dststrideq*0], xmm0
    movaps [dstq+dststrideq*1], xmm1
    lea    dstq, [dstq+dststrideq*2]
    sub heightd, 2
    jg .nextrow
    REP_RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------
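;
; A DC-only inverse transform reduces to one rounded value added to all 16
; pixels of the 4x4 block (illustrative C sketch, not part of this file):
;
;     int dc = (block[0] + 4) >> 3;
;     block[0] = 0;
;     for (y = 0; y < 4; y++)
;         for (x = 0; x < 4; x++)
;             dst[y*stride + x] = av_clip_uint8(dst[y*stride + x] + dc);
;
; The code below splits dc into its positive and negative halves and applies
; them with unsigned saturation (paddusb/psubusb) instead of widening to words.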
%macro ADD_DC 4
    %4        m2, [dst1q+%3]
    %4        m3, [dst1q+strideq+%3]
    %4        m4, [dst2q+%3]
    %4        m5, [dst2q+strideq+%3]
    paddusb   m2, %1
    paddusb   m3, %1
    paddusb   m4, %1
    paddusb   m5, %1
    psubusb   m2, %2
    psubusb   m3, %2
    psubusb   m4, %2
    psubusb   m5, %2
    %4 [dst1q+%3], m2
    %4 [dst1q+strideq+%3], m3
    %4 [dst2q+%3], m4
    %4 [dst2q+strideq+%3], m5
%endmacro

INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq]

    ; calculate DC
    paddw     m0, [pw_4]
    pxor      m1, m1
    psraw     m0, 3
    movd [blockq], m1
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklwd m0, m0
    punpcklwd m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, movh
    RET

INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq]
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq], m1
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    movd      m2, [dst1q]
    movd      m3, [dst1q+strideq]
    movd      m4, [dst2q]
    movd      m5, [dst2q+strideq]
    psraw     m0, 3
    pshuflw   m0, m0, 0
    punpcklqdq m0, m0
    punpckldq m2, m3
    punpckldq m4, m5
    punpcklbw m2, m1
    punpcklbw m4, m1
    paddw     m2, m0
    paddw     m4, m0
    packuswb  m2, m4
    movd  [dst1q], m2
    pextrd [dst1q+strideq], m2, 1
    pextrd [dst2q], m2, 2
    pextrd [dst2q+strideq], m2, 3
    RET
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0            ; AABBCCDD
    punpcklbw m6, m6            ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    ADD_DC m1, m7, 8, mova
    RET
%endif

INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m1, m1

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m1
    movd [blockq+32*1], m1
    movd [blockq+32*2], m1
    movd [blockq+32*3], m1
    psraw     m0, 3
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1
    punpcklbw m0, m0
    punpcklbw m1, m1

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m1, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, DCTELEM block[4][16], int stride);
;-----------------------------------------------------------------------------
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
    ; load data
    movd      m0, [blockq+32*0] ; A
    movd      m1, [blockq+32*2] ; C
    punpcklwd m0, [blockq+32*1] ; A B
    punpcklwd m1, [blockq+32*3] ; C D
    punpckldq m0, m1            ; A B C D
    pxor      m6, m6

    ; calculate DC
    paddw     m0, [pw_4]
    movd [blockq+32*0], m6
    movd [blockq+32*1], m6
    movd [blockq+32*2], m6
    movd [blockq+32*3], m6
    psraw     m0, 3
    psubw     m6, m0
    packuswb  m0, m0
    packuswb  m6, m6
    punpcklbw m0, m0            ; AABBCCDD
    punpcklbw m6, m6            ; AABBCCDD
    movq      m1, m0
    movq      m7, m6
    punpcklbw m0, m0            ; AAAABBBB
    punpckhbw m1, m1            ; CCCCDDDD
    punpcklbw m6, m6            ; AAAABBBB
    punpckhbw m7, m7            ; CCCCDDDD

    ; add DC
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+strideq*2]
    ADD_DC m0, m6, 0, mova
    lea    dst1q, [dst1q+strideq*4]
    lea    dst2q, [dst2q+strideq*4]
    ADD_DC m1, m7, 0, mova
    RET

;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
;-----------------------------------------------------------------------------

; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
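; (35468 does not fit in a signed 16-bit word, so mul_35468(x) is computed
; as pmulhw(2*x, 17734): the input is doubled with paddw and multiplied by
; 17734 = 35468/2, which yields the same (x * 35468) >> 16 result)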
%macro VP8_MULTIPLY_SUMSUB 4
    mova      %3, %1
    mova      %4, %2
    pmulhw    %3, m6        ; 20091(1)
    pmulhw    %4, m6        ; 20091(2)
    paddw     %3, %1
    paddw     %4, %2
    paddw     %1, %1
    paddw     %2, %2
    pmulhw    %1, m7        ; 35468(1)
    pmulhw    %2, m7        ; 35468(2)
    psubw     %1, %4
    paddw     %2, %3
%endmacro

; calculate x0=%1+%3; x1=%1-%3
;           x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
    SUMSUB_BA w, %3, %1, %5 ; t0, t1
    VP8_MULTIPLY_SUMSUB m%2, m%4, m%5, m%6 ; t2, t3
    SUMSUB_BA w, %4, %3, %5 ; tmp0, tmp3
    SUMSUB_BA w, %2, %1, %5 ; tmp1, tmp2
    SWAP %4, %1
    SWAP %4, %3
%endmacro

%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
    ; load block data
    movq      m0, [blockq+ 0]
    movq      m1, [blockq+ 8]
    movq      m2, [blockq+16]
    movq      m3, [blockq+24]
    movq      m6, [pw_20091]
    movq      m7, [pw_17734]
%if cpuflag(sse)
    xorps   xmm0, xmm0
    movaps [blockq+ 0], xmm0
    movaps [blockq+16], xmm0
%else
    pxor      m4, m4
    movq [blockq+ 0], m4
    movq [blockq+ 8], m4
    movq [blockq+16], m4
    movq [blockq+24], m4
%endif

    ; actual IDCT
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw     m0, [pw_4]
    VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
    TRANSPOSE4x4W 0, 1, 2, 3, 4

    ; store
    pxor      m4, m4
    DEFINE_ARGS dst1, dst2, stride
    lea    dst2q, [dst1q+2*strideq]
    STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
    STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD

;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
;-----------------------------------------------------------------------------
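;
; The inverse WHT below scatters its 16 results to the DC position
; (coefficient 0, i.e. at 2*16-byte intervals) of the 16 luma blocks,
; applying the (x + 3) >> 3 rounding from the VP8 spec on the second pass.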
%macro SCATTER_WHT 3
    movd     dc1d, m%1
    movd     dc2d, m%2
    mov [blockq+2*16*(0+%3)], dc1w
    mov [blockq+2*16*(1+%3)], dc2w
    shr      dc1d, 16
    shr      dc2d, 16
    psrlq     m%1, 32
    psrlq     m%2, 32
    mov [blockq+2*16*(4+%3)], dc1w
    mov [blockq+2*16*(5+%3)], dc2w
    movd     dc1d, m%1
    movd     dc2d, m%2
    mov [blockq+2*16*(8+%3)], dc1w
    mov [blockq+2*16*(9+%3)], dc2w
    shr      dc1d, 16
    shr      dc2d, 16
    mov [blockq+2*16*(12+%3)], dc1w
    mov [blockq+2*16*(13+%3)], dc2w
%endmacro

%macro HADAMARD4_1D 4
    SUMSUB_BADC w, %2, %1, %4, %3
    SUMSUB_BADC w, %4, %2, %3, %1
    SWAP %1, %4, %3
%endmacro

%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
    movq      m0, [dc1q]
    movq      m1, [dc1q+8]
    movq      m2, [dc1q+16]
    movq      m3, [dc1q+24]
%if cpuflag(sse)
    xorps   xmm0, xmm0
    movaps [dc1q+ 0], xmm0
    movaps [dc1q+16], xmm0
%else
    pxor      m4, m4
    movq [dc1q+ 0], m4
    movq [dc1q+ 8], m4
    movq [dc1q+16], m4
    movq [dc1q+24], m4
%endif
    HADAMARD4_1D 0, 1, 2, 3
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw     m0, [pw_3]
    HADAMARD4_1D 0, 1, 2, 3
    psraw     m0, 3
    psraw     m1, 3
    psraw     m2, 3
    psraw     m3, 3
    SCATTER_WHT 0, 1, 0
    SCATTER_WHT 2, 3, 2
    RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT

;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
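;
; Up to the signed-bias bookkeeping (the pb_80 xors below), the simple
; filter is equivalent to the following per edge pixel (illustrative C
; sketch, not part of this file):
;
;     if (2 * FFABS(p0 - q0) + (FFABS(p1 - q1) >> 1) <= flim) {
;         int a = av_clip_int8(av_clip_int8(p1 - q1) + 3 * (q0 - p0));
;         q0 -= av_clip_int8(a + 4) >> 3; // f1
;         p0 += av_clip_int8(a + 3) >> 3; // f2
;     }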
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
    ; interleave 8 (A-H) rows of 4 pixels each
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%3, [%8]         ; E0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%4, [%9+%11]     ; G0-3
    punpcklbw m%1, m%5          ; A/B interleaved
    movd      m%5, [%9+%11*2]   ; H0-3
    punpcklbw m%2, m%6          ; C/D interleaved
    punpcklbw m%3, m%7          ; E/F interleaved
    punpcklbw m%4, m%5          ; G/H interleaved
%endmacro

; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as in READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
    ; transpose 16 (A-P) rows of 4 pixels each
    lea       %12, [r0+8*r2]

    ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
    movd      m%1, [%8+%10*4]   ; A0-3
    movd      m%3, [%12+%10*4]  ; I0-3
    movd      m%2, [%8+%10*2]   ; C0-3
    movd      m%4, [%12+%10*2]  ; K0-3
    movd      m%6, [%8+%10]     ; D0-3
    movd      m%5, [%12+%10]    ; L0-3
    movd      m%7, [%12]        ; M0-3
    add       %12, %11
    punpcklbw m%1, m%3          ; A/I
    movd      m%3, [%8]         ; E0-3
    punpcklbw m%2, m%4          ; C/K
    punpcklbw m%6, m%5          ; D/L
    punpcklbw m%3, m%7          ; E/M
    punpcklbw m%2, m%6          ; C/D/K/L interleaved

    ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
    movd      m%5, [%9+%10*4]   ; B0-3
    movd      m%4, [%12+%10*4]  ; J0-3
    movd      m%7, [%9]         ; F0-3
    movd      m%6, [%12]        ; N0-3
    punpcklbw m%5, m%4          ; B/J
    punpcklbw m%7, m%6          ; F/N
    punpcklbw m%1, m%5          ; A/B/I/J interleaved
    punpcklbw m%3, m%7          ; E/F/M/N interleaved
    movd      m%4, [%9+%11]     ; G0-3
    movd      m%6, [%12+%11]    ; O0-3
    movd      m%5, [%9+%11*2]   ; H0-3
    movd      m%7, [%12+%11*2]  ; P0-3
    punpcklbw m%4, m%6          ; G/O
    punpcklbw m%5, m%7          ; H/P
    punpcklbw m%4, m%5          ; G/H/O/P interleaved
%endmacro
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
    ; write out (2 dwords per register)
    movd [%5+%7*4], m%1
    movd [%5+%7*2], m%2
    movd      [%5], m%3
    movd   [%6+%8], m%4
    punpckhdq m%1, m%1
    punpckhdq m%2, m%2
    punpckhdq m%3, m%3
    punpckhdq m%4, m%4
    movd [%6+%7*4], m%1
    movd   [%5+%7], m%2
    movd      [%6], m%3
    movd [%6+%8*2], m%4
%endmacro

; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
    ; write out (4 dwords per register), start with dwords zero
    movd [%5+%8*4], m%1
    movd      [%5], m%2
    movd [%7+%8*4], m%3
    movd      [%7], m%4

    ; store dwords 1
    psrldq    m%1, 4
    psrldq    m%2, 4
    psrldq    m%3, 4
    psrldq    m%4, 4
    movd [%6+%8*4], m%1
    movd      [%6], m%2
%if %10 == 16
    movd [%6+%9*4], m%3
%endif
    movd   [%7+%9], m%4

    ; write dwords 2
    psrldq    m%1, 4
    psrldq    m%2, 4
%if %10 == 8
    movd [%5+%8*2], m%1
    movd      %5d, m%3
%endif
    psrldq    m%3, 4
    psrldq    m%4, 4
%if %10 == 16
    movd [%5+%8*2], m%1
%endif
    movd   [%6+%9], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
    add        %7, %9

    ; store dwords 3
    psrldq    m%1, 4
    psrldq    m%2, 4
    psrldq    m%3, 4
    psrldq    m%4, 4
%if %10 == 8
    mov  [%7+%8*4], %5d
    movd [%6+%8*2], m%1
%else
    movd   [%5+%8], m%1
%endif
    movd [%6+%9*2], m%2
    movd [%7+%8*2], m%3
    movd [%7+%9*2], m%4
%endmacro
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write; they can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
    movd      %3d, %1
    punpckhdq  %1, %1
    mov [%4+%5*4], %3w
    shr        %3, 16
    add        %4, %6
    mov [%4+%5*4], %3w
    movd      %3d, %1
    add        %4, %5
    mov [%4+%5*2], %3w
    shr        %3, 16
    mov [%4+%5  ], %3w
    movd      %3d, %2
    punpckhdq  %2, %2
    mov [%4    ], %3w
    shr        %3, 16
    mov [%4+%6  ], %3w
    movd      %3d, %2
    add        %4, %6
    mov [%4+%6  ], %3w
    shr        %3, 16
    mov [%4+%6*2], %3w
    add        %4, %5
%endmacro

%macro WRITE_8W 5
%if cpuflag(sse4)
    pextrw [%3+%4*4], %1, 0
    pextrw [%2+%4*4], %1, 1
    pextrw [%3+%4*2], %1, 2
    pextrw [%3+%4  ], %1, 3
    pextrw [%3      ], %1, 4
    pextrw [%2      ], %1, 5
    pextrw [%2+%5  ], %1, 6
    pextrw [%2+%5*2], %1, 7
%else
    movd      %2d, %1
    psrldq     %1, 4
    mov [%3+%4*4], %2w
    shr        %2, 16
    add        %3, %5
    mov [%3+%4*4], %2w
    movd      %2d, %1
    psrldq     %1, 4
    add        %3, %4
    mov [%3+%4*2], %2w
    shr        %2, 16
    mov [%3+%4  ], %2w
    movd      %2d, %1
    psrldq     %1, 4
    mov [%3    ], %2w
    shr        %2, 16
    mov [%3+%5  ], %2w
    movd      %2d, %1
    add        %3, %5
    mov [%3+%5  ], %2w
    shr        %2, 16
    mov [%3+%5*2], %2w
%endif
%endmacro

%macro SPLATB_REG 2-3
%if cpuflag(ssse3)
    movd       %1, %2d
    pshufb     %1, %3
%elif cpuflag(sse2)
    movd       %1, %2d
    punpcklbw  %1, %1
    pshuflw    %1, %1, 0x0
    punpcklqdq %1, %1
%elif cpuflag(mmx2)
    movd       %1, %2d
    punpcklbw  %1, %1
    pshufw     %1, %1, 0x0
%else
    movd       %1, %2d
    punpcklbw  %1, %1
    punpcklwd  %1, %1
    punpckldq  %1, %1
%endif
%endmacro
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
    mov    cntrq, 2
%endif
%if cpuflag(ssse3)
    pxor      m0, m0
%endif
    SPLATB_REG m7, flim, m0 ; splat "flim" into register

    ; set up indexes to address 4 rows
%if mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
    DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
    mov  strideq, mstrideq
    neg mstrideq
%ifidn %1, h
    lea    dst1q, [dst1q+4*strideq-2]
%endif

%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
    ; read 4 half/full rows of pixels
    mova      m0, [dst1q+mstrideq*2] ; p1
    mova      m1, [dst1q+mstrideq]   ; p0
    mova      m2, [dst1q]            ; q0
    mova      m3, [dst1q+ strideq]   ; q1
%else ; h
    lea    dst2q, [dst1q+ strideq]
%if mmsize == 8 ; mmx/mmxext
    READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
    READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
    TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif

    ; simple_limit
    mova      m5, m2        ; m5=backup of q0
    mova      m6, m1        ; m6=backup of p0
    psubusb   m1, m2        ; p0-q0
    psubusb   m2, m6        ; q0-p0
    por       m1, m2        ; FFABS(p0-q0)
    paddusb   m1, m1        ; m1=FFABS(p0-q0)*2

    mova      m4, m3
    mova      m2, m0
    psubusb   m3, m0        ; q1-p1
    psubusb   m0, m4        ; p1-q1
    por       m3, m0        ; FFABS(p1-q1)
    mova      m0, [pb_80]
    pxor      m2, m0
    pxor      m4, m0
    psubsb    m2, m4        ; m2=p1-q1 (signed) backup for below
    pand      m3, [pb_FE]
    psrlq     m3, 1         ; m3=FFABS(p1-q1)/2, this can be used signed
    paddusb   m3, m1
    psubusb   m3, m7
    pxor      m1, m1
    pcmpeqb   m3, m1        ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

    ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
    mova      m4, m5
    pxor      m5, m0
    pxor      m0, m6
    psubsb    m5, m0        ; q0-p0 (signed)
    paddsb    m2, m5
    paddsb    m2, m5
    paddsb    m2, m5        ; a=(p1-q1) + 3*(q0-p0)
    pand      m2, m3        ; apply filter mask (m3)
    mova      m3, [pb_F8]
    mova      m1, m2
    paddsb    m2, [pb_4]    ; f1<<3=a+4
    paddsb    m1, [pb_3]    ; f2<<3=a+3
    pand      m2, m3
    pand      m1, m3        ; cache f2<<3

    pxor      m0, m0
    pxor      m3, m3
    pcmpgtb   m0, m2        ; which values are <0?
    psubb     m3, m2        ; -f1<<3
    psrlq     m2, 3         ; +f1
    psrlq     m3, 3         ; -f1
    pand      m3, m0
    pandn     m0, m2
    psubusb   m4, m0
    paddusb   m4, m3        ; q0-f1

    pxor      m0, m0
    pxor      m3, m3
    pcmpgtb   m0, m1        ; which values are <0?
    psubb     m3, m1        ; -f2<<3
    psrlq     m1, 3         ; +f2
    psrlq     m3, 3         ; -f2
    pand      m3, m0
    pandn     m0, m1
    paddusb   m6, m0
    psubusb   m6, m3        ; p0+f2

    ; store
%ifidn %1, v
    mova  [dst1q], m4
    mova  [dst1q+mstrideq], m6
%else ; h
    inc    dst1q
    SBUTTERFLY bw, 6, 4, 0
%if mmsize == 16 ; sse2
%if cpuflag(sse4)
    inc    dst2q
%endif
    WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
    lea    dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
    inc    dst3q
%endif
    WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
    WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif

%if mmsize == 8 ; mmx/mmxext
    ; next 8 pixels
%ifidn %1, v
    add    dst1q, 8         ; advance 8 cols = pixels
%else ; h
    lea    dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
    dec    cntrq
    jg .next8px
    REP_RET
%else ; sse2
    RET
%endif
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmx2
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif
INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                            int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
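;
; Compared to the simple filter, the inner filter gates the edit with
; normal_limit (all of |p3-p2|, |p2-p1|, |p1-p0|, |q1-q0|, |q2-q1| and
; |q3-q2| <= I, plus the simple-filter E test) and a high-edge-variance
; test: where hev is false, the p1-q1 term is dropped from the filter value
; and p1/q1 are additionally adjusted by (f1 + 1) >> 1.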
  1511. %macro INNER_LOOPFILTER 2
  1512. %if %2 == 8 ; chroma
  1513. cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr
  1514. %else ; luma
  1515. cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr
  1516. %endif
  1517. %if cpuflag(ssse3)
  1518. pxor m7, m7
  1519. %endif
  1520. %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
  1521. %ifidn %1, v ; [3]=hev() result
  1522. %assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15)
  1523. %else ; h ; extra storage space for transposes
  1524. %assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15)
  1525. %endif
  1526. ; splat function arguments
  1527. SPLATB_REG m0, flimEq, m7 ; E
  1528. SPLATB_REG m1, flimIq, m7 ; I
  1529. SPLATB_REG m2, hevthrq, m7 ; hev_thresh
  1530. SUB rsp, pad
  1531. %define m_flimE [rsp]
  1532. %define m_flimI [rsp+mmsize]
  1533. %define m_hevthr [rsp+mmsize*2]
  1534. %define m_maskres [rsp+mmsize*3]
  1535. %define m_p0backup [rsp+mmsize*3]
  1536. %define m_q0backup [rsp+mmsize*4]
  1537. mova m_flimE, m0
  1538. mova m_flimI, m1
  1539. mova m_hevthr, m2
  1540. %else
  1541. %define m_flimE m9
  1542. %define m_flimI m10
  1543. %define m_hevthr m11
  1544. %define m_maskres m12
  1545. %define m_p0backup m12
  1546. %define m_q0backup m8
  1547. ; splat function arguments
  1548. SPLATB_REG m_flimE, flimEq, m7 ; E
  1549. SPLATB_REG m_flimI, flimIq, m7 ; I
  1550. SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
  1551. %endif
  1552. %if %2 == 8 ; chroma
  1553. DEFINE_ARGS dst1, dst8, mstride, stride, dst2
  1554. %elif mmsize == 8
  1555. DEFINE_ARGS dst1, mstride, stride, dst2, cntr
  1556. mov cntrq, 2
  1557. %else
  1558. DEFINE_ARGS dst1, mstride, stride, dst2, dst8
  1559. %endif
  1560. mov strideq, mstrideq
  1561. neg mstrideq
  1562. %ifidn %1, h
  1563. lea dst1q, [dst1q+strideq*4-4]
  1564. %if %2 == 8 ; chroma
  1565. lea dst8q, [dst8q+strideq*4-4]
  1566. %endif
  1567. %endif
  1568. %if mmsize == 8
  1569. .next8px:
  1570. %endif
  1571. ; read
  1572. lea dst2q, [dst1q+strideq]
  1573. %ifidn %1, v
  1574. %if %2 == 8 && mmsize == 16
  1575. %define movrow movh
  1576. %else
  1577. %define movrow mova
  1578. %endif
  1579. movrow m0, [dst1q+mstrideq*4] ; p3
  1580. movrow m1, [dst2q+mstrideq*4] ; p2
  1581. movrow m2, [dst1q+mstrideq*2] ; p1
  1582. movrow m5, [dst2q] ; q1
  1583. movrow m6, [dst2q+ strideq*1] ; q2
  1584. movrow m7, [dst2q+ strideq*2] ; q3
  1585. %if mmsize == 16 && %2 == 8
  1586. movhps m0, [dst8q+mstrideq*4]
  1587. movhps m2, [dst8q+mstrideq*2]
  1588. add dst8q, strideq
  1589. movhps m1, [dst8q+mstrideq*4]
  1590. movhps m5, [dst8q]
  1591. movhps m6, [dst8q+ strideq ]
  1592. movhps m7, [dst8q+ strideq*2]
  1593. add dst8q, mstrideq
  1594. %endif
  1595. %elif mmsize == 8 ; mmx/mmxext (h)
  1596. ; read 8 rows of 8px each
  1597. movu m0, [dst1q+mstrideq*4]
  1598. movu m1, [dst2q+mstrideq*4]
  1599. movu m2, [dst1q+mstrideq*2]
  1600. movu m3, [dst1q+mstrideq ]
  1601. movu m4, [dst1q]
  1602. movu m5, [dst2q]
  1603. movu m6, [dst2q+ strideq ]
  1604. ; 8x8 transpose
  1605. TRANSPOSE4x4B 0, 1, 2, 3, 7
  1606. mova m_q0backup, m1
  1607. movu m7, [dst2q+ strideq*2]
  1608. TRANSPOSE4x4B 4, 5, 6, 7, 1
  1609. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  1610. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  1611. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  1612. mova m1, m_q0backup
  1613. mova m_q0backup, m2 ; store q0
  1614. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  1615. mova m_p0backup, m5 ; store p0
  1616. SWAP 1, 4
  1617. SWAP 2, 4
  1618. SWAP 6, 3
  1619. SWAP 5, 3
  1620. %else ; sse2 (h)
  1621. %if %2 == 16
  1622. lea dst8q, [dst1q+ strideq*8]
  1623. %endif
  1624. ; read 16 rows of 8px each, interleave
  1625. movh m0, [dst1q+mstrideq*4]
  1626. movh m1, [dst8q+mstrideq*4]
  1627. movh m2, [dst1q+mstrideq*2]
  1628. movh m5, [dst8q+mstrideq*2]
  1629. movh m3, [dst1q+mstrideq ]
  1630. movh m6, [dst8q+mstrideq ]
  1631. movh m4, [dst1q]
  1632. movh m7, [dst8q]
  1633. punpcklbw m0, m1 ; A/I
  1634. punpcklbw m2, m5 ; C/K
  1635. punpcklbw m3, m6 ; D/L
  1636. punpcklbw m4, m7 ; E/M
  1637. add dst8q, strideq
  1638. movh m1, [dst2q+mstrideq*4]
  1639. movh m6, [dst8q+mstrideq*4]
  1640. movh m5, [dst2q]
  1641. movh m7, [dst8q]
  1642. punpcklbw m1, m6 ; B/J
  1643. punpcklbw m5, m7 ; F/N
  1644. movh m6, [dst2q+ strideq ]
  1645. movh m7, [dst8q+ strideq ]
  1646. punpcklbw m6, m7 ; G/O
  1647. ; 8x16 transpose
  1648. TRANSPOSE4x4B 0, 1, 2, 3, 7
  1649. %ifdef m8
  1650. SWAP 1, 8
  1651. %else
  1652. mova m_q0backup, m1
  1653. %endif
  1654. movh m7, [dst2q+ strideq*2]
  1655. movh m1, [dst8q+ strideq*2]
  1656. punpcklbw m7, m1 ; H/P
  1657. TRANSPOSE4x4B 4, 5, 6, 7, 1
  1658. SBUTTERFLY dq, 0, 4, 1 ; p3/p2
  1659. SBUTTERFLY dq, 2, 6, 1 ; q0/q1
  1660. SBUTTERFLY dq, 3, 7, 1 ; q2/q3
  1661. %ifdef m8
  1662. SWAP 1, 8
  1663. SWAP 2, 8
  1664. %else
  1665. mova m1, m_q0backup
  1666. mova m_q0backup, m2 ; store q0
  1667. %endif
  1668. SBUTTERFLY dq, 1, 5, 2 ; p1/p0
  1669. %ifdef m12
  1670. SWAP 5, 12
  1671. %else
  1672. mova m_p0backup, m5 ; store p0
  1673. %endif
  1674. SWAP 1, 4
  1675. SWAP 2, 4
  1676. SWAP 6, 3
  1677. SWAP 5, 3
  1678. %endif
  1679. ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
  1680. mova m4, m1
  1681. SWAP 4, 1
  1682. psubusb m4, m0 ; p2-p3
  1683. psubusb m0, m1 ; p3-p2
  1684. por m0, m4 ; abs(p3-p2)
  1685. mova m4, m2
  1686. SWAP 4, 2
  1687. psubusb m4, m1 ; p1-p2
  1688. psubusb m1, m2 ; p2-p1
  1689. por m1, m4 ; abs(p2-p1)
  1690. mova m4, m6
  1691. SWAP 4, 6
  1692. psubusb m4, m7 ; q2-q3
  1693. psubusb m7, m6 ; q3-q2
  1694. por m7, m4 ; abs(q3-q2)
  1695. mova m4, m5
  1696. SWAP 4, 5
  1697. psubusb m4, m6 ; q1-q2
  1698. psubusb m6, m5 ; q2-q1
  1699. por m6, m4 ; abs(q2-q1)
  1700. %if notcpuflag(mmx2)
  1701. mova m4, m_flimI
  1702. pxor m3, m3
  1703. psubusb m0, m4
  1704. psubusb m1, m4
  1705. psubusb m7, m4
  1706. psubusb m6, m4
  1707. pcmpeqb m0, m3 ; abs(p3-p2) <= I
  1708. pcmpeqb m1, m3 ; abs(p2-p1) <= I
  1709. pcmpeqb m7, m3 ; abs(q3-q2) <= I
  1710. pcmpeqb m6, m3 ; abs(q2-q1) <= I
  1711. pand m0, m1
  1712. pand m7, m6
  1713. pand m0, m7
  1714. %else ; mmxext/sse2
  1715. pmaxub m0, m1
  1716. pmaxub m6, m7
  1717. pmaxub m0, m6
  1718. %endif
  1719. ; normal_limit and high_edge_variance for p1-p0, q1-q0
  1720. SWAP 7, 3 ; now m7 is zero
  1721. %ifidn %1, v
  1722. movrow m3, [dst1q+mstrideq ] ; p0
  1723. %if mmsize == 16 && %2 == 8
  1724. movhps m3, [dst8q+mstrideq ]
  1725. %endif
  1726. %elifdef m12
  1727. SWAP 3, 12
  1728. %else
  1729. mova m3, m_p0backup
  1730. %endif
  1731. mova m1, m2
  1732. SWAP 1, 2
  1733. mova m6, m3
  1734. SWAP 3, 6
  1735. psubusb m1, m3 ; p1-p0
  1736. psubusb m6, m2 ; p0-p1
  1737. por m1, m6 ; abs(p1-p0)
  1738. %if notcpuflag(mmx2)
  1739. mova m6, m1
  1740. psubusb m1, m4
  1741. psubusb m6, m_hevthr
  1742. pcmpeqb m1, m7 ; abs(p1-p0) <= I
  1743. pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
  1744. pand m0, m1
  1745. mova m_maskres, m6
  1746. %else ; mmxext/sse2
  1747. pmaxub m0, m1 ; max_I
  1748. SWAP 1, 4 ; max_hev_thresh
  1749. %endif
  1750. SWAP 6, 4 ; now m6 is I
  1751. %ifidn %1, v
  1752. movrow m4, [dst1q] ; q0
  1753. %if mmsize == 16 && %2 == 8
  1754. movhps m4, [dst8q]
  1755. %endif
  1756. %elifdef m8
  1757. SWAP 4, 8
  1758. %else
  1759. mova m4, m_q0backup
  1760. %endif
  1761. mova m1, m4
  1762. SWAP 1, 4
  1763. mova m7, m5
  1764. SWAP 7, 5
  1765. psubusb m1, m5 ; q0-q1
  1766. psubusb m7, m4 ; q1-q0
  1767. por m1, m7 ; abs(q1-q0)
  1768. %if notcpuflag(mmx2)
  1769. mova m7, m1
  1770. psubusb m1, m6
  1771. psubusb m7, m_hevthr
  1772. pxor m6, m6
  1773. pcmpeqb m1, m6 ; abs(q1-q0) <= I
  1774. pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
  1775. mova m6, m_maskres
  1776. pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
  1777. pand m6, m7
  1778. %else ; mmxext/sse2
  1779. pxor m7, m7
  1780. pmaxub m0, m1
  1781. pmaxub m6, m1
  1782. psubusb m0, m_flimI
  1783. psubusb m6, m_hevthr
  1784. pcmpeqb m0, m7 ; max(abs(..)) <= I
  1785. pcmpeqb m6, m7 ; !(max(abs..) > thresh)
  1786. %endif
  1787. %ifdef m12
  1788. SWAP 6, 12
  1789. %else
  1790. mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
  1791. %endif
  1792. ; simple_limit
  1793. mova m1, m3
  1794. SWAP 1, 3
  1795. mova m6, m4 ; keep copies of p0/q0 around for later use
  1796. SWAP 6, 4
  1797. psubusb m1, m4 ; p0-q0
  1798. psubusb m6, m3 ; q0-p0
  1799. por m1, m6 ; abs(q0-p0)
  1800. paddusb m1, m1 ; m1=2*abs(q0-p0)
  1801. mova m7, m2
  1802. SWAP 7, 2
  1803. mova m6, m5
  1804. SWAP 6, 5
  1805. psubusb m7, m5 ; p1-q1
  1806. psubusb m6, m2 ; q1-p1
  1807. por m7, m6 ; abs(q1-p1)
  1808. pxor m6, m6
  1809. pand m7, [pb_FE]
  1810. psrlq m7, 1 ; abs(q1-p1)/2
  1811. paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
  1812. psubusb m7, m_flimE
  1813. pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
  1814. pand m0, m7 ; normal_limit result
  1815. ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova        m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova        m1, m4
    mova        m7, m3
    pxor        m1, m_pb_80
    pxor        m7, m_pb_80
    psubsb      m1, m7  ; (signed) q0-p0
    mova        m6, m2
    mova        m7, m5
    pxor        m6, m_pb_80
    pxor        m7, m_pb_80
    psubsb      m6, m7  ; (signed) p1-q1
    mova        m7, m_maskres
    pandn       m7, m6
    paddsb      m7, m1
    paddsb      m7, m1
    paddsb      m7, m1  ; 3*(q0-p0)+is4tap?(p1-q1)
    pand        m7, m0
    mova        m1, [pb_F8]
    mova        m6, m7
    paddsb      m7, [pb_3]
    paddsb      m6, [pb_4]
    pand        m7, m1
    pand        m6, m1
    pxor        m1, m1
    pxor        m0, m0
    pcmpgtb     m1, m7
    psubb       m0, m7
    psrlq       m7, 3   ; +f2
    psrlq       m0, 3   ; -f2
    pand        m0, m1
    pandn       m1, m7
    psubusb     m3, m0
    paddusb     m3, m1  ; p0+f2
    pxor        m1, m1
    pxor        m0, m0
    pcmpgtb     m0, m6
    psubb       m1, m6
    psrlq       m6, 3   ; +f1
    psrlq       m1, 3   ; -f1
    pand        m1, m0
    pandn       m0, m6
    psubusb     m4, m0
    paddusb     m4, m1  ; q0-f1
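    ; no per-byte arithmetic shift exists, so f is split by sign first:
    ; pcmpgtb extracts a negative mask, +f and -f are shifted logically
    ; (safe because the pand with pb_F8 above cleared the three low bits
    ; of every byte), and the halves are recombined via unsigned
    ; saturating add/sub on p0/q0.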
%ifdef m12
    SWAP        6, 12
%else
    mova        m6, m_maskres
%endif
%if notcpuflag(mmx2)
    mova        m7, [pb_1]
%else ; mmxext/sse2
    pxor        m7, m7
%endif
    pand        m0, m6
    pand        m1, m6
%if notcpuflag(mmx2)
    paddusb     m0, m7
    pand        m1, [pb_FE]
    pandn       m7, m0
    psrlq       m1, 1
    psrlq       m7, 1
    SWAP        0, 7
%else ; mmxext/sse2
    psubusb     m1, [pb_1]
    pavgb       m0, m7  ; a
    pavgb       m1, m7  ; -a
%endif
    psubusb     m5, m0
    psubusb     m2, m1
    paddusb     m5, m1  ; q1-a
    paddusb     m2, m0  ; p1+a
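    ; outer tap: a = (f1 + 1) >> 1, applied to p1/q1 only where hev is
    ; false (m6 holds the !hev mask). pavgb against a zero register gives
    ; the rounded halving in one instruction; the plain-MMX path above
    ; builds the same value by hand with pb_1/pb_FE.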
    ; store
%ifidn %1, v
    movrow      [dst1q+mstrideq*2], m2
    movrow      [dst1q+mstrideq ], m3
    movrow      [dst1q], m4
    movrow      [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
    movhps      [dst8q+mstrideq*2], m2
    movhps      [dst8q+mstrideq ], m3
    movhps      [dst8q], m4
    movhps      [dst8q+ strideq ], m5
%endif
%else ; h
    add         dst1q, 2
    add         dst2q, 2
    ; 4x8/16 transpose
    TRANSPOSE4x4B 2, 3, 4, 5, 6
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D  2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
    lea         dst8q, [dst8q+mstrideq +2]
    WRITE_4x4D  2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif
%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub         dst1q, 2
%endif
    cmp         dst1q, dst8q
    mov         dst1q, dst8q
    jnz         .next8px
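    ; chroma runs this loop exactly twice: cmp sets ZF before dst1q is
    ; repointed at the V plane (mov leaves the flags alone), so jnz loops
    ; once more after the U pass and falls through after the V pass.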
%else
%ifidn %1, h
    lea         dst1q, [dst1q+ strideq*8-2]
%else ; v
    add         dst1q, 8
%endif
    dec         cntrq
    jg          .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    ADD         rsp, pad
%endif
    RET
%endmacro
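; each INIT_*/INNER_LOOPFILTER pair below expands the macro into one
; standalone function per instruction set and edge size; x86inc picks the
; register width and symbol suffix from the active INIT_* state.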
%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
INIT_MMX mmx2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
%endif
INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
;                                             int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
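; this is the strong (macroblock-edge) variant: where hev is false it
; feeds w = clamp(3*(q0-p0)+(p1-q1)) through 27/18/9-weighted taps and
; adjusts three pixels on each side of the edge (p2..q2), instead of only
; touching p0/q0 (plus the p1/q1 outer tap) as the inner filter does.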
%macro MBEDGE_LOOPFILTER 2
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr
%endif
%if cpuflag(ssse3)
    pxor        m7, m7
%endif
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ;          [3]=hev() result
                 ;          [4]=filter tmp result
                 ;          [5]/[6] = p2/q2 backup
                 ;          [7]=lim_res sign result
%assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15)
%else ; 8        ;          extra storage space for transposes
%assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15)
%endif
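; the pad expression reserves seven mmsize-wide spill slots (eight for
; MMX, which needs extra transpose scratch) plus enough slack, via
; x86inc's stack_offset bookkeeping, to leave rsp 16-byte aligned after
; the SUB below.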
    ; splat function arguments
    SPLATB_REG  m0, flimEq, m7   ; E
    SPLATB_REG  m1, flimIq, m7   ; I
    SPLATB_REG  m2, hevthrq, m7  ; hev_thresh
    SUB         rsp, pad
%define m_flimE    [rsp]
%define m_flimI    [rsp+mmsize]
%define m_hevthr   [rsp+mmsize*2]
%define m_maskres  [rsp+mmsize*3]
%define m_limres   [rsp+mmsize*4]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
%define m_p2backup [rsp+mmsize*5]
%define m_q2backup [rsp+mmsize*6]
%if mmsize == 16
%define m_limsign  [rsp]
%else
%define m_limsign  [rsp+mmsize*7]
%endif
    mova        m_flimE, m0
    mova        m_flimI, m1
    mova        m_hevthr, m2
%else ; sse2 on x86-64
%define m_flimE    m9
%define m_flimI    m10
%define m_hevthr   m11
%define m_maskres  m12
%define m_limres   m8
%define m_p0backup m12
%define m_q0backup m8
%define m_p2backup m13
%define m_q2backup m14
%define m_limsign  m9
    ; splat function arguments
    SPLATB_REG  m_flimE, flimEq, m7    ; E
    SPLATB_REG  m_flimI, flimIq, m7    ; I
    SPLATB_REG  m_hevthr, hevthrq, m7  ; hev_thresh
%endif
%if %2 == 8 ; chroma
    DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
    DEFINE_ARGS dst1, mstride, stride, dst2, cntr
    mov         cntrq, 2
%else
    DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
    mov         strideq, mstrideq
    neg         mstrideq
%ifidn %1, h
    lea         dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
    lea         dst8q, [dst8q+strideq*4-4]
%endif
%endif
%if mmsize == 8
.next8px:
%endif
    ; read
    lea         dst2q, [dst1q+ strideq ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
    movrow      m0, [dst1q+mstrideq*4]  ; p3
    movrow      m1, [dst2q+mstrideq*4]  ; p2
    movrow      m2, [dst1q+mstrideq*2]  ; p1
    movrow      m5, [dst2q]             ; q1
    movrow      m6, [dst2q+ strideq ]   ; q2
    movrow      m7, [dst2q+ strideq*2]  ; q3
%if mmsize == 16 && %2 == 8
    movhps      m0, [dst8q+mstrideq*4]
    movhps      m2, [dst8q+mstrideq*2]
    add         dst8q, strideq
    movhps      m1, [dst8q+mstrideq*4]
    movhps      m5, [dst8q]
    movhps      m6, [dst8q+ strideq ]
    movhps      m7, [dst8q+ strideq*2]
    add         dst8q, mstrideq
%endif
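    ; for SSE2 chroma, each U row sits in the low qword and the matching
    ; V row (loaded with movhps from dst8q) in the high qword, so both
    ; planes go through the filter in a single 16-byte pass.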
%elif mmsize == 8 ; mmx/mmxext (h)
    ; read 8 rows of 8px each
    movu        m0, [dst1q+mstrideq*4]
    movu        m1, [dst2q+mstrideq*4]
    movu        m2, [dst1q+mstrideq*2]
    movu        m3, [dst1q+mstrideq ]
    movu        m4, [dst1q]
    movu        m5, [dst2q]
    movu        m6, [dst2q+ strideq ]
    ; 8x8 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
    mova        m_q0backup, m1
    movu        m7, [dst2q+ strideq*2]
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY  dq, 0, 4, 1  ; p3/p2
    SBUTTERFLY  dq, 2, 6, 1  ; q0/q1
    SBUTTERFLY  dq, 3, 7, 1  ; q2/q3
    mova        m1, m_q0backup
    mova        m_q0backup, m2  ; store q0
    SBUTTERFLY  dq, 1, 5, 2  ; p1/p0
    mova        m_p0backup, m5  ; store p0
    SWAP        1, 4
    SWAP        2, 4
    SWAP        6, 3
    SWAP        5, 3
%else ; sse2 (h)
%if %2 == 16
    lea         dst8q, [dst1q+ strideq*8 ]
%endif
    ; read 16 rows of 8px each, interleave
    movh        m0, [dst1q+mstrideq*4]
    movh        m1, [dst8q+mstrideq*4]
    movh        m2, [dst1q+mstrideq*2]
    movh        m5, [dst8q+mstrideq*2]
    movh        m3, [dst1q+mstrideq ]
    movh        m6, [dst8q+mstrideq ]
    movh        m4, [dst1q]
    movh        m7, [dst8q]
    punpcklbw   m0, m1  ; A/I
    punpcklbw   m2, m5  ; C/K
    punpcklbw   m3, m6  ; D/L
    punpcklbw   m4, m7  ; E/M
    add         dst8q, strideq
    movh        m1, [dst2q+mstrideq*4]
    movh        m6, [dst8q+mstrideq*4]
    movh        m5, [dst2q]
    movh        m7, [dst8q]
    punpcklbw   m1, m6  ; B/J
    punpcklbw   m5, m7  ; F/N
    movh        m6, [dst2q+ strideq ]
    movh        m7, [dst8q+ strideq ]
    punpcklbw   m6, m7  ; G/O
    ; 8x16 transpose
    TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
    SWAP        1, 8
%else
    mova        m_q0backup, m1
%endif
    movh        m7, [dst2q+ strideq*2]
    movh        m1, [dst8q+ strideq*2]
    punpcklbw   m7, m1  ; H/P
    TRANSPOSE4x4B 4, 5, 6, 7, 1
    SBUTTERFLY  dq, 0, 4, 1  ; p3/p2
    SBUTTERFLY  dq, 2, 6, 1  ; q0/q1
    SBUTTERFLY  dq, 3, 7, 1  ; q2/q3
%ifdef m8
    SWAP        1, 8
    SWAP        2, 8
%else
    mova        m1, m_q0backup
    mova        m_q0backup, m2  ; store q0
%endif
    SBUTTERFLY  dq, 1, 5, 2  ; p1/p0
%ifdef m12
    SWAP        5, 12
%else
    mova        m_p0backup, m5  ; store p0
%endif
    SWAP        1, 4
    SWAP        2, 4
    SWAP        6, 3
    SWAP        5, 3
%endif
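    ; whichever path was taken, the edge now sits in registers as
    ; m0=p3, m1=p2, m2=p1, m5=q1, m6=q2, m7=q3; p0/q0 are picked up
    ; later, from memory in the vertical case or from the backup slots
    ; (m12/m8 on x86-64) after a horizontal transpose.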
    ; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
    mova        m4, m1
    SWAP        4, 1
    psubusb     m4, m0  ; p2-p3
    psubusb     m0, m1  ; p3-p2
    por         m0, m4  ; abs(p3-p2)
    mova        m4, m2
    SWAP        4, 2
    psubusb     m4, m1  ; p1-p2
    mova        m_p2backup, m1
    psubusb     m1, m2  ; p2-p1
    por         m1, m4  ; abs(p2-p1)
    mova        m4, m6
    SWAP        4, 6
    psubusb     m4, m7  ; q2-q3
    psubusb     m7, m6  ; q3-q2
    por         m7, m4  ; abs(q3-q2)
    mova        m4, m5
    SWAP        4, 5
    psubusb     m4, m6  ; q1-q2
    mova        m_q2backup, m6
    psubusb     m6, m5  ; q2-q1
    por         m6, m4  ; abs(q2-q1)
%if notcpuflag(mmx2)
    mova        m4, m_flimI
    pxor        m3, m3
    psubusb     m0, m4
    psubusb     m1, m4
    psubusb     m7, m4
    psubusb     m6, m4
    pcmpeqb     m0, m3  ; abs(p3-p2) <= I
    pcmpeqb     m1, m3  ; abs(p2-p1) <= I
    pcmpeqb     m7, m3  ; abs(q3-q2) <= I
    pcmpeqb     m6, m3  ; abs(q2-q1) <= I
    pand        m0, m1
    pand        m7, m6
    pand        m0, m7
%else ; mmxext/sse2
    pmaxub      m0, m1
    pmaxub      m6, m7
    pmaxub      m0, m6
%endif
    ; normal_limit and high_edge_variance for p1-p0, q1-q0
    SWAP        7, 3  ; now m7 is zero
%ifidn %1, v
    movrow      m3, [dst1q+mstrideq ]  ; p0
%if mmsize == 16 && %2 == 8
    movhps      m3, [dst8q+mstrideq ]
%endif
%elifdef m12
    SWAP        3, 12
%else
    mova        m3, m_p0backup
%endif
    mova        m1, m2
    SWAP        1, 2
    mova        m6, m3
    SWAP        3, 6
    psubusb     m1, m3  ; p1-p0
    psubusb     m6, m2  ; p0-p1
    por         m1, m6  ; abs(p1-p0)
%if notcpuflag(mmx2)
    mova        m6, m1
    psubusb     m1, m4
    psubusb     m6, m_hevthr
    pcmpeqb     m1, m7  ; abs(p1-p0) <= I
    pcmpeqb     m6, m7  ; abs(p1-p0) <= hev_thresh
    pand        m0, m1
    mova        m_maskres, m6
%else ; mmxext/sse2
    pmaxub      m0, m1  ; max_I
    SWAP        1, 4    ; max_hev_thresh
%endif
    SWAP        6, 4    ; now m6 is I
%ifidn %1, v
    movrow      m4, [dst1q]  ; q0
%if mmsize == 16 && %2 == 8
    movhps      m4, [dst8q]
%endif
%elifdef m8
    SWAP        4, 8
%else
    mova        m4, m_q0backup
%endif
    mova        m1, m4
    SWAP        1, 4
    mova        m7, m5
    SWAP        7, 5
    psubusb     m1, m5  ; q0-q1
    psubusb     m7, m4  ; q1-q0
    por         m1, m7  ; abs(q1-q0)
%if notcpuflag(mmx2)
    mova        m7, m1
    psubusb     m1, m6
    psubusb     m7, m_hevthr
    pxor        m6, m6
    pcmpeqb     m1, m6  ; abs(q1-q0) <= I
    pcmpeqb     m7, m6  ; abs(q1-q0) <= hev_thresh
    mova        m6, m_maskres
    pand        m0, m1  ; abs([pq][321]-[pq][210]) <= I
    pand        m6, m7
%else ; mmxext/sse2
    pxor        m7, m7
    pmaxub      m0, m1
    pmaxub      m6, m1
    psubusb     m0, m_flimI
    psubusb     m6, m_hevthr
    pcmpeqb     m0, m7  ; max(abs(..)) <= I
    pcmpeqb     m6, m7  ; !(max(abs..) > thresh)
%endif
%ifdef m12
    SWAP        6, 12
%else
    mova        m_maskres, m6  ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
    ; simple_limit
    mova        m1, m3
    SWAP        1, 3
    mova        m6, m4  ; keep copies of p0/q0 around for later use
    SWAP        6, 4
    psubusb     m1, m4  ; p0-q0
    psubusb     m6, m3  ; q0-p0
    por         m1, m6  ; abs(q0-p0)
    paddusb     m1, m1  ; m1=2*abs(q0-p0)
    mova        m7, m2
    SWAP        7, 2
    mova        m6, m5
    SWAP        6, 5
    psubusb     m7, m5  ; p1-q1
    psubusb     m6, m2  ; q1-p1
    por         m7, m6  ; abs(q1-p1)
    pxor        m6, m6
    pand        m7, [pb_FE]
    psrlq       m7, 1   ; abs(q1-p1)/2
    paddusb     m7, m1  ; abs(q0-p0)*2+abs(q1-p1)/2
    psubusb     m7, m_flimE
    pcmpeqb     m7, m6  ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
    pand        m0, m7  ; normal_limit result
    ; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
    mova        m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
    mova        m1, m4
    mova        m7, m3
    pxor        m1, m_pb_80
    pxor        m7, m_pb_80
    psubsb      m1, m7  ; (signed) q0-p0
    mova        m6, m2
    mova        m7, m5
    pxor        m6, m_pb_80
    pxor        m7, m_pb_80
    psubsb      m6, m7  ; (signed) p1-q1
    mova        m7, m_maskres
    paddsb      m6, m1
    paddsb      m6, m1
    paddsb      m6, m1
    pand        m6, m0
%ifdef m8
    mova        m_limres, m6  ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
    pand        m_limres, m7
%else
    mova        m0, m6
    pand        m0, m7
    mova        m_limres, m0
%endif
    pandn       m7, m6  ; 3*(q0-p0)+(p1-q1) masked for filter_common
    mova        m1, [pb_F8]
    mova        m6, m7
    paddsb      m7, [pb_3]
    paddsb      m6, [pb_4]
    pand        m7, m1
    pand        m6, m1
    pxor        m1, m1
    pxor        m0, m0
    pcmpgtb     m1, m7
    psubb       m0, m7
    psrlq       m7, 3   ; +f2
    psrlq       m0, 3   ; -f2
    pand        m0, m1
    pandn       m1, m7
    psubusb     m3, m0
    paddusb     m3, m1  ; p0+f2
    pxor        m1, m1
    pxor        m0, m0
    pcmpgtb     m0, m6
    psubb       m1, m6
    psrlq       m6, 3   ; +f1
    psrlq       m1, 3   ; -f1
    pand        m1, m0
    pandn       m0, m6
    psubusb     m4, m0
    paddusb     m4, m1  ; q0-f1
    ; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
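    ; per the VP8 spec, with w = clamp(3*(q0-p0)+(p1-q1)) the strong
    ; filter applies
    ;     a0 = (27*w + 63) >> 7   to p0/q0,
    ;     a1 = (18*w + 63) >> 7   to p1/q1,
    ;     a2 = ( 9*w + 63) >> 7   to p2/q2.
    ; SSSE3 interleaves each w byte with a constant 1 so that a single
    ; pmaddubsw against pb_27_63/pb_18_63/pb_9_63 produces 27*w+63 etc.
    ; per word; the older paths multiply with pw_27/pw_18/pw_9 and add a
    ; pw_63 rounding bias instead.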
%if cpuflag(ssse3)
    mova        m7, [pb_1]
%else
    mova        m7, [pw_63]
%endif
%ifdef m8
    SWAP        1, 8
%else
    mova        m1, m_limres
%endif
    pxor        m0, m0
    mova        m6, m1
    pcmpgtb     m0, m1  ; which are negative
%if cpuflag(ssse3)
    punpcklbw   m6, m7  ; interleave with "1" for rounding
    punpckhbw   m1, m7
%else
    punpcklbw   m6, m0  ; signed byte->word
    punpckhbw   m1, m0
%endif
    mova        m_limsign, m0
%if cpuflag(ssse3)
    mova        m7, [pb_27_63]
%ifndef m8
    mova        m_limres, m1
%endif
%ifdef m10
    SWAP        0, 10  ; don't lose lim_sign copy
%endif
    mova        m0, m7
    pmaddubsw   m7, m6
    SWAP        6, 7
    pmaddubsw   m0, m1
    SWAP        1, 0
%ifdef m10
    SWAP        0, 10
%else
    mova        m0, m_limsign
%endif
%else
    mova        m_maskres, m6  ; backup for later in filter
    mova        m_limres, m1
    pmullw      m6, [pw_27]
    pmullw      m1, [pw_27]
    paddw       m6, m7
    paddw       m1, m7
%endif
    psraw       m6, 7
    psraw       m1, 7
    packsswb    m6, m1  ; a0
    pxor        m1, m1
    psubb       m1, m6
    pand        m1, m0  ; -a0
    pandn       m0, m6  ; +a0
%if cpuflag(ssse3)
    mova        m6, [pb_18_63]  ; pipelining
%endif
    psubusb     m3, m1
    paddusb     m4, m1
    paddusb     m3, m0  ; p0+a0
    psubusb     m4, m0  ; q0-a0
%if cpuflag(ssse3)
    SWAP        6, 7
%ifdef m10
    SWAP        1, 10
%else
    mova        m1, m_limres
%endif
    mova        m0, m7
    pmaddubsw   m7, m6
    SWAP        6, 7
    pmaddubsw   m0, m1
    SWAP        1, 0
%ifdef m10
    SWAP        0, 10
%endif
    mova        m0, m_limsign
%else
    mova        m6, m_maskres
    mova        m1, m_limres
    pmullw      m6, [pw_18]
    pmullw      m1, [pw_18]
    paddw       m6, m7
    paddw       m1, m7
%endif
    mova        m0, m_limsign
    psraw       m6, 7
    psraw       m1, 7
    packsswb    m6, m1  ; a1
    pxor        m1, m1
    psubb       m1, m6
    pand        m1, m0  ; -a1
    pandn       m0, m6  ; +a1
%if cpuflag(ssse3)
    mova        m6, [pb_9_63]
%endif
    psubusb     m2, m1
    paddusb     m5, m1
    paddusb     m2, m0  ; p1+a1
    psubusb     m5, m0  ; q1-a1
%if cpuflag(ssse3)
    SWAP        6, 7
%ifdef m10
    SWAP        1, 10
%else
    mova        m1, m_limres
%endif
    mova        m0, m7
    pmaddubsw   m7, m6
    SWAP        6, 7
    pmaddubsw   m0, m1
    SWAP        1, 0
%else
%ifdef m8
    SWAP        6, 12
    SWAP        1, 8
%else
    mova        m6, m_maskres
    mova        m1, m_limres
%endif
    pmullw      m6, [pw_9]
    pmullw      m1, [pw_9]
    paddw       m6, m7
    paddw       m1, m7
%endif
%ifdef m9
    SWAP        7, 9
%else
    mova        m7, m_limsign
%endif
    psraw       m6, 7
    psraw       m1, 7
    packsswb    m6, m1  ; a2
    pxor        m0, m0
    psubb       m0, m6
    pand        m0, m7  ; -a2
    pandn       m7, m6  ; +a2
%ifdef m8
    SWAP        1, 13
    SWAP        6, 14
%else
    mova        m1, m_p2backup
    mova        m6, m_q2backup
%endif
    psubusb     m1, m0
    paddusb     m6, m0
    paddusb     m1, m7  ; p2+a2
    psubusb     m6, m7  ; q2-a2
    ; store
%ifidn %1, v
    movrow      [dst2q+mstrideq*4], m1
    movrow      [dst1q+mstrideq*2], m2
    movrow      [dst1q+mstrideq ], m3
    movrow      [dst1q], m4
    movrow      [dst2q], m5
    movrow      [dst2q+ strideq ], m6
%if mmsize == 16 && %2 == 8
    add         dst8q, mstrideq
    movhps      [dst8q+mstrideq*2], m1
    movhps      [dst8q+mstrideq ], m2
    movhps      [dst8q], m3
    add         dst8q, strideq
    movhps      [dst8q], m4
    movhps      [dst8q+ strideq ], m5
    movhps      [dst8q+ strideq*2], m6
%endif
%else ; h
    inc         dst1q
    inc         dst2q
    ; 4x8/16 transpose
    TRANSPOSE4x4B 1, 2, 3, 4, 0
    SBUTTERFLY  bw, 5, 6, 0
%if mmsize == 8 ; mmx/mmxext (h)
    WRITE_4x2D  1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
    add         dst1q, 4
    WRITE_2x4W  m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
    lea         dst8q, [dst8q+mstrideq+1]
    WRITE_4x4D  1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
    lea         dst1q, [dst2q+mstrideq+4]
    lea         dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
    add         dst2q, 4
%endif
    WRITE_8W    m5, dst2q, dst1q, mstrideq, strideq
%if cpuflag(sse4)
    lea         dst2q, [dst8q+ strideq ]
%endif
    WRITE_8W    m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif
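    ; in the horizontal case the six updated pixels per row (p2..q2) were
    ; rebuilt with TRANSPOSE4x4B plus an SBUTTERFLY for the q1/q2 pair,
    ; then scattered back across the rows by the WRITE_* helpers.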
%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
    sub         dst1q, 5
%endif
    cmp         dst1q, dst8q
    mov         dst1q, dst8q
    jnz         .next8px
%else
%ifidn %1, h
    lea         dst1q, [dst1q+ strideq*8-5]
%else ; v
    add         dst1q, 8
%endif
    dec         cntrq
    jg          .next8px
%endif
%endif
%ifndef m8 ; sse2 on x86-32 or mmx/mmxext
    ADD         rsp, pad
%endif
    RET
%endmacro
%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
INIT_MMX mmx2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
%endif
INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8