You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1372 lines
38KB

  1. ;******************************************************************************
  2. ;* VP8 MMXEXT optimizations
  3. ;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
  4. ;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
  5. ;*
  6. ;* This file is part of FFmpeg.
  7. ;*
  8. ;* FFmpeg is free software; you can redistribute it and/or
  9. ;* modify it under the terms of the GNU Lesser General Public
  10. ;* License as published by the Free Software Foundation; either
  11. ;* version 2.1 of the License, or (at your option) any later version.
  12. ;*
  13. ;* FFmpeg is distributed in the hope that it will be useful,
  14. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. ;* Lesser General Public License for more details.
  17. ;*
  18. ;* You should have received a copy of the GNU Lesser General Public
  19. ;* License along with FFmpeg; if not, write to the Free Software
  20. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. ;******************************************************************************
  22. %include "x86inc.asm"
  23. %include "x86util.asm"
  ; Read-only data. The subpel tap tables come in three layouts, one per
  ; multiply instruction used by the code below:
  ;   *_hw_m : 16-bit tap pairs, consumed with pmaddwd
  ;   *_hb_m : 8-bit tap pairs, consumed with pmaddubsw
  ;   *_v_m  : one 16-bit tap repeated across a 16-byte row, consumed with pmullw
  ; Every "times" line below emits exactly 16 bytes.
  24. SECTION_RODATA
  ; 4-tap filters, word pairs (F0,F1) then (F2,F3): 2 rows per filter
  25. fourtap_filter_hw_m: times 4 dw -6, 123
  26. times 4 dw 12, -1
  27. times 4 dw -9, 93
  28. times 4 dw 50, -6
  29. times 4 dw -6, 50
  30. times 4 dw 93, -9
  31. times 4 dw -1, 12
  32. times 4 dw 123, -6
  ; 6-tap filters, word pairs (F0,F1), (F2,F3), (F4,F5): 3 rows per filter
  33. sixtap_filter_hw_m: times 4 dw 2, -11
  34. times 4 dw 108, 36
  35. times 4 dw -8, 1
  36. times 4 dw 3, -16
  37. times 4 dw 77, 77
  38. times 4 dw -16, 3
  39. times 4 dw 1, -8
  40. times 4 dw 36, 108
  41. times 4 dw -11, 2
  ; 4-tap filters, byte pairs (F0,F1) then (F2,F3): 2 rows per filter
  42. fourtap_filter_hb_m: times 8 db -6, 123
  43. times 8 db 12, -1
  44. times 8 db -9, 93
  45. times 8 db 50, -6
  46. times 8 db -6, 50
  47. times 8 db 93, -9
  48. times 8 db -1, 12
  49. times 8 db 123, -6
  ; 6-tap filters, byte pairs ordered (F0,F5), (F1,F2), (F3,F4) so that they
  ; line up with the pixel pairings of filter_h6_shuf1/2/3 below: 3 rows per filter
  50. sixtap_filter_hb_m: times 8 db 2, 1
  51. times 8 db -11, 108
  52. times 8 db 36, -8
  53. times 8 db 3, 3
  54. times 8 db -16, 77
  55. times 8 db 77, -16
  56. times 8 db 1, 2
  57. times 8 db -8, 36
  58. times 8 db 108, -11
  ; 4-tap filters, one tap per row: 4 rows per filter
  59. fourtap_filter_v_m: times 8 dw -6
  60. times 8 dw 123
  61. times 8 dw 12
  62. times 8 dw -1
  63. times 8 dw -9
  64. times 8 dw 93
  65. times 8 dw 50
  66. times 8 dw -6
  67. times 8 dw -6
  68. times 8 dw 50
  69. times 8 dw 93
  70. times 8 dw -9
  71. times 8 dw -1
  72. times 8 dw 12
  73. times 8 dw 123
  74. times 8 dw -6
  ; 6-tap filters, one tap per row: 6 rows per filter
  75. sixtap_filter_v_m: times 8 dw 2
  76. times 8 dw -11
  77. times 8 dw 108
  78. times 8 dw 36
  79. times 8 dw -8
  80. times 8 dw 1
  81. times 8 dw 3
  82. times 8 dw -16
  83. times 8 dw 77
  84. times 8 dw 77
  85. times 8 dw -16
  86. times 8 dw 3
  87. times 8 dw 1
  88. times 8 dw -8
  89. times 8 dw 36
  90. times 8 dw 108
  91. times 8 dw -11
  92. times 8 dw 2
  ; bilinear weights 1..7 as words, one weight per row
  93. bilinear_filter_vw_m: times 8 dw 1
  94. times 8 dw 2
  95. times 8 dw 3
  96. times 8 dw 4
  97. times 8 dw 5
  98. times 8 dw 6
  99. times 8 dw 7
  ; bilinear weights as byte pairs (8-k, k) for k = 1..7
  100. bilinear_filter_vb_m: times 8 db 7, 1
  101. times 8 db 6, 2
  102. times 8 db 5, 3
  103. times 8 db 4, 4
  104. times 8 db 3, 5
  105. times 8 db 2, 6
  106. times 8 db 1, 7
  ; Under PIC every table alias resolves to r11; each function that indexes a
  ; table first does "lea r11, [<table>_m]" so that [alias+index] is legal.
  ; Without PIC the aliases are simply the table labels themselves.
  107. %ifdef PIC
  108. %define fourtap_filter_hw r11
  109. %define sixtap_filter_hw r11
  110. %define fourtap_filter_hb r11
  111. %define sixtap_filter_hb r11
  112. %define fourtap_filter_v r11
  113. %define sixtap_filter_v r11
  114. %define bilinear_filter_vw r11
  115. %define bilinear_filter_vb r11
  116. %else
  117. %define fourtap_filter_hw fourtap_filter_hw_m
  118. %define sixtap_filter_hw sixtap_filter_hw_m
  119. %define fourtap_filter_hb fourtap_filter_hb_m
  120. %define sixtap_filter_hb sixtap_filter_hb_m
  121. %define fourtap_filter_v fourtap_filter_v_m
  122. %define sixtap_filter_v sixtap_filter_v_m
  123. %define bilinear_filter_vw bilinear_filter_vw_m
  124. %define bilinear_filter_vb bilinear_filter_vb_m
  125. %endif
  ; pshufb masks that turn a run of source bytes into the adjacent-pixel pairs
  ; expected by pmaddubsw against the *_hb tables
  126. filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
  127. filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
  128. filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
  129. filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
  130. filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
  ; IDCT multiplier words (loaded into m6/m7 by vp8_idct_add_mmx)
  131. pw_20091: times 4 dw 20091
  132. pw_17734: times 4 dw 17734
  ; constants defined in another object file
  133. cextern pw_3
  134. cextern pb_3
  135. cextern pw_4
  136. cextern pb_4
  137. cextern pw_64
  138. cextern pb_80
  139. cextern pb_F8
  140. cextern pb_FE
  141. SECTION .text
  142. ;-----------------------------------------------------------------------------
  143. ; subpel MC functions:
  144. ;
  145. ; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
  146. ; uint8_t *src, int srcstride,
  147. ; int height, int mx, int my);
  148. ;-----------------------------------------------------------------------------
  ; FILTER_SSSE3 width, regs_a, regs_b
  ;   %1 = block width in pixels (instantiated below with 4 and 8)
  ;   %2 = vector-register count given to cglobal for the h6/v4/v6 functions
  ;   %3 = vector-register count given to cglobal for the h4 function
  ; Emits put_vp8_epel%1_{h6,h4,v4,v6}_ssse3. Register roles follow the
  ; prototype in the comment above (assuming x86inc's rN = N-th argument):
  ; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx, r6=my.
  ; All four functions compute (sum of taps*pixels + 64) >> 7 with unsigned
  ; saturation and store one row per loop iteration.
  ; The row counter is decremented through r4d because height is an int.
  149. %macro FILTER_SSSE3 3
  ; ---- horizontal 6-tap ----
  150. cglobal put_vp8_epel%1_h6_ssse3, 6, 6, %2
  151. lea r5d, [r5*3] ; r5 = mx*3; scaled by 8 below = 24 bytes per mx step
  152. mova m3, [filter_h6_shuf2]
  153. mova m4, [filter_h6_shuf3]
  154. %ifdef PIC
  155. lea r11, [sixtap_filter_hb_m]
  156. %endif
  157. mova m5, [sixtap_filter_hb+r5*8-48] ; set up 6tap filter in bytes
  158. mova m6, [sixtap_filter_hb+r5*8-32]
  159. mova m7, [sixtap_filter_hb+r5*8-16]
  160. .nextrow:
  161. movu m0, [r2-2]
  162. mova m1, m0
  163. mova m2, m0
  164. %ifidn %1, 4
  165. ; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
  166. ; shuffle with a memory operand
  167. punpcklbw m0, [r2+3]
  168. %else
  169. pshufb m0, [filter_h6_shuf1]
  170. %endif
  171. pshufb m1, m3
  172. pshufb m2, m4
  173. pmaddubsw m0, m5
  174. pmaddubsw m1, m6
  175. pmaddubsw m2, m7
  176. paddsw m0, m1
  177. paddsw m0, m2
  178. paddsw m0, [pw_64]
  179. psraw m0, 7
  180. packuswb m0, m0
  181. movh [r0], m0 ; store
  182. ; go to next line
  183. add r0, r1
  184. add r2, r3
  185. dec r4d ; next row
  186. jg .nextrow
  187. REP_RET
  ; ---- horizontal 4-tap ----
  188. cglobal put_vp8_epel%1_h4_ssse3, 6, 6, %3
  189. shl r5d, 4 ; r5 = mx*16: one 16-byte table row per mx step
  190. mova m2, [pw_64]
  191. mova m3, [filter_h2_shuf]
  192. mova m4, [filter_h4_shuf]
  193. %ifdef PIC
  194. lea r11, [fourtap_filter_hb_m]
  195. %endif
  196. mova m5, [fourtap_filter_hb+r5-16] ; set up 4tap filter in bytes
  197. mova m6, [fourtap_filter_hb+r5]
  198. .nextrow:
  199. movu m0, [r2-1]
  200. mova m1, m0
  201. pshufb m0, m3
  202. pshufb m1, m4
  203. pmaddubsw m0, m5
  204. pmaddubsw m1, m6
  205. paddsw m0, m2
  206. paddsw m0, m1
  207. psraw m0, 7
  208. packuswb m0, m0
  209. movh [r0], m0 ; store
  210. ; go to next line
  211. add r0, r1
  212. add r2, r3
  213. dec r4d ; next row
  214. jg .nextrow
  215. REP_RET
  ; ---- vertical 4-tap: m0..m2 carry the previous three rows between iterations ----
  216. cglobal put_vp8_epel%1_v4_ssse3, 7, 7, %2
  217. shl r6d, 4 ; r6 = my*16
  218. %ifdef PIC
  219. lea r11, [fourtap_filter_hb_m]
  220. %endif
  221. mova m5, [fourtap_filter_hb+r6-16]
  222. mova m6, [fourtap_filter_hb+r6]
  223. mova m7, [pw_64]
  224. ; read 3 lines
  225. sub r2, r3
  226. movh m0, [r2]
  227. movh m1, [r2+ r3]
  228. movh m2, [r2+2*r3]
  229. add r2, r3
  230. .nextrow:
  231. movh m3, [r2+2*r3] ; read new row
  232. mova m4, m0
  233. mova m0, m1
  234. punpcklbw m4, m1 ; interleave rows 0/1 for the (F0,F1) byte pairs
  235. mova m1, m2
  236. punpcklbw m2, m3 ; interleave rows 2/3 for the (F2,F3) byte pairs
  237. pmaddubsw m4, m5
  238. pmaddubsw m2, m6
  239. paddsw m4, m2
  240. mova m2, m3
  241. paddsw m4, m7
  242. psraw m4, 7
  243. packuswb m4, m4
  244. movh [r0], m4
  245. ; go to next line
  246. add r0, r1
  247. add r2, r3
  248. dec r4d ; next row
  249. jg .nextrow
  250. REP_RET
  ; ---- vertical 6-tap: m0..m4 carry the previous five rows between iterations ----
  251. cglobal put_vp8_epel%1_v6_ssse3, 7, 7, %2
  252. lea r6d, [r6*3]
  253. %ifdef PIC
  254. lea r11, [sixtap_filter_hb_m]
  255. %endif
  256. lea r6, [sixtap_filter_hb+r6*8] ; r6 = table + my*24; rows are read at r6-48/-32/-16
  257. ; read 5 lines
  258. sub r2, r3
  259. sub r2, r3
  260. movh m0, [r2]
  261. movh m1, [r2+r3]
  262. movh m2, [r2+r3*2]
  263. lea r2, [r2+r3*2]
  264. add r2, r3
  265. movh m3, [r2]
  266. movh m4, [r2+r3]
  267. .nextrow:
  268. movh m5, [r2+2*r3] ; read new row
  269. mova m6, m0
  270. punpcklbw m6, m5 ; rows 0/5 -> (F0,F5) pairs
  271. mova m0, m1
  272. punpcklbw m1, m2 ; rows 1/2 -> (F1,F2) pairs
  273. mova m7, m3
  274. punpcklbw m7, m4 ; rows 3/4 -> (F3,F4) pairs
  275. pmaddubsw m6, [r6-48]
  276. pmaddubsw m1, [r6-32]
  277. pmaddubsw m7, [r6-16]
  278. paddsw m6, m1
  279. paddsw m6, m7
  280. mova m1, m2
  281. paddsw m6, [pw_64]
  282. mova m2, m3
  283. psraw m6, 7
  284. mova m3, m4
  285. packuswb m6, m6
  286. mova m4, m5
  287. movh [r0], m6
  288. ; go to next line
  289. add r0, r1
  290. add r2, r3
  291. dec r4d ; next row
  292. jg .nextrow
  293. REP_RET
  294. %endmacro
  295. INIT_MMX
  296. FILTER_SSSE3 4, 0, 0
  297. INIT_XMM
  298. FILTER_SSSE3 8, 8, 7
  299. ; 4x4 block, H-only 4-tap filter
  ; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx (per the
  ; prototype comment above, assuming x86inc's rN = N-th argument).
  ; Each row is produced as two pairs of pixels: the pixels are widened to
  ; words, arranged as overlapping pairs with pshufw, multiplied against the
  ; (F0,F1) and (F2,F3) tap pairs with pmaddwd, then rounded (+64, >>7) and
  ; clipped to bytes. The row counter uses r4d because height is an int.
  300. cglobal put_vp8_epel4_h4_mmxext, 6, 6
  301. shl r5d, 4 ; r5 = mx*16: one 16-byte table row per mx step
  302. %ifdef PIC
  303. lea r11, [fourtap_filter_hw_m]
  304. %endif
  305. movq mm4, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
  306. movq mm5, [fourtap_filter_hw+r5]
  307. movq mm7, [pw_64]
  308. pxor mm6, mm6
  309. .nextrow:
  310. movq mm1, [r2-1] ; (ABCDEFGH) load 8 horizontal pixels
  311. ; first set of 2 pixels
  312. movq mm2, mm1 ; byte ABCD..
  313. punpcklbw mm1, mm6 ; byte->word ABCD
  314. pshufw mm0, mm2, 9 ; byte CDEF..
  315. punpcklbw mm0, mm6 ; byte->word CDEF
  316. pshufw mm3, mm1, 0x94 ; word ABBC
  317. pshufw mm1, mm0, 0x94 ; word CDDE
  318. pmaddwd mm3, mm4 ; multiply 2px with F0/F1
  319. movq mm0, mm1 ; backup for second set of pixels
  320. pmaddwd mm1, mm5 ; multiply 2px with F2/F3
  321. paddd mm3, mm1 ; finish 1st 2px
  322. ; second set of 2 pixels, use backup of above
  323. punpckhbw mm2, mm6 ; byte->word EFGH
  324. pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
  325. pshufw mm1, mm2, 0x94 ; word EFFG
  326. pmaddwd mm1, mm5 ; multiply 2px with F2/F3
  327. paddd mm0, mm1 ; finish 2nd 2px
  328. ; merge two sets of 2 pixels into one set of 4, round/clip/store
  329. packssdw mm3, mm0 ; merge dword->word (4px)
  330. paddsw mm3, mm7 ; rounding
  331. psraw mm3, 7
  332. packuswb mm3, mm6 ; clip and word->bytes
  333. movd [r0], mm3 ; store
  334. ; go to next line
  335. add r0, r1
  336. add r2, r3
  337. dec r4d ; next row
  338. jg .nextrow
  339. REP_RET
  340. ; 4x4 block, H-only 6-tap filter
  ; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx (per the
  ; prototype comment above, assuming x86inc's rN = N-th argument).
  ; Same two-pixels-at-a-time scheme as the 4-tap version, with a third tap
  ; pair (F4,F5). mm3 doubles as the zero register and as scratch, so it is
  ; re-zeroed inside the loop before it is used for unpacking/packing again.
  ; The row counter uses r4d because height is an int.
  341. cglobal put_vp8_epel4_h6_mmxext, 6, 6
  342. lea r5d, [r5*3] ; r5 = mx*3; scaled by 8 below = 24 bytes per mx step
  343. %ifdef PIC
  344. lea r11, [sixtap_filter_hw_m]
  345. %endif
  346. movq mm4, [sixtap_filter_hw+r5*8-48] ; set up 6tap filter in words
  347. movq mm5, [sixtap_filter_hw+r5*8-32]
  348. movq mm6, [sixtap_filter_hw+r5*8-16]
  349. movq mm7, [pw_64]
  350. pxor mm3, mm3
  351. .nextrow:
  352. movq mm1, [r2-2] ; (ABCDEFGH) load 8 horizontal pixels
  353. ; first set of 2 pixels
  354. movq mm2, mm1 ; byte ABCD..
  355. punpcklbw mm1, mm3 ; byte->word ABCD
  356. pshufw mm0, mm2, 0x9 ; byte CDEF..
  357. punpckhbw mm2, mm3 ; byte->word EFGH
  358. punpcklbw mm0, mm3 ; byte->word CDEF
  359. pshufw mm1, mm1, 0x94 ; word ABBC
  360. pshufw mm2, mm2, 0x94 ; word EFFG
  361. pmaddwd mm1, mm4 ; multiply 2px with F0/F1
  362. pshufw mm3, mm0, 0x94 ; word CDDE
  363. movq mm0, mm3 ; backup for second set of pixels
  364. pmaddwd mm3, mm5 ; multiply 2px with F2/F3
  365. paddd mm1, mm3 ; add to 1st 2px cache
  366. movq mm3, mm2 ; backup for second set of pixels
  367. pmaddwd mm2, mm6 ; multiply 2px with F4/F5
  368. paddd mm1, mm2 ; finish 1st 2px
  369. ; second set of 2 pixels, use backup of above
  370. movd mm2, [r2+3] ; byte FGHI (prevent overreads)
  371. pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
  372. pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
  373. paddd mm0, mm3 ; add to 2nd 2px cache
  374. pxor mm3, mm3
  375. punpcklbw mm2, mm3 ; byte->word FGHI
  376. pshufw mm2, mm2, 0xE9 ; word GHHI
  377. pmaddwd mm2, mm6 ; multiply 2px with F4/F5
  378. paddd mm0, mm2 ; finish 2nd 2px
  379. ; merge two sets of 2 pixels into one set of 4, round/clip/store
  380. packssdw mm1, mm0 ; merge dword->word (4px)
  381. paddsw mm1, mm7 ; rounding
  382. psraw mm1, 7
  383. packuswb mm1, mm3 ; clip and word->bytes
  384. movd [r0], mm1 ; store
  385. ; go to next line
  386. add r0, r1
  387. add r2, r3
  388. dec r4d ; next row
  389. jg .nextrow
  390. REP_RET
  391. ; 8-pixel-wide block, H-only 4-tap filter
  ; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx (per the
  ; prototype comment above, assuming x86inc's rN = N-th argument).
  ; Each row is computed as two groups of four output pixels: the first group
  ; from the 8 bytes at src-1, the second from the 8 bytes at src+3. Within a
  ; group the widened pixels are shifted and interleaved into overlapping
  ; pairs, multiplied with the (F0,F1)/(F2,F3) tap pairs, then rounded
  ; (+64, >>7) and clipped. The row counter uses r4d because height is an int.
  392. INIT_XMM
  393. cglobal put_vp8_epel8_h4_sse2, 6, 6, 8
  394. shl r5d, 4 ; r5 = mx*16: one 16-byte table row per mx step
  395. %ifdef PIC
  396. lea r11, [fourtap_filter_hw_m]
  397. %endif
  398. mova m5, [fourtap_filter_hw+r5-16] ; set up 4tap filter in words
  399. mova m6, [fourtap_filter_hw+r5]
  400. pxor m7, m7
  401. .nextrow:
  402. movh m0, [r2-1]
  403. punpcklbw m0, m7 ; ABCDEFGH
  404. mova m1, m0
  405. mova m2, m0
  406. mova m3, m0
  407. psrldq m1, 2 ; BCDEFGH
  408. psrldq m2, 4 ; CDEFGH
  409. psrldq m3, 6 ; DEFGH
  410. punpcklwd m0, m1 ; ABBCCDDE
  411. punpcklwd m2, m3 ; CDDEEFFG
  412. pmaddwd m0, m5
  413. pmaddwd m2, m6
  414. paddd m0, m2
  ; second group of four pixels, same letters relative to src+3
  415. movh m1, [r2+3]
  416. punpcklbw m1, m7 ; ABCDEFGH
  417. mova m2, m1
  418. mova m3, m1
  419. mova m4, m1
  420. psrldq m2, 2 ; BCDEFGH
  421. psrldq m3, 4 ; CDEFGH
  422. psrldq m4, 6 ; DEFGH
  423. punpcklwd m1, m2 ; ABBCCDDE
  424. punpcklwd m3, m4 ; CDDEEFFG
  425. pmaddwd m1, m5
  426. pmaddwd m3, m6
  427. paddd m1, m3
  428. packssdw m0, m1
  429. paddsw m0, [pw_64]
  430. psraw m0, 7
  431. packuswb m0, m7
  432. movh [r0], m0 ; store
  433. ; go to next line
  434. add r0, r1
  435. add r2, r3
  436. dec r4d ; next row
  437. jg .nextrow
  438. REP_RET
  ; 8-pixel-wide block, H-only 6-tap filter
  ; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r5=mx (per the
  ; prototype comment above, assuming x86inc's rN = N-th argument).
  ; One 16-byte load at src-2 feeds both halves of the row: m0/m4 derive the
  ; first four output pixels, then the copy kept in m6 is shifted by four
  ; bytes to derive the second four. Tap pairs are read from memory at
  ; r5-48/-32/-16 because all eight vector registers are in use.
  ; The row counter uses r4d because height is an int.
  439. cglobal put_vp8_epel8_h6_sse2, 6, 6, 8
  440. lea r5d, [r5*3]
  441. %ifdef PIC
  442. lea r11, [sixtap_filter_hw_m]
  443. %endif
  444. lea r5, [sixtap_filter_hw+r5*8] ; r5 = table + mx*24
  445. pxor m7, m7
  446. .nextrow:
  447. movu m0, [r2-2]
  448. mova m6, m0
  449. mova m4, m0
  450. punpcklbw m0, m7 ; ABCDEFGHI
  451. mova m1, m0
  452. mova m2, m0
  453. mova m3, m0
  454. psrldq m1, 2 ; BCDEFGH
  455. psrldq m2, 4 ; CDEFGH
  456. psrldq m3, 6 ; DEFGH
  457. psrldq m4, 4
  458. punpcklbw m4, m7 ; EFGH
  459. mova m5, m4
  460. psrldq m5, 2 ; FGH
  461. punpcklwd m0, m1 ; ABBCCDDE
  462. punpcklwd m2, m3 ; CDDEEFFG
  463. punpcklwd m4, m5 ; EFFGGHHI
  464. pmaddwd m0, [r5-48]
  465. pmaddwd m2, [r5-32]
  466. pmaddwd m4, [r5-16]
  467. paddd m0, m2
  468. paddd m0, m4
  ; second group of four pixels: same sequence on the source shifted by 4 bytes
  469. psrldq m6, 4
  470. mova m4, m6
  471. punpcklbw m6, m7 ; ABCDEFGHI
  472. mova m1, m6
  473. mova m2, m6
  474. mova m3, m6
  475. psrldq m1, 2 ; BCDEFGH
  476. psrldq m2, 4 ; CDEFGH
  477. psrldq m3, 6 ; DEFGH
  478. psrldq m4, 4
  479. punpcklbw m4, m7 ; EFGH
  480. mova m5, m4
  481. psrldq m5, 2 ; FGH
  482. punpcklwd m6, m1 ; ABBCCDDE
  483. punpcklwd m2, m3 ; CDDEEFFG
  484. punpcklwd m4, m5 ; EFFGGHHI
  485. pmaddwd m6, [r5-48]
  486. pmaddwd m2, [r5-32]
  487. pmaddwd m4, [r5-16]
  488. paddd m6, m2
  489. paddd m6, m4
  490. packssdw m0, m6
  491. paddsw m0, [pw_64]
  492. psraw m0, 7
  493. packuswb m0, m7
  494. movh [r0], m0 ; store
  495. ; go to next line
  496. add r0, r1
  497. add r2, r3
  498. dec r4d ; next row
  499. jg .nextrow
  500. REP_RET
  ; FILTER_V opt, width, regs
  ;   %1 = instruction-set suffix of the emitted names (mmxext, sse2)
  ;   %2 = block width in pixels (4 with INIT_MMX, 8 with INIT_XMM below)
  ;   %3 = vector-register count given to cglobal
  ; Emits put_vp8_epel%2_v4_%1 and put_vp8_epel%2_v6_%1. Register roles per
  ; the prototype comment above (assuming x86inc's rN = N-th argument):
  ; r0=dst, r1=deststride, r2=src, r3=srcstride, r4=height, r6=my.
  ; Rows are kept widened to words in registers and rotated each iteration so
  ; only one new source row is loaded per output row.
  ; The row counter uses r4d because height is an int.
  501. %macro FILTER_V 3
  502. ; %2-pixel-wide block, V-only 4-tap filter
  503. cglobal put_vp8_epel%2_v4_%1, 7, 7, %3
  504. shl r6d, 5 ; r6 = my*32
  505. %ifdef PIC
  506. lea r11, [fourtap_filter_v_m]
  507. %endif
  508. lea r6, [fourtap_filter_v+r6-32] ; r6 -> first of the four 16-byte tap rows
  509. mova m6, [pw_64]
  510. pxor m7, m7
  511. mova m5, [r6+48] ; fourth tap kept in a register
  512. ; read 3 lines
  513. sub r2, r3
  514. movh m0, [r2]
  515. movh m1, [r2+ r3]
  516. movh m2, [r2+2*r3]
  517. add r2, r3
  518. punpcklbw m0, m7
  519. punpcklbw m1, m7
  520. punpcklbw m2, m7
  521. .nextrow:
  522. ; first calculate negative taps (to prevent losing positive overflows)
  523. movh m4, [r2+2*r3] ; read new row
  524. punpcklbw m4, m7
  525. mova m3, m4
  526. pmullw m0, [r6+0]
  527. pmullw m4, m5
  528. paddsw m4, m0
  529. ; then calculate positive taps
  530. mova m0, m1
  531. pmullw m1, [r6+16]
  532. paddsw m4, m1
  533. mova m1, m2
  534. pmullw m2, [r6+32]
  535. paddsw m4, m2
  536. mova m2, m3
  537. ; round/clip/store
  538. paddsw m4, m6
  539. psraw m4, 7
  540. packuswb m4, m7
  541. movh [r0], m4
  542. ; go to next line
  543. add r0, r1
  544. add r2, r3
  545. dec r4d ; next row
  546. jg .nextrow
  547. REP_RET
  548. ; %2-pixel-wide block, V-only 6-tap filter
  549. cglobal put_vp8_epel%2_v6_%1, 7, 7, %3
  550. shl r6d, 4
  551. lea r6, [r6*3] ; r6 = my*48
  552. %ifdef PIC
  553. lea r11, [sixtap_filter_v_m]
  554. %endif
  555. lea r6, [sixtap_filter_v+r6-96] ; r6 -> first of the six 16-byte tap rows
  556. pxor m7, m7
  557. ; read 5 lines
  558. sub r2, r3
  559. sub r2, r3
  560. movh m0, [r2]
  561. movh m1, [r2+r3]
  562. movh m2, [r2+r3*2]
  563. lea r2, [r2+r3*2]
  564. add r2, r3
  565. movh m3, [r2]
  566. movh m4, [r2+r3]
  567. punpcklbw m0, m7
  568. punpcklbw m1, m7
  569. punpcklbw m2, m7
  570. punpcklbw m3, m7
  571. punpcklbw m4, m7
  572. .nextrow:
  573. ; first calculate negative taps (to prevent losing positive overflows)
  574. mova m5, m1
  575. pmullw m5, [r6+16]
  576. mova m6, m4
  577. pmullw m6, [r6+64]
  578. paddsw m6, m5
  579. ; then calculate positive taps
  580. movh m5, [r2+2*r3] ; read new row
  581. punpcklbw m5, m7
  582. pmullw m0, [r6+0]
  583. paddsw m6, m0
  584. mova m0, m1
  585. mova m1, m2
  586. pmullw m2, [r6+32]
  587. paddsw m6, m2
  588. mova m2, m3
  589. pmullw m3, [r6+48]
  590. paddsw m6, m3
  591. mova m3, m4
  592. mova m4, m5
  593. pmullw m5, [r6+80]
  594. paddsw m6, m5
  595. ; round/clip/store
  596. paddsw m6, [pw_64]
  597. psraw m6, 7
  598. packuswb m6, m7
  599. movh [r0], m6
  600. ; go to next line
  601. add r0, r1
  602. add r2, r3
  603. dec r4d ; next row
  604. jg .nextrow
  605. REP_RET
  606. %endmacro
  607. INIT_MMX
  608. FILTER_V mmxext, 4, 0
  609. INIT_XMM
  610. FILTER_V sse2, 8, 8
  ; FILTER_BILINEAR opt, width, regs
  ;   %1 = instruction-set suffix of the emitted names (mmxext, sse2)
  ;   %2 = block width in pixels (4 with INIT_MMX, 8 with INIT_XMM below)
  ;   %3 = vector-register count given to cglobal
  ; Emits put_vp8_bilinear%2_v_%1 and put_vp8_bilinear%2_h_%1. Register roles
  ; are taken to match the subpel prototype (r0=dst, r1=deststride, r2=src,
  ; r3=srcstride, r4=height, r5=mx, r6=my) -- TODO confirm against the C
  ; declaration, which is not in this file.
  ; Two output rows are produced per iteration. Each pixel is
  ;   (a*(8-f) + b*f + 4) >> 3
  ; implemented as ">> 2" followed by pavgw against zero, which adds the
  ; rounding bit and performs the final shift.
  ; The row counter uses r4d because height is an int, matching the
  ; put_vp8_pixels* functions.
  611. %macro FILTER_BILINEAR 3
  612. cglobal put_vp8_bilinear%2_v_%1, 7,7,%3
  613. mov r5d, 8*16
  614. shl r6d, 4
  615. sub r5d, r6d ; r5 = (8-my)*16, r6 = my*16: byte offsets of the two weights
  616. %ifdef PIC
  617. lea r11, [bilinear_filter_vw_m]
  618. %endif
  619. pxor m6, m6
  620. mova m4, [bilinear_filter_vw+r5-16] ; weight 8-my
  621. mova m5, [bilinear_filter_vw+r6-16] ; weight my
  622. .nextrow:
  623. movh m0, [r2+r3*0]
  624. movh m1, [r2+r3*1]
  625. movh m3, [r2+r3*2]
  626. punpcklbw m0, m6
  627. punpcklbw m1, m6
  628. punpcklbw m3, m6
  629. mova m2, m1 ; the middle row is the bottom of output row 0 and the top of output row 1
  630. pmullw m0, m4
  631. pmullw m1, m5
  632. pmullw m2, m4
  633. pmullw m3, m5
  634. paddsw m0, m1
  635. paddsw m2, m3
  636. psraw m0, 2
  637. psraw m2, 2
  638. pavgw m0, m6
  639. pavgw m2, m6
  640. %ifidn %1, mmxext
  641. packuswb m0, m0
  642. packuswb m2, m2
  643. movh [r0+r1*0], m0
  644. movh [r0+r1*1], m2
  645. %else
  646. packuswb m0, m2
  647. movh [r0+r1*0], m0
  648. movhps [r0+r1*1], m0
  649. %endif
  650. lea r0, [r0+r1*2]
  651. lea r2, [r2+r3*2]
  652. sub r4d, 2
  653. jg .nextrow
  654. REP_RET
  655. cglobal put_vp8_bilinear%2_h_%1, 7,7,%3
  656. mov r6d, 8*16
  657. shl r5d, 4
  658. sub r6d, r5d ; r6 = (8-mx)*16, r5 = mx*16: byte offsets of the two weights
  659. %ifdef PIC
  660. lea r11, [bilinear_filter_vw_m]
  661. %endif
  662. pxor m6, m6
  663. mova m4, [bilinear_filter_vw+r6-16] ; weight 8-mx
  664. mova m5, [bilinear_filter_vw+r5-16] ; weight mx
  665. .nextrow:
  666. movh m0, [r2+r3*0+0]
  667. movh m1, [r2+r3*0+1]
  668. movh m2, [r2+r3*1+0]
  669. movh m3, [r2+r3*1+1]
  670. punpcklbw m0, m6
  671. punpcklbw m1, m6
  672. punpcklbw m2, m6
  673. punpcklbw m3, m6
  674. pmullw m0, m4
  675. pmullw m1, m5
  676. pmullw m2, m4
  677. pmullw m3, m5
  678. paddsw m0, m1
  679. paddsw m2, m3
  680. psraw m0, 2
  681. psraw m2, 2
  682. pavgw m0, m6
  683. pavgw m2, m6
  684. %ifidn %1, mmxext
  685. packuswb m0, m0
  686. packuswb m2, m2
  687. movh [r0+r1*0], m0
  688. movh [r0+r1*1], m2
  689. %else
  690. packuswb m0, m2
  691. movh [r0+r1*0], m0
  692. movhps [r0+r1*1], m0
  693. %endif
  694. lea r0, [r0+r1*2]
  695. lea r2, [r2+r3*2]
  696. sub r4d, 2
  697. jg .nextrow
  698. REP_RET
  699. %endmacro
  700. INIT_MMX
  701. FILTER_BILINEAR mmxext, 4, 0
  702. INIT_XMM
  703. FILTER_BILINEAR sse2, 8, 7
  ; FILTER_BILINEAR_SSSE3 width
  ;   %1 = block width in pixels (4 with INIT_MMX, 8 with INIT_XMM below)
  ; Emits put_vp8_bilinear%1_v_ssse3 and put_vp8_bilinear%1_h_ssse3. Register
  ; roles are taken to match the subpel prototype (r0=dst, r1=deststride,
  ; r2=src, r3=srcstride, r4=height, r5=mx, r6=my) -- TODO confirm against the
  ; C declaration, which is not in this file.
  ; Neighbouring pixels are paired as bytes and multiplied against the
  ; (8-f, f) byte pairs of bilinear_filter_vb with a single pmaddubsw; the
  ; result is rounded with ">> 2" followed by pavgw against zero.
  ; Two output rows are produced per iteration. The row counter uses r4d
  ; because height is an int, matching the put_vp8_pixels* functions.
  704. %macro FILTER_BILINEAR_SSSE3 1
  705. cglobal put_vp8_bilinear%1_v_ssse3, 7,7
  706. shl r6d, 4 ; r6 = my*16
  707. %ifdef PIC
  708. lea r11, [bilinear_filter_vb_m]
  709. %endif
  710. pxor m4, m4
  711. mova m3, [bilinear_filter_vb+r6-16] ; byte pairs (8-my, my)
  712. .nextrow:
  713. movh m0, [r2+r3*0]
  714. movh m1, [r2+r3*1]
  715. movh m2, [r2+r3*2]
  716. punpcklbw m0, m1 ; rows 0/1 interleaved -> output row 0
  717. punpcklbw m1, m2 ; rows 1/2 interleaved -> output row 1
  718. pmaddubsw m0, m3
  719. pmaddubsw m1, m3
  720. psraw m0, 2
  721. psraw m1, 2
  722. pavgw m0, m4
  723. pavgw m1, m4
  724. %if mmsize==8
  725. packuswb m0, m0
  726. packuswb m1, m1
  727. movh [r0+r1*0], m0
  728. movh [r0+r1*1], m1
  729. %else
  730. packuswb m0, m1
  731. movh [r0+r1*0], m0
  732. movhps [r0+r1*1], m0
  733. %endif
  734. lea r0, [r0+r1*2]
  735. lea r2, [r2+r3*2]
  736. sub r4d, 2
  737. jg .nextrow
  738. REP_RET
  739. cglobal put_vp8_bilinear%1_h_ssse3, 7,7
  740. shl r5d, 4 ; r5 = mx*16
  741. %ifdef PIC
  742. lea r11, [bilinear_filter_vb_m]
  743. %endif
  744. pxor m4, m4
  745. mova m2, [filter_h2_shuf]
  746. mova m3, [bilinear_filter_vb+r5-16] ; byte pairs (8-mx, mx)
  747. .nextrow:
  748. movu m0, [r2+r3*0]
  749. movu m1, [r2+r3*1]
  750. pshufb m0, m2 ; pair each pixel with its right-hand neighbour
  751. pshufb m1, m2
  752. pmaddubsw m0, m3
  753. pmaddubsw m1, m3
  754. psraw m0, 2
  755. psraw m1, 2
  756. pavgw m0, m4
  757. pavgw m1, m4
  758. %if mmsize==8
  759. packuswb m0, m0
  760. packuswb m1, m1
  761. movh [r0+r1*0], m0
  762. movh [r0+r1*1], m1
  763. %else
  764. packuswb m0, m1
  765. movh [r0+r1*0], m0
  766. movhps [r0+r1*1], m0
  767. %endif
  768. lea r0, [r0+r1*2]
  769. lea r2, [r2+r3*2]
  770. sub r4d, 2
  771. jg .nextrow
  772. REP_RET
  773. %endmacro
  774. INIT_MMX
  775. FILTER_BILINEAR_SSSE3 4
  776. INIT_XMM
  777. FILTER_BILINEAR_SSSE3 8
  ; Plain copy of an 8-byte-wide block, two rows per iteration.
  ; r0 = destination, r1 = destination stride, r2 = source, r3 = source
  ; stride, r4d = row count. Rows are consumed in pairs, so an odd count
  ; copies one extra row.
  778. cglobal put_vp8_pixels8_mmx, 5,5
  779. .nextrow:
  780. movq mm0, [r2+r3*0]
  781. movq mm1, [r2+r3*1]
  782. lea r2, [r2+r3*2] ; advance source by two rows
  783. movq [r0+r1*0], mm0
  784. movq [r0+r1*1], mm1
  785. lea r0, [r0+r1*2] ; advance destination by two rows
  786. sub r4d, 2
  787. jg .nextrow
  788. REP_RET
  ; Plain copy of a 16-byte-wide block using 8-byte MMX moves, two rows per
  ; iteration (four loads, then four stores).
  ; r0 = destination, r1 = destination stride, r2 = source, r3 = source
  ; stride, r4d = row count.
  789. cglobal put_vp8_pixels16_mmx, 5,5
  790. .nextrow:
  791. movq mm0, [r2+r3*0+0]
  792. movq mm1, [r2+r3*0+8]
  793. movq mm2, [r2+r3*1+0]
  794. movq mm3, [r2+r3*1+8]
  795. lea r2, [r2+r3*2] ; advance source by two rows
  796. movq [r0+r1*0+0], mm0
  797. movq [r0+r1*0+8], mm1
  798. movq [r0+r1*1+0], mm2
  799. movq [r0+r1*1+8], mm3
  800. lea r0, [r0+r1*2] ; advance destination by two rows
  801. sub r4d, 2
  802. jg .nextrow
  803. REP_RET
  ; Plain copy of a 16-byte-wide block using SSE moves, two rows per iteration.
  ; r0 = destination, r1 = destination stride, r2 = source, r3 = source
  ; stride, r4d = row count.
  ; Loads are unaligned (movups) but stores use movaps, so every destination
  ; row must be 16-byte aligned.
  804. cglobal put_vp8_pixels16_sse, 5,5,2
  805. .nextrow:
  806. movups xmm0, [r2+r3*0]
  807. movups xmm1, [r2+r3*1]
  808. lea r2, [r2+r3*2] ; advance source by two rows
  809. movaps [r0+r1*0], xmm0
  810. movaps [r0+r1*1], xmm1
  811. lea r0, [r0+r1*2] ; advance destination by two rows
  812. sub r4d, 2
  813. jg .nextrow
  814. REP_RET
  815. ;-----------------------------------------------------------------------------
  816. ; IDCT functions:
  817. ;
  818. ; void vp8_idct_dc_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
  819. ;-----------------------------------------------------------------------------
  ; DC-only inverse transform: adds (block[0] + 4) >> 3 to a 4x4 pixel area.
  ; r0 = dst, r1 = block, r2 = stride (per the prototype above).
  ; MMX has no signed-byte-add-with-unsigned-clip, so the DC is split into a
  ; non-negative part (mm0) and the negated part (mm1); after saturating each
  ; to unsigned bytes at most one of them is non-zero, and the pixels get
  ; "+ mm0" then "- mm1" with unsigned saturation.
  820. cglobal vp8_idct_dc_add_mmx, 3, 3
  821. ; load data
  822. movd mm0, [r1]
  823. ; calculate DC
  824. paddw mm0, [pw_4]
  825. pxor mm1, mm1
  826. psraw mm0, 3
  827. psubw mm1, mm0 ; mm1 = -dc
  828. packuswb mm0, mm0 ; clamp dc to 0..255
  829. packuswb mm1, mm1 ; clamp -dc to 0..255
  830. punpcklbw mm0, mm0 ; replicate the low byte across the low dword
  831. punpcklbw mm1, mm1
  832. punpcklwd mm0, mm0
  833. punpcklwd mm1, mm1
  834. ; add DC
  835. lea r1, [r0+r2*2] ; r1 is reused as a pointer to rows 2/3
  836. movd mm2, [r0]
  837. movd mm3, [r0+r2]
  838. movd mm4, [r1]
  839. movd mm5, [r1+r2]
  840. paddusb mm2, mm0
  841. paddusb mm3, mm0
  842. paddusb mm4, mm0
  843. paddusb mm5, mm0
  844. psubusb mm2, mm1
  845. psubusb mm3, mm1
  846. psubusb mm4, mm1
  847. psubusb mm5, mm1
  848. movd [r0], mm2
  849. movd [r0+r2], mm3
  850. movd [r1], mm4
  851. movd [r1+r2], mm5
  852. RET
  ; DC-only inverse transform, SSE4 version: adds (block[0] + 4) >> 3 to a
  ; 4x4 pixel area. r0 = dst, r1 = block, r2 = stride (per the prototype above).
  ; The four 4-pixel rows are gathered into two registers, widened to words,
  ; added to the broadcast DC, and packed back with unsigned saturation; the
  ; rows are then written with movd/pextrd.
  853. cglobal vp8_idct_dc_add_sse4, 3, 3, 6
  854. ; load data
  855. movd xmm0, [r1]
  856. lea r1, [r0+r2*2] ; r1 is reused as a pointer to rows 2/3
  857. pxor xmm1, xmm1
  858. movq xmm2, [pw_4]
  859. ; calculate DC
  860. paddw xmm0, xmm2
  861. movd xmm2, [r0]
  862. movd xmm3, [r0+r2]
  863. movd xmm4, [r1]
  864. movd xmm5, [r1+r2]
  865. psraw xmm0, 3
  866. pshuflw xmm0, xmm0, 0 ; broadcast word 0 across the low qword
  867. punpcklqdq xmm0, xmm0 ; ...and across the whole register
  868. punpckldq xmm2, xmm3 ; rows 0/1 in one register
  869. punpckldq xmm4, xmm5 ; rows 2/3 in one register
  870. punpcklbw xmm2, xmm1
  871. punpcklbw xmm4, xmm1
  872. paddw xmm2, xmm0
  873. paddw xmm4, xmm0
  874. packuswb xmm2, xmm4 ; dwords 0..3 = rows 0..3
  875. movd [r0], xmm2
  876. pextrd [r0+r2], xmm2, 1
  877. pextrd [r1], xmm2, 2
  878. pextrd [r1+r2], xmm2, 3
  879. RET
  880. ;-----------------------------------------------------------------------------
  881. ; void vp8_idct_add_<opt>(uint8_t *dst, DCTELEM block[16], int stride);
  882. ;-----------------------------------------------------------------------------
  883. ; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
  884. ; this macro assumes that m6/m7 have words for 20091/17734 loaded
  ; %1/%2 are the input/output registers, %3/%4 are clobbered as temporaries.
  ; mul_20091(x) is formed as x + pmulhw(x, 20091); mul_35468(x) is formed as
  ; pmulhw(2*x, 17734), since 35468 = 2*17734 does not fit a signed word.
  885. %macro VP8_MULTIPLY_SUMSUB 4
  886. mova %3, %1
  887. mova %4, %2
  888. pmulhw %3, m6 ;20091(1)
  889. pmulhw %4, m6 ;20091(2)
  890. paddw %3, %1
  891. paddw %4, %2
  892. paddw %1, %1
  893. paddw %2, %2
  894. pmulhw %1, m7 ;35468(1)
  895. pmulhw %2, m7 ;35468(2)
  896. psubw %1, %4
  897. paddw %2, %3
  898. %endmacro
  899. ; calculate x0=%1+%3; x1=%1-%3
  900. ; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
  901. ; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
  902. ; %5/%6 are temporary registers
  903. ; we assume m6/m7 have constant words 20091/17734 loaded in them
  ; Arguments are register indexes (the macro prefixes them with "m").
  ; The two SWAPs at the end only rename registers so that the results end up
  ; under the indexes listed above; SUMSUB_BA and SWAP are not defined in this
  ; file (presumably they come from the included x86util.asm/x86inc.asm).
  904. %macro VP8_IDCT_TRANSFORM4x4_1D 6
  905. SUMSUB_BA m%3, m%1, m%5 ;t0, t1
  906. VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
  907. SUMSUB_BA m%4, m%3, m%5 ;tmp0, tmp3
  908. SUMSUB_BA m%2, m%1, m%5 ;tmp1, tmp2
  909. SWAP %4, %1
  910. SWAP %4, %3
  911. %endmacro
  ; Full 4x4 inverse transform added to the destination pixels.
  ; r0 = dst, r1 = block (16 words, read as four 8-byte rows), r2 = stride
  ; (per the prototype above).
  ; Two 1-D passes with a transpose after each; the rounding constant 4 is
  ; added to m0 between the passes, ahead of the ">> 3" passed to
  ; STORE_DIFFx2 (a macro not defined in this file).
  912. INIT_MMX
  913. cglobal vp8_idct_add_mmx, 3, 3
  914. ; load block data
  915. movq m0, [r1]
  916. movq m1, [r1+8]
  917. movq m2, [r1+16]
  918. movq m3, [r1+24]
  919. movq m6, [pw_20091]
  920. movq m7, [pw_17734]
  921. ; actual IDCT
  922. VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
  923. TRANSPOSE4x4W 0, 1, 2, 3, 4
  924. paddw m0, [pw_4]
  925. VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
  926. TRANSPOSE4x4W 0, 1, 2, 3, 4
  927. ; store
  928. pxor m4, m4
  929. lea r1, [r0+2*r2] ; r1 is reused as a pointer to rows 2/3
  930. STORE_DIFFx2 m0, m1, m6, m7, m4, 3, r0, r2
  931. STORE_DIFFx2 m2, m3, m6, m7, m4, 3, r1, r2
  932. RET
  933. ;-----------------------------------------------------------------------------
  934. ; void vp8_luma_dc_wht_mmxext(DCTELEM block[4][4][16], DCTELEM dc[16])
  935. ;-----------------------------------------------------------------------------
  ; SCATTER_WHT n: store word n of m0, m1, m2 and m3 to [r0], [r0+32],
  ; [r0+64] and [r0+96] respectively (2*16 bytes = 16 words between stores).
  ; Clobbers r1 and r2, which are used as scratch for the extracted words.
  936. %macro SCATTER_WHT 1
  937. pextrw r1d, m0, %1
  938. pextrw r2d, m1, %1
  939. mov [r0+2*16*0], r1w
  940. mov [r0+2*16*1], r2w
  941. pextrw r1d, m2, %1
  942. pextrw r2d, m3, %1
  943. mov [r0+2*16*2], r1w
  944. mov [r0+2*16*3], r2w
  945. %endmacro
  ; HADAMARD4_1D a, b, c, d: one 1-D pass over registers m%1..m%4, built from
  ; two rounds of paired sum/difference followed by a register rename.
  ; SUMSUB_BADC and SWAP are not defined in this file (presumably from the
  ; included x86util.asm/x86inc.asm), so the exact output order follows their
  ; definitions there.
  946. %macro HADAMARD4_1D 4
  947. SUMSUB_BADC m%2, m%1, m%4, m%3
  948. SUMSUB_BADC m%4, m%2, m%3, m%1
  949. SWAP %1, %4, %3
  950. %endmacro
  ; Inverse transform of the 16 luma DC values and scatter of the results.
  ; r0 = block, r1 = dc (per the prototype above); r1 and r2 are scratch once
  ; the dc rows have been loaded (SCATTER_WHT overwrites them).
  ; Sequence: 1-D pass, transpose, add rounding constant 3 to m0, second 1-D
  ; pass, ">> 3", then each result word is written 32 bytes apart starting at
  ; r0, with r0 advanced by 2*16*4 bytes between the four SCATTER_WHT calls.
  951. INIT_MMX
  952. cglobal vp8_luma_dc_wht_mmxext, 2,3
  953. movq m0, [r1]
  954. movq m1, [r1+8]
  955. movq m2, [r1+16]
  956. movq m3, [r1+24]
  957. HADAMARD4_1D 0, 1, 2, 3
  958. TRANSPOSE4x4W 0, 1, 2, 3, 4
  959. paddw m0, [pw_3]
  960. HADAMARD4_1D 0, 1, 2, 3
  961. psraw m0, 3
  962. psraw m1, 3
  963. psraw m2, 3
  964. psraw m3, 3
  965. SCATTER_WHT 0
  966. add r0, 2*16*4
  967. SCATTER_WHT 1
  968. add r0, 2*16*4
  969. SCATTER_WHT 2
  970. add r0, 2*16*4
  971. SCATTER_WHT 3
  972. RET
  973. ;-----------------------------------------------------------------------------
  974. ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
  975. ;-----------------------------------------------------------------------------
  976. ; macro called with 7 mm register indexes as argument, and 4 regular registers
  977. ;
  978. ; first 4 mm registers will carry the transposed pixel data
  979. ; the other three are scratchspace (one would be sufficient, but this allows
  980. ; for more spreading/pipelining and thus faster execution on OOE CPUs)
  981. ;
  982. ; first two regular registers are buf+4*stride and buf+5*stride
  983. ; third is -stride, fourth is +stride
  ;
  ; On exit m%1..m%4 hold the byte-interleaved row pairs A/B, C/D, E/F and G/H;
  ; the macro itself only interleaves, the caller finishes the transpose.
  984. %macro READ_8x4_INTERLEAVED 11
  985. ; interleave 8 (A-H) rows of 4 pixels each
  986. movd m%1, [%8+%10*4] ; A0-3
  987. movd m%5, [%9+%10*4] ; B0-3
  988. movd m%2, [%8+%10*2] ; C0-3
  989. movd m%6, [%8+%10] ; D0-3
  990. movd m%3, [%8] ; E0-3
  991. movd m%7, [%9] ; F0-3
  992. movd m%4, [%9+%11] ; G0-3
  993. punpcklbw m%1, m%5 ; A/B interleaved
  994. movd m%5, [%9+%11*2] ; H0-3
  995. punpcklbw m%2, m%6 ; C/D interleaved
  996. punpcklbw m%3, m%7 ; E/F interleaved
  997. punpcklbw m%4, m%5 ; G/H interleaved
  998. %endmacro
  999. ; macro called with 7 mm register indexes as argument, and 5 regular registers
  1000. ; first 11 mean the same as READ_8x4_INTERLEAVED above
  1001. ; fifth regular register is scratchspace to reach the bottom 8 rows, it
  1002. ; will be set to second regular register + 8*stride at the end
  ;
  ; On exit m%1..m%4 hold the byte-interleaved row groups A/B/I/J, C/D/K/L,
  ; E/F/M/N and G/H/O/P; m%5..m%7 and %12 are clobbered.
  ; The scratch pointer is derived from the macro's own arguments (%8 and %11)
  ; rather than from fixed register names, so the macro works for any
  ; registers the caller passes in.
  1003. %macro READ_16x4_INTERLEAVED 12
  1004. ; transpose 16 (A-P) rows of 4 pixels each
  1005. lea %12, [%8+8*%11]
  1006. ; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
  1007. movd m%1, [%8+%10*4] ; A0-3
  1008. movd m%3, [%12+%10*4] ; I0-3
  1009. movd m%2, [%8+%10*2] ; C0-3
  1010. movd m%4, [%12+%10*2] ; K0-3
  1011. movd m%6, [%8+%10] ; D0-3
  1012. movd m%5, [%12+%10] ; L0-3
  1013. movd m%7, [%12] ; M0-3
  1014. add %12, %11
  1015. punpcklbw m%1, m%3 ; A/I
  1016. movd m%3, [%8] ; E0-3
  1017. punpcklbw m%2, m%4 ; C/K
  1018. punpcklbw m%6, m%5 ; D/L
  1019. punpcklbw m%3, m%7 ; E/M
  1020. punpcklbw m%2, m%6 ; C/D/K/L interleaved
  1021. ; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
  1022. movd m%5, [%9+%10*4] ; B0-3
  1023. movd m%4, [%12+%10*4] ; J0-3
  1024. movd m%7, [%9] ; F0-3
  1025. movd m%6, [%12] ; N0-3
  1026. punpcklbw m%5, m%4 ; B/J
  1027. punpcklbw m%7, m%6 ; F/N
  1028. punpcklbw m%1, m%5 ; A/B/I/J interleaved
  1029. punpcklbw m%3, m%7 ; E/F/M/N interleaved
  1030. movd m%4, [%9+%11] ; G0-3
  1031. movd m%6, [%12+%11] ; O0-3
  1032. movd m%5, [%9+%11*2] ; H0-3
  1033. movd m%7, [%12+%11*2] ; P0-3
  1034. punpcklbw m%4, m%6 ; G/O
  1035. punpcklbw m%5, m%7 ; H/P
  1036. punpcklbw m%4, m%5 ; G/H/O/P interleaved
  1037. %endmacro
  1038. ; write 4 mm registers of 2 dwords each
  1039. ; first four arguments are mm register indexes containing source data
  1040. ; last four are registers containing buf+4*stride, buf+5*stride,
  1041. ; -stride and +stride
  ; The low dword of each register is stored first; punpckhdq then moves the
  ; high dword down so that the same movd form can store it. The source
  ; registers are therefore modified.
  1042. %macro WRITE_4x2D 8
  1043. ; write out (2 dwords per register)
  1044. movd [%5+%7*4], m%1
  1045. movd [%5+%7*2], m%2
  1046. movd [%5], m%3
  1047. movd [%6+%8], m%4
  1048. punpckhdq m%1, m%1
  1049. punpckhdq m%2, m%2
  1050. punpckhdq m%3, m%3
  1051. punpckhdq m%4, m%4
  1052. movd [%6+%7*4], m%1
  1053. movd [%5+%7], m%2
  1054. movd [%6], m%3
  1055. movd [%6+%8*2], m%4
  1056. %endmacro
  1057. ; write 4 xmm registers of 4 dwords each
  1058. ; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
  1059. ; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
  1060. ; we add 1*stride to the third regular register in the process
  ; Each source register is shifted right by 4 bytes between stores, so the
  ; source registers are consumed. The last two groups use identical
  ; [%7+...] operands; they reach different rows because %7 is advanced by
  ; one stride in between.
  1061. %macro WRITE_4x4D 9
  1062. ; write out (4 dwords per register), start with dwords zero
  1063. movd [%5+%8*4], m%1
  1064. movd [%5], m%2
  1065. movd [%5+%9*4], m%3
  1066. movd [%5+%9*8], m%4
  1067. ; store dwords 1
  1068. psrldq m%1, 4
  1069. psrldq m%2, 4
  1070. psrldq m%3, 4
  1071. psrldq m%4, 4
  1072. movd [%6+%8*4], m%1
  1073. movd [%6], m%2
  1074. movd [%6+%9*4], m%3
  1075. movd [%6+%9*8], m%4
  1076. ; write dwords 2
  1077. psrldq m%1, 4
  1078. psrldq m%2, 4
  1079. psrldq m%3, 4
  1080. psrldq m%4, 4
  1081. movd [%5+%8*2], m%1
  1082. movd [%6+%9], m%2
  1083. movd [%7+%8*2], m%3
  1084. movd [%7+%9*2], m%4
  1085. add %7, %9
  1086. ; store dwords 3
  1087. psrldq m%1, 4
  1088. psrldq m%2, 4
  1089. psrldq m%3, 4
  1090. psrldq m%4, 4
  1091. movd [%5+%8], m%1
  1092. movd [%6+%9*2], m%2
  1093. movd [%7+%8*2], m%3
  1094. movd [%7+%9*2], m%4
  1095. %endmacro
  ;-----------------------------------------------------------------------------
  ; SIMPLE_LOOPFILTER cpu, dir, nregs
  ;   %1  name suffix of the generated function (mmx, mmxext or sse2); it also
  ;       picks the instruction sequence used to splat "flim"
  ;   %2  edge direction, v or h
  ;   %3  forwarded unchanged as the second numeric argument of cglobal
  ;
  ; Emits vp8_<dir>_loop_filter_simple_<cpu>.  Register use after cglobal:
  ;   r0  pixel pointer (q0 row for v; for h it is moved to +4*stride-2)
  ;   r1  stride on entry, negated to -stride during setup
  ;   r2  flim on entry (low byte is splatted into m7), then +stride
  ;   r3  mmx/mmxext: iteration counter (2 x 8 pixels)
  ;       sse2, h:    extra row pointer filled in by READ_16x4_INTERLEAVED
  ;   r4  h only: r0+stride
  ;   r5  h only: saved stack pointer
  ; For h, the stack is aligned to mmsize and 2*mmsize bytes are reserved to
  ; keep p1/q1 across the filter; rsp is restored from r5 before returning.
  ; NOTE(review): r1 is used at full register width; confirm that the caller
  ; passes a sign-extended stride on 64-bit targets.
  ;-----------------------------------------------------------------------------
  %macro SIMPLE_LOOPFILTER 3
  cglobal vp8_%2_loop_filter_simple_%1, 3, %3
  %ifidn %2, h
      mov            r5, rsp          ; backup stack pointer
      and           rsp, ~(mmsize-1)  ; align stack
  %endif
  %if mmsize == 8 ; mmx/mmxext
      mov            r3, 2            ; two passes of 8 pixels each
  %endif

      ; splat register with "flim"
      movd           m7, r2
      punpcklbw      m7, m7
  %if mmsize == 16 ; sse2
      punpcklwd      m7, m7
      pshufd         m7, m7, 0x0
  %elifidn %1, mmx
      punpcklwd      m7, m7
      punpckldq      m7, m7
  %else ; mmxext
      pshufw         m7, m7, 0x0
  %endif

      ; set up indexes to address 4 rows
      mov            r2, r1           ; r2 = +stride
      neg            r1               ; r1 = -stride
  %ifidn %2, h
      lea            r0, [r0+4*r2-2]
      sub           rsp, mmsize*2     ; (aligned) storage space for saving p1/q1
  %endif

  %if mmsize == 8 ; mmx / mmxext
  .next8px:
  %endif
  %ifidn %2, v
      ; read 4 half/full rows of pixels
      mova           m0, [r0+r1*2]    ; p1
      mova           m1, [r0+r1]      ; p0
      mova           m2, [r0]         ; q0
      mova           m3, [r0+r2]      ; q1
  %else ; h
      lea            r4, [r0+r2]

  %if mmsize == 8 ; mmx/mmxext
      READ_8x4_INTERLEAVED  0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2
  %else ; sse2
      READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, r0, r4, r1, r2, r3
  %endif
      TRANSPOSE4x4W         0, 1, 2, 3, 4

      mova        [rsp], m0           ; store p1
      mova [rsp+mmsize], m3           ; store q1
  %endif

      ; simple_limit
      mova           m5, m2           ; m5=backup of q0
      mova           m6, m1           ; m6=backup of p0
      psubusb        m1, m2           ; p0-q0
      psubusb        m2, m6           ; q0-p0
      por            m1, m2           ; FFABS(p0-q0)
      paddusb        m1, m1           ; m1=FFABS(p0-q0)*2

      mova           m4, m3
      mova           m2, m0
      psubusb        m3, m0           ; q1-p1
      psubusb        m0, m4           ; p1-q1
      por            m3, m0           ; FFABS(p1-q1)
      mova           m0, [pb_80]
      pxor           m2, m0
      pxor           m4, m0
      psubsb         m2, m4           ; m2=p1-q1 (signed) backup for below
      pand           m3, [pb_FE]      ; mask first: the shift below is 64 bits wide
      psrlq          m3, 1            ; m3=FFABS(p1-q1)/2, this can be used signed
      paddusb        m3, m1
      psubusb        m3, m7
      pxor           m1, m1
      pcmpeqb        m3, m1           ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)

      ; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
      mova           m4, m5
      pxor           m5, m0
      pxor           m0, m6
      psubsb         m5, m0           ; q0-p0 (signed)
      paddsb         m2, m5
      paddsb         m2, m5
      paddsb         m2, m5           ; a=(p1-q1) + 3*(q0-p0)
      pand           m2, m3           ; apply filter mask (m3)

      mova           m3, [pb_F8]
      mova           m1, m2
      paddsb         m2, [pb_4]       ; f1<<3=a+4
      paddsb         m1, [pb_3]       ; f2<<3=a+3
      pand           m2, m3
      pand           m1, m3           ; cache f2<<3

      ; q0 -= f1, done on magnitudes with unsigned saturation:
      ; m0 selects the bytes where f1 is negative
      pxor           m0, m0
      pxor           m3, m3
      pcmpgtb        m0, m2           ; which values are <0?
      psubb          m3, m2           ; -f1<<3
      psrlq          m2, 3            ; +f1
      psrlq          m3, 3            ; -f1
      pand           m3, m0
      pandn          m0, m2
      psubusb        m4, m0
      paddusb        m4, m3           ; q0-f1

      ; p0 += f2, same scheme
      pxor           m0, m0
      pxor           m3, m3
      pcmpgtb        m0, m1           ; which values are <0?
      psubb          m3, m1           ; -f2<<3
      psrlq          m1, 3            ; +f2
      psrlq          m3, 3            ; -f2
      pand           m3, m0
      pandn          m0, m1
      paddusb        m6, m0
      psubusb        m6, m3           ; p0+f2

      ; store
  %ifidn %2, v
      mova         [r0], m4           ; filtered q0
      mova      [r0+r1], m6           ; filtered p0
  %else ; h
      ; rebuild the p1/p0/q0/q1 order in m0-m3 before transposing back
      mova           m0, [rsp]        ; p1
      SWAP            2, 4            ; m2 = filtered q0
      SWAP            1, 6            ; m1 = filtered p0
      mova           m3, [rsp+mmsize] ; q1

      TRANSPOSE4x4B  0, 1, 2, 3, 4
  %if mmsize == 16 ; sse2
      add            r3, r1           ; change from r4+8*stride to r0+8*stride
      WRITE_4x4D 0, 1, 2, 3, r0, r4, r3, r1, r2
  %else ; mmx/mmxext
      WRITE_4x2D 0, 1, 2, 3, r0, r4, r1, r2
  %endif
  %endif

  %if mmsize == 8 ; mmx/mmxext
      ; next 8 pixels
  %ifidn %2, v
      add            r0, 8            ; advance 8 cols = pixels
  %else ; h
      lea            r0, [r0+r2*8]    ; advance 8 rows = lines
  %endif
      dec            r3
      jg .next8px
  %ifidn %2, v
      REP_RET
  %else ; h
      mov           rsp, r5           ; restore stack pointer
      RET
  %endif
  %else ; sse2
  %ifidn %2, h
      mov           rsp, r5           ; restore stack pointer
  %endif
      RET
  %endif
  %endmacro
  ; Instantiate the simple loop filter once per instruction set and direction.
  ; Arguments: function-name suffix, direction (v/h), and the value that
  ; SIMPLE_LOOPFILTER forwards as the second numeric argument of cglobal
  ; (the v variants touch r0-r3 on mmx/mmxext and r0-r2 on sse2; every h
  ; variant touches registers up to r5).
  INIT_MMX
  SIMPLE_LOOPFILTER mmx,    v, 4
  SIMPLE_LOOPFILTER mmx,    h, 6
  SIMPLE_LOOPFILTER mmxext, v, 4
  SIMPLE_LOOPFILTER mmxext, h, 6
  INIT_XMM
  SIMPLE_LOOPFILTER sse2,   v, 3
  SIMPLE_LOOPFILTER sse2,   h, 6