;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
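; Rough C equivalent (a sketch of the semantics, not FFmpeg's C reference;
; order is assumed to be a multiple of the unrolled step, and the sum wraps
; like the 32-bit pmaddwd/paddd accumulation below):
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++)
;         sum += v1[i] * v2[i];
;     return sum;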
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl orderq, 1
    add v1q, orderq
    add v2q, orderq
    neg orderq
    pxor m2, m2
.loop:
    movu m0, [v1q + orderq]
    movu m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd m2, m0
    paddd m2, m1
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw m0, m2, 0x4e
%endif
    paddd m2, m0
    movd eax, m2
    RET

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
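; Rough C equivalent (sketch): returns the dot product of the *original* v1
; with v2 while updating v1 in place (16-bit wrapping arithmetic):
;     int32_t sum = 0;
;     for (int i = 0; i < order; i++) {
;         sum   += v1[i] * v2[i];
;         v1[i] += mul * v3[i];
;     }
;     return sum;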
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw m7, m7, 0
%endif
    pxor m6, m6
    add v1q, orderq
    add v2q, orderq
    add v3q, orderq
    neg orderq
.loop:
    movu m0, [v2q + orderq]
    movu m1, [v2q + orderq + mmsize]
    mova m4, [v1q + orderq]
    mova m5, [v1q + orderq + mmsize]
    movu m2, [v3q + orderq]
    movu m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw m2, m7
    pmullw m3, m7
    paddd m6, m0
    paddd m6, m1
    paddw m2, m4
    paddw m3, m5
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    add orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw m0, m6, 0x4e
%endif
    paddd m6, m0
    movd eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub orderq, mmsize*2
%if %1
    mova m1, m4
    mova m4, [v2q + orderq]
    mova m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova m3, m5
    mova m5, [v3q + orderq]
    mova m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova m0, [v2q + orderq]
    mova m1, [v2q + orderq + mmsize]
    mova m2, [v3q + orderq]
    mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova m8, t0
    mova m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw m2, m7
    pmullw m3, m7
    paddw m2, t0
    paddw m3, t1
    paddd m6, m0
    paddd m6, m1
    mova [v1q + orderq], m2
    mova [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl orderq, 1
    movd m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor m6, m6
    mov r4d, v2d
    and r4d, 15
    and v2q, ~15
    and v3q, ~15
    mova m4, [v2q + orderq]
    mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
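; Note: this path rounds v2/v3 down to 16-byte alignment and realigns with
; palignr; the single shift amount (r4d, derived from v2 alone) assumes v2
; and v3 share the same misalignment, and mova on v1 assumes v1 is aligned.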
    cmp r4d, 0
    je .loop0
    cmp r4d, 2
    je .loop2
    cmp r4d, 4
    je .loop4
    cmp r4d, 6
    je .loop6
    cmp r4d, 8
    je .loop8
    cmp r4d, 10
    je .loop10
    cmp r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd m6, m0
    pshuflw m0, m6, 0x4e
    paddd m6, m0
    movd eax, m6
    RET

;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
;                            const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
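; Rough C equivalent of the bitexact (rounded) variant (sketch; the window
; is assumed symmetric, so only its first half is read, applied mirrored to
; the second half of the buffer):
;     for (unsigned i = 0; i < len / 2; i++) {
;         output[i]       = (input[i]       * window[i] + (1 << 14)) >> 15;
;         output[len-1-i] = (input[len-1-i] * window[i] + (1 << 14)) >> 15;
;     }
; The non-bitexact variants may drop the +(1 << 14) rounding term.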
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
    pshufb %1, %2
%elif cpuflag(sse2)
    pshuflw %1, %1, 0x1B
    pshufhw %1, %1, 0x1B
    pshufd %1, %1, 0x4E
%elif cpuflag(mmxext)
    pshufw %1, %1, 0x1B
%endif
%endmacro

%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
    pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
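; i.e. (a*b) >> 15 == (((a*b) >> 16) << 1) | (((a*b) >> 15) & 1)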
    mova %3, %1
    pmulhw %1, %2
    pmullw %3, %2
    psrlw %3, 15
    psllw %1, 1
    por %1, %3
%endif
%endmacro

%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
    lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
    mova m5, [pb_revwords]
    ALIGN 16
%elif %1
    mova m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The ssse3 version is bit-identical.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    pmulhrsw m1, m0
    REVERSE_WORDS m0, m5
    pmulhrsw m0, [ inputq+offsetq ]
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m0
%elif %1
    ; This version expands 16-bit to 32-bit, multiplies by the window,
    ; adds 16384 for rounding, right shifts 15, then repacks back to words to
    ; save to the output. The window is reversed for the second half.
    mova m3, [windowq+offset2q]
    mova m4, [ inputq+offset2q]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offset2q], m0
    REVERSE_WORDS m3
    mova m4, [ inputq+offsetq]
    pxor m0, m0
    punpcklwd m0, m3
    punpcklwd m1, m4
    pmaddwd m0, m1
    paddd m0, m5
    psrad m0, 15
    pxor m2, m2
    punpckhwd m2, m3
    punpckhwd m1, m4
    pmaddwd m2, m1
    paddd m2, m5
    psrad m2, 15
    packssdw m0, m2
    mova [outputq+offsetq], m0
%else
    ; This version does the 16x16->16 multiplication in-place without expanding
    ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
    ; therefore are not bit-identical to the C version.
    mova m0, [windowq+offset2q]
    mova m1, [ inputq+offset2q]
    mova m2, [ inputq+offsetq ]
    MUL16FIXED m1, m0, m3
    REVERSE_WORDS m0
    MUL16FIXED m2, m0, m3
    mova [outputq+offset2q], m1
    mova [outputq+offsetq ], m2
%endif
    add offsetd, mmsize
    sub offset2d, mmsize
    jae .loop
    REP_RET
%endmacro

INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1

; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
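; Rough C equivalent (sketch; all pixel math wraps modulo 256, and the
; running left/left_top values carry across calls via *left/*left_top):
;     l = *left; tl = *left_top;
;     for (i = 0; i < w; i++) {
;         pred = mid_pred(l, top[i], l + top[i] - tl);  // median of the three
;         l    = dst[i] = pred + diff[i];
;         tl   = top[i];
;     }
;     *left = dst[w - 1]; *left_top = top[w - 1];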
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq mm0, [topq]
    movq mm2, mm0
    movd mm4, [left_topq]
    psllq mm2, 8
    movq mm1, mm0
    por mm4, mm2
    movd mm3, [leftq]
    psubb mm0, mm4 ; t-tl
    add dstq, wq
    add topq, wq
    add diffq, wq
    neg wq
    jmp .skip
.loop:
    movq mm4, [topq+wq]
    movq mm0, mm4
    psllq mm4, 8
    por mm4, mm1
    movq mm1, mm0 ; t
    psubb mm0, mm4 ; t-tl
.skip:
    movq mm2, [diffq+wq]
%assign i 0
%rep 8
    movq mm4, mm0
    paddb mm4, mm3 ; t-tl+l
    movq mm5, mm3
    pmaxub mm3, mm1
    pminub mm5, mm1
    pminub mm3, mm4
    pmaxub mm3, mm5 ; median
    paddb mm3, mm2 ; +residual
%if i==0
    movq mm7, mm3
    psllq mm7, 56
%else
    movq mm6, mm3
    psrlq mm7, 8
    psllq mm6, 56
    por mm7, mm6
%endif
%if i<7
    psrlq mm0, 8
    psrlq mm1, 8
    psrlq mm2, 8
%endif
%assign i i+1
%endrep
    movq [dstq+wq], mm7
    add wq, 8
    jl .loop
    movzx r2d, byte [dstq-1]
    mov [leftq], r2d
    movzx r2d, byte [topq-1]
    mov [left_topq], r2d
    RET

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add srcq, wq
    add dstq, wq
    neg wq
%%.loop:
%if %2
    mova m1, [srcq+wq]
%else
    movu m1, [srcq+wq]
%endif
    mova m2, m1
    psllw m1, 8
    paddb m1, m2
    mova m2, m1
    pshufb m1, m3
    paddb m1, m2
    pshufb m0, m5
    mova m2, m1
    pshufb m1, m4
    paddb m1, m2
%if mmsize == 16
    mova m2, m1
    pshufb m1, m6
    paddb m1, m2
%endif
    paddb m0, m1
%if %1
    mova [dstq+wq], m0
%else
    movq [dstq+wq], m0
    movhps [dstq+wq+8], m0
%endif
    add wq, mmsize
    jl %%.loop
    mov eax, mmsize-1
    sub eax, wd
    movd m1, eax
    pshufb m0, m1
    movd eax, m0
    RET
%endmacro

; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
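; Rough C equivalent (sketch; a running byte-wise prefix sum, returning the
; final accumulator):
;     for (i = 0; i < w; i++)
;         left = dst[i] = (left + src[i]) & 0xff;
;     return left;
; ADD_HFYU_LEFT_LOOP computes the prefix sum within each vector in a few
; shift/shuffle + paddb steps instead of serially.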
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova m5, [pb_7]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    psllq m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova m5, [pb_f]
    mova m6, [pb_zzzzzzzz77777777]
    mova m4, [pb_zzzz3333zzzzbbbb]
    mova m3, [pb_zz11zz55zz99zzdd]
    movd m0, leftm
    pslldq m0, 15
    test srcq, 15
    jnz .src_unaligned
    test dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
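; Rough C equivalent (sketch):
;     for (i = 0; i < len; i++)
;         dst[i] = av_clip(src[i], min, max);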
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd m4, minm
    movd m5, maxm
%endif
    SPLATD m4
    SPLATD m5
.loop:
%assign %%i 1
%rep %2
    mova m0, [srcq+mmsize*0*%%i]
    mova m1, [srcq+mmsize*1*%%i]
    mova m2, [srcq+mmsize*2*%%i]
    mova m3, [srcq+mmsize*3*%%i]
%if %3
    mova m7, [srcq+mmsize*4*%%i]
    mova m8, [srcq+mmsize*5*%%i]
    mova m9, [srcq+mmsize*6*%%i]
    mova m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD m0, m4, m5, m6
    CLIPD m1, m4, m5, m6
    CLIPD m2, m4, m5, m6
    CLIPD m3, m4, m5, m6
%if %3
    CLIPD m7, m4, m5, m6
    CLIPD m8, m4, m5, m6
    CLIPD m9, m4, m5, m6
    CLIPD m10, m4, m5, m6
%endif
    mova [dstq+mmsize*0*%%i], m0
    mova [dstq+mmsize*1*%%i], m1
    mova [dstq+mmsize*2*%%i], m2
    mova [dstq+mmsize*3*%%i], m3
%if %3
    mova [dstq+mmsize*4*%%i], m7
    mova [dstq+mmsize*5*%%i], m8
    mova [dstq+mmsize*6*%%i], m9
    mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add srcq, mmsize*4*(%2+%3)
    add dstq, mmsize*4*(%2+%3)
    sub lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov r3, r2
    sar r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1 m0, [r1 + 0]
    mov%1 m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb m0, m2
    pshufb m1, m2
    mova [r0 + 0], m0
    mova [r0 + 16], m1
%else
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova m2, m0
    mova m3, m1
    psllw m0, 8
    psllw m1, 8
    psrlw m2, 8
    psrlw m3, 8
    por m2, m0
    por m3, m1
    mova [r0 + 0], m2
    mova [r0 + 16], m3
%endif
    add r0, 32
    add r1, 32
    dec r2
    jnz .loop8_%1
.left4_%1:
    mov r2, r3
    and r3, 4
    jz .left
    mov%1 m0, [r1]
%if cpuflag(ssse3)
    pshufb m0, m2
    mova [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova m2, m0
    psllw m0, 8
    psrlw m2, 8
    por m2, m0
    mova [r0], m2
%endif
    add r1, 16
    add r0, 16
%endmacro

; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
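; Rough C equivalent (sketch):
;     for (i = 0; i < w; i++)
;         dst[i] = av_bswap32(src[i]);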
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov r3, r1
    mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov r3, r1
%endif
    and r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov r3, r2
    and r2, 2
    jz .left1
    movq m0, [r1]
    pshufb m0, m2
    movq [r0], m0
    add r1, 8
    add r0, 8
.left1:
    and r3, 1
    jz .end
    mov r2d, [r1]
    bswap r2d
    mov [r0], r2d
%else
    and r2, 3
    jz .end
.loop2:
    mov r3d, [r1]
    bswap r3d
    mov [r0], r3d
    add r1, 4
    add r0, 4
    dec r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF

%macro H263_LOOP_FILTER 5
    pxor m7, m7
    mova m0, [%1]
    mova m1, [%1]
    mova m2, [%4]
    mova m3, [%4]
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    psubw m0, m2
    psubw m1, m3
    mova m2, [%2]
    mova m3, [%2]
    mova m4, [%3]
    mova m5, [%3]
    punpcklbw m2, m7
    punpckhbw m3, m7
    punpcklbw m4, m7
    punpckhbw m5, m7
    psubw m4, m2
    psubw m5, m3
    psllw m4, 2
    psllw m5, 2
    paddw m4, m0
    paddw m5, m1
    pxor m6, m6
    pcmpgtw m6, m4
    pcmpgtw m7, m5
    pxor m4, m6
    pxor m5, m7
    psubw m4, m6
    psubw m5, m7
    psrlw m4, 3
    psrlw m5, 3
    packuswb m4, m5
    packsswb m6, m7
    pxor m7, m7
    movd m2, %5
    punpcklbw m2, m2
    punpcklbw m2, m2
    punpcklbw m2, m2
    psubusb m2, m4
    mova m3, m2
    psubusb m3, m4
    psubb m2, m3
    mova m3, [%2]
    mova m4, [%3]
    pxor m3, m6
    pxor m4, m6
    paddusb m3, m2
    psubusb m4, m2
    pxor m3, m6
    pxor m4, m6
    paddusb m2, m2
    packsswb m0, m1
    pcmpgtb m7, m0
    pxor m0, m7
    psubb m0, m7
    mova m1, m0
    psubusb m0, m2
    psubb m1, m0
    pand m1, [pb_FC]
    psrlw m1, 2
    pxor m1, m7
    psubb m1, m7
    mova m5, [%1]
    mova m6, [%4]
    psubb m5, m1
    paddb m6, m1
%endmacro

INIT_MMX mmx
; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
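; Rough outline of the filter at each pixel column (sketch; p0..p3 are the
; two rows above and below the block edge):
;     strength = h263_loop_filter_strength[qscale];
;     d  = (p0 - p3 + 4 * (p2 - p1)) / 8;
;     d1 = sign(d) * max(0, min(|d|, 2*strength - |d|));
;     p1 += d1;  p2 -= d1;               // saturated to [0,255]
;     d2 = clip((p0 - p3) / 4, -|d1|/2, |d1|/2);
;     p0 -= d2;  p3 += d2;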
cglobal h263_v_loop_filter, 3,5
    movsxdifnidn r1, r1d
    movsxdifnidn r2, r2d
    lea r4, [h263_loop_filter_strength]
    movzx r3d, BYTE [r4+r2]
    movsx r2, r3b
    shl r2, 1
    mov r3, r0
    sub r3, r1
    mov r4, r3
    sub r4, r1
    H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
    mova [r3], m3
    mova [r0], m4
    mova [r4], m5
    mova [r0+r1], m6
    RET

%macro TRANSPOSE4X4 2
    movd m0, [%1]
    movd m1, [%1+r1]
    movd m2, [%1+r1*2]
    movd m3, [%1+r3]
    punpcklbw m0, m1
    punpcklbw m2, m3
    mova m1, m0
    punpcklwd m0, m2
    punpckhwd m1, m2
    movd [%2+ 0], m0
    punpckhdq m0, m0
    movd [%2+ 8], m0
    movd [%2+16], m1
    punpckhdq m1, m1
    movd [%2+24], m1
%endmacro

; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
INIT_MMX mmx
cglobal h263_h_loop_filter, 3,5,0,32
    movsxdifnidn r1, r1d
    movsxdifnidn r2, r2d
    lea r4, [h263_loop_filter_strength]
    movzx r3d, BYTE [r4+r2]
    movsx r2, r3b
    shl r2, 1
    sub r0, 2
    lea r3, [r1*3]
    TRANSPOSE4X4 r0, rsp
    lea r4, [r0+r1*4]
    TRANSPOSE4X4 r4, rsp+4
    H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
    mova m1, m5
    mova m0, m4
    punpcklbw m5, m3
    punpcklbw m4, m6
    punpckhbw m1, m3
    punpckhbw m0, m6
    mova m3, m5
    mova m6, m1
    punpcklwd m5, m4
    punpcklwd m1, m0
    punpckhwd m3, m4
    punpckhwd m6, m0
    movd [r0], m5
    punpckhdq m5, m5
    movd [r0+r1*1], m5
    movd [r0+r1*2], m3
    punpckhdq m3, m3
    movd [r0+r3], m3
    movd [r4], m1
    punpckhdq m1, m1
    movd [r4+r1*1], m1
    movd [r4+r1*2], m6
    punpckhdq m6, m6
    movd [r4+r3], m6
    RET