You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

653 lines
15KB

  1. ;******************************************************************************
  2. ;* MMX optimized DSP utils
  3. ;* Copyright (c) 2008 Loren Merritt
  4. ;* Copyright (c) 2003-2013 Michael Niedermayer
  5. ;* Copyright (c) 2013 Daniel Kang
  6. ;*
  7. ;* This file is part of FFmpeg.
  8. ;*
  9. ;* FFmpeg is free software; you can redistribute it and/or
  10. ;* modify it under the terms of the GNU Lesser General Public
  11. ;* License as published by the Free Software Foundation; either
  12. ;* version 2.1 of the License, or (at your option) any later version.
  13. ;*
  14. ;* FFmpeg is distributed in the hope that it will be useful,
  15. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  16. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  17. ;* Lesser General Public License for more details.
  18. ;*
  19. ;* You should have received a copy of the GNU Lesser General Public
  20. ;* License along with FFmpeg; if not, write to the Free Software
  21. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  22. ;******************************************************************************
  23. %include "libavutil/x86/x86util.asm"
  SECTION_RODATA

  ; pshufb control masks and rounding constants shared by the routines below.
  ; In the mask names "z" marks a control byte with its top bit set (-1 / 0xff),
  ; for which pshufb writes a zero byte; a digit is the source byte index.

  ; broadcast byte 15 to every lane
  pb_f:                times 16 db 0x0f

  ; Only 8 bytes are defined here on purpose: the sse4 add_hfyu_left_prediction
  ; loads 16 bytes from this label, so the 8 bytes of pb_7 that follow supply
  ; the "77777777" half.  Keep these two definitions adjacent and in this order.
  pb_zzzzzzzz77777777: times 8 db 0xff
  ; broadcast byte 7 to every lane (64-bit mask)
  pb_7:                times 8 db 7

  pb_zzzz3333zzzzbbbb: db -1, -1, -1, -1,  3,  3,  3,  3, -1, -1, -1, -1, 11, 11, 11, 11
  pb_zz11zz55zz99zzdd: db -1, -1,  1,  1, -1, -1,  5,  5, -1, -1,  9,  9, -1, -1, 13, 13

  ; reverse the order of the eight 16-bit words of a register
  pb_revwords:         SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0

  ; 1 << 14: rounding term added before an arithmetic shift right by 15
  pd_16384:            times 4 dd (1 << 14)

  ; reverse the four bytes inside every 32-bit lane
  pb_bswap32:          db  3,  2,  1,  0,  7,  6,  5,  4, 11, 10,  9,  8, 15, 14, 13, 12

  SECTION_TEXT
  ;-----------------------------------------------------------------------------
  ; SCALARPRODUCT: emits scalarproduct_int16 and scalarproduct_and_madd_int16
  ; for the instruction set selected by the preceding INIT_MMX / INIT_XMM.
  ;-----------------------------------------------------------------------------
  %macro SCALARPRODUCT 0
  ; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
  ;
  ; Returns the sum of v1[i] * v2[i], accumulated in 32-bit lanes with paddd.
  ; Every pass consumes 2*mmsize bytes of both inputs and the loop body runs at
  ; least once, so order is expected to be a non-zero multiple of mmsize
  ; elements.  v1 is loaded with movu; v2 is used directly as the memory
  ; operand of pmaddwd.
  cglobal scalarproduct_int16, 3,3,3, v1, v2, order
      ; order is a 32-bit int.  Shifting orderd (not orderq) clears bits 63:32
      ; on x86-64, which the calling convention leaves undefined, before the
      ; register is used as a 64-bit index below.
      shl     orderd, 1               ; element count -> byte count
      add     v1q, orderq             ; point both inputs at their end ...
      add     v2q, orderq
      neg     orderq                  ; ... and run a negative offset up to 0
      pxor    m2, m2                  ; m2 = dword accumulators
  .loop:
      movu    m0, [v1q + orderq]
      movu    m1, [v1q + orderq + mmsize]
      pmaddwd m0, [v2q + orderq]
      pmaddwd m1, [v2q + orderq + mmsize]
      paddd   m2, m0
      paddd   m2, m1
      add     orderq, mmsize*2
      jl      .loop
      ; horizontal add of the dword lanes of m2 into its low dword
  %if mmsize == 16
      movhlps m0, m2
      paddd   m2, m0
      pshuflw m0, m2, 0x4e
  %else
      pshufw  m0, m2, 0x4e
  %endif
      paddd   m2, m0
      movd    eax, m2
      RET

  ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
  ;
  ; Returns the sum of v1[i] * v2[i] computed from the values v1 held on entry,
  ; and updates v1 in place: v1[i] += mul * v3[i] (low 16 bits of the product,
  ; wrapping add).  v1 is accessed with mova; v2 and v3 are loaded with movu.
  ; As above, 2*mmsize bytes are handled per pass and the body runs at least
  ; once.
  cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
      ; 32-bit shift for the same reason as in scalarproduct_int16: do not let
      ; undefined upper bits of the int argument reach the 64-bit index.
      shl     orderd, 1               ; element count -> byte count
      movd    m7, mulm
      ; broadcast the low 16 bits of mul to every word of m7
  %if mmsize == 16
      pshuflw m7, m7, 0
      punpcklqdq m7, m7
  %else
      pshufw  m7, m7, 0
  %endif
      pxor    m6, m6                  ; m6 = dword accumulators
      add     v1q, orderq
      add     v2q, orderq
      add     v3q, orderq
      neg     orderq
  .loop:
      movu    m0, [v2q + orderq]
      movu    m1, [v2q + orderq + mmsize]
      mova    m4, [v1q + orderq]      ; old v1, needed for product and update
      mova    m5, [v1q + orderq + mmsize]
      movu    m2, [v3q + orderq]
      movu    m3, [v3q + orderq + mmsize]
      pmaddwd m0, m4                  ; v2 * old v1
      pmaddwd m1, m5
      pmullw  m2, m7                  ; v3 * mul
      pmullw  m3, m7
      paddd   m6, m0
      paddd   m6, m1
      paddw   m2, m4                  ; new v1 = old v1 + v3 * mul
      paddw   m3, m5
      mova    [v1q + orderq], m2
      mova    [v1q + orderq + mmsize], m3
      add     orderq, mmsize*2
      jl      .loop
      ; horizontal add of the dword lanes of m6 into its low dword
  %if mmsize == 16
      movhlps m0, m6
      paddd   m6, m0
      pshuflw m0, m6, 0x4e
  %else
      pshufw  m0, m6, 0x4e
  %endif
      paddd   m6, m0
      movd    eax, m6
      RET
  %endmacro

  INIT_MMX mmxext
  SCALARPRODUCT
  INIT_XMM sse2
  SCALARPRODUCT
  ; SCALARPRODUCT_LOOP shift
  ;
  ; One specialised inner loop of the ssse3 scalarproduct_and_madd_int16.
  ; %1 is the byte misalignment (0, 2, ... 14) of the original v2/v3 pointers;
  ; the caller has already rounded both pointers down to 16 bytes.
  ;
  ; Register contract with the caller:
  ;   m4 / m5  aligned v2 / v3 block loaded at the previous (higher) position
  ;   m6       dword accumulators for the scalar product
  ;   m7       mul broadcast to every word
  ;   orderq   byte offset, walked downwards in steps of 2*mmsize
  ; For %1 != 0 the loop jumps to .end when done; the %1 == 0 instance falls
  ; through, so it has to be emitted last, directly in front of .end.
  %macro SCALARPRODUCT_LOOP 1
  align 16
  .loop%1:
      sub     orderq, mmsize*2        ; flags from here feed the jg below
  %if %1
      ; keep the previously loaded blocks before m4/m5 are reloaded
      mova    m1, m4
      mova    m3, m5
      mova    m4, [v2q + orderq]
      mova    m0, [v2q + orderq + mmsize]
      mova    m5, [v3q + orderq]
      mova    m2, [v3q + orderq + mmsize]
      ; stitch neighbouring aligned blocks into the unaligned data
      palignr m1, m0, %1
      palignr m0, m4, %1
      palignr m3, m2, %1
      palignr m2, m5, %1
  %else
      mova    m0, [v2q + orderq]
      mova    m1, [v2q + orderq + mmsize]
      mova    m2, [v3q + orderq]
      mova    m3, [v3q + orderq + mmsize]
  %endif
      ; t0/t1 = old v1 data: held in m8/m9 on x86-64, re-read from memory
      ; otherwise.
  %if ARCH_X86_64
      mova    m8, [v1q + orderq]
      mova    m9, [v1q + orderq + mmsize]
      %define t0 m8
      %define t1 m9
  %else
      %define t0 [v1q + orderq]
      %define t1 [v1q + orderq + mmsize]
  %endif
      pmaddwd m0, t0                  ; v2 * old v1
      pmaddwd m1, t1
      paddd   m6, m0
      paddd   m6, m1
      pmullw  m2, m7                  ; v3 * mul
      pmullw  m3, m7
      paddw   m2, t0                  ; new v1 = old v1 + v3 * mul
      paddw   m3, t1
      mova    [v1q + orderq], m2
      mova    [v1q + orderq + mmsize], m3
      jg      .loop%1                 ; continue while orderq > 0
  %if %1
      jmp     .end
  %endif
  %endmacro
  ; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
  ;
  ; SSSE3 variant.  v2 and v3 are rounded down to a 16-byte boundary and read
  ; with aligned loads only; the unaligned data is rebuilt with palignr by one
  ; of eight loops specialised for the misalignment.  Only v2's misalignment is
  ; measured and the same shift is applied to v3.  v1 is accessed with mova.
  ; The buffers are walked from the end towards the start, 2*mmsize bytes per
  ; pass, and the selected loop runs at least once.
  INIT_XMM ssse3
  cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
      ; order is a 32-bit int: shift orderd so that bits 63:32 of the register
      ; (undefined on x86-64) are cleared before orderq is used as an index.
      shl     orderd, 1               ; element count -> byte count
      ; read mul before r4 is overwritten below (the argument may live there)
      movd    m7, mulm
      pshuflw m7, m7, 0
      punpcklqdq m7, m7               ; m7 = mul in every word
      pxor    m6, m6                  ; m6 = dword accumulators
      mov     r4d, v2d
      and     r4d, 15                 ; r4d = misalignment of v2 in bytes
      and     v2q, ~15
      and     v3q, ~15
      ; prime m4/m5 with the aligned blocks at the end position; the
      ; misaligned loops use them as the "previous" block
      mova    m4, [v2q + orderq]
      mova    m5, [v3q + orderq]
      ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
      test    r4d, r4d
      jz      .loop0
      cmp     r4d, 2
      je      .loop2
      cmp     r4d, 4
      je      .loop4
      cmp     r4d, 6
      je      .loop6
      cmp     r4d, 8
      je      .loop8
      cmp     r4d, 10
      je      .loop10
      cmp     r4d, 12
      je      .loop12
      ; any other value falls through into the shift-14 loop
  SCALARPRODUCT_LOOP 14
  SCALARPRODUCT_LOOP 12
  SCALARPRODUCT_LOOP 10
  SCALARPRODUCT_LOOP 8
  SCALARPRODUCT_LOOP 6
  SCALARPRODUCT_LOOP 4
  SCALARPRODUCT_LOOP 2
  SCALARPRODUCT_LOOP 0
  .end:
      ; horizontal add of the dword lanes of m6 into its low dword
      movhlps m0, m6
      paddd   m6, m0
      pshuflw m0, m6, 0x4e
      paddd   m6, m0
      movd    eax, m6
      RET
  198. ;-----------------------------------------------------------------------------
  199. ; void ff_apply_window_int16(int16_t *output, const int16_t *input,
  200. ; const int16_t *window, unsigned int len)
  201. ;-----------------------------------------------------------------------------
  ; REVERSE_WORDS reg [, mask]
  ;
  ; Reverses the order of the 16-bit words held in %1, in place.
  ;   ssse3 (non-atom): a single pshufb with the control mask supplied in %2
  ;   sse2:             reverse the words of each 64-bit half, then swap halves
  ;   mmxext:           one pshufw over the four words
  ; %2 is only referenced by the pshufb form.
  %macro REVERSE_WORDS 1-2
  %if cpuflag(ssse3) && notcpuflag(atom)
      pshufb  %1, %2
  %elif cpuflag(sse2)
      pshuflw %1, %1, 00011011b       ; words 3,2,1,0 of the low half
      pshufhw %1, %1, 00011011b       ; words 3,2,1,0 of the high half
      pshufd  %1, %1, 01001110b       ; swap the two 64-bit halves
  %elif cpuflag(mmxext)
      pshufw  %1, %1, 00011011b
  %endif
  %endmacro
  ; MUL16FIXED dst, src, tmp
  ;
  ; Per-word signed fixed-point multiply of %1 by %2, result left in %1.
  ;   ssse3:  dst = ((dst * src) + (1<<14)) >> 15   (pmulhrsw, %3 is unused)
  ;   mmxext: dst = (dst * src) >> 15               (no rounding, %3 clobbered)
  ; In the mmxext form %3 must be a register distinct from %1 and %2.
  %macro MUL16FIXED 3
  %if cpuflag(ssse3) ; dst, src, unused
      pmulhrsw %1, %2
  %elif cpuflag(mmxext) ; dst, src, temp
      ; pmulhw keeps product bits 31..16, so bit 15 is missing after the
      ; result is shifted left by one; recover it from the pmullw result.
      mova    %3, %1
      pmullw  %3, %2                  ; low 16 bits of the product
      pmulhw  %1, %2                  ; high 16 bits of the product
      psrlw   %3, 15                  ; product bit 15 -> bit 0
      psllw   %1, 1                   ; product bits 30..16 -> bits 15..1
      por     %1, %3
  %endif
  %endmacro
  ;-----------------------------------------------------------------------------
  ; APPLY_WINDOW_INT16 bitexact
  ;
  ; Emits apply_window_int16 (%1 = 1) or apply_window_int16_round (%1 = 0):
  ;   (int16_t *output, const int16_t *input, const int16_t *window,
  ;    unsigned int len)
  ;
  ; The fourth argument is used directly as a byte offset ("offset"): it starts
  ; at len and moves up by mmsize per pass while offset2 starts at len - mmsize
  ; and moves down, so each pass handles one vector on either side of byte
  ; position len.  The window is read at offset2 only and word-reversed for the
  ; block at offset.  All memory accesses use mova.
  ;-----------------------------------------------------------------------------
  %macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
  %if %1
  cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
  %else
  cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
  %endif
  %if ARCH_X86_64
      ; len is a 32-bit unsigned int: clear bits 63:32 of its register, which
      ; the calling convention leaves undefined, before offsetq is used in the
      ; lea and in the first pass's addressing.
      mov     offsetd, offsetd
  %endif
      lea     offset2q, [offsetq-mmsize]
  %if cpuflag(ssse3) && notcpuflag(atom)
      mova    m5, [pb_revwords]       ; pshufb mask for REVERSE_WORDS
      ALIGN 16
  %elif %1
      mova    m5, [pd_16384]          ; rounding term for the 32-bit path
  %endif
  .loop:
  %if cpuflag(ssse3)
      ; This version does the 16x16->16 multiplication in-place without expanding
      ; to 32-bit. The ssse3 version is bit-identical.
      mova    m0, [windowq+offset2q]
      mova    m1, [ inputq+offset2q]
      pmulhrsw m1, m0
      REVERSE_WORDS m0, m5
      pmulhrsw m0, [ inputq+offsetq ]
      mova    [outputq+offset2q], m1
      mova    [outputq+offsetq ], m0
  %elif %1
      ; This version expands 16-bit to 32-bit, multiplies by the window,
      ; adds 16384 for rounding, right shifts 15, then repacks back to words to
      ; save to the output. The window is reversed for the second half.
      ;
      ; m1 is interleaved without being cleared first: its stale words pair up
      ; with the zero words of m0/m2 inside pmaddwd and so contribute nothing.
      mova    m3, [windowq+offset2q]
      mova    m4, [ inputq+offset2q]
      pxor    m0, m0
      punpcklwd m0, m3                ; (0, window) pairs, low half
      punpcklwd m1, m4                ; (stale, input) pairs, low half
      pmaddwd m0, m1
      paddd   m0, m5
      psrad   m0, 15
      pxor    m2, m2
      punpckhwd m2, m3                ; (0, window) pairs, high half
      punpckhwd m1, m4
      pmaddwd m2, m1
      paddd   m2, m5
      psrad   m2, 15
      packssdw m0, m2
      mova    [outputq+offset2q], m0
      ; same computation for the block at offset, with the window reversed
      REVERSE_WORDS m3
      mova    m4, [ inputq+offsetq]
      pxor    m0, m0
      punpcklwd m0, m3
      punpcklwd m1, m4
      pmaddwd m0, m1
      paddd   m0, m5
      psrad   m0, 15
      pxor    m2, m2
      punpckhwd m2, m3
      punpckhwd m1, m4
      pmaddwd m2, m1
      paddd   m2, m5
      psrad   m2, 15
      packssdw m0, m2
      mova    [outputq+offsetq], m0
  %else
      ; This version does the 16x16->16 multiplication in-place without expanding
      ; to 32-bit. The mmxext and sse2 versions do not use rounding, and
      ; therefore are not bit-identical to the C version.
      mova    m0, [windowq+offset2q]
      mova    m1, [ inputq+offset2q]
      mova    m2, [ inputq+offsetq ]
      MUL16FIXED m1, m0, m3
      REVERSE_WORDS m0
      MUL16FIXED m2, m0, m3
      mova    [outputq+offset2q], m1
      mova    [outputq+offsetq ], m2
  %endif
      add     offsetd, mmsize
      sub     offset2d, mmsize
      jae     .loop                   ; stop once offset2 borrows below zero
      REP_RET
  %endmacro

  ; %1 = 0: non-bitexact variants
  INIT_MMX mmxext
  APPLY_WINDOW_INT16 0
  INIT_XMM sse2
  APPLY_WINDOW_INT16 0

  ; %1 = 1: bitexact variants
  INIT_MMX mmxext
  APPLY_WINDOW_INT16 1
  INIT_XMM sse2
  APPLY_WINDOW_INT16 1
  INIT_XMM ssse3
  APPLY_WINDOW_INT16 1
  INIT_XMM ssse3, atom
  APPLY_WINDOW_INT16 1
  ; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
  ;
  ; For every byte: dst = median(l, t, l + t - tl) + diff, with byte-wise
  ; wrapping arithmetic, where t is the byte from top, tl the top byte one
  ; position to the left and l the previously produced dst byte.  The initial
  ; l and tl are read from *left and *left_top; on return both are updated
  ; with the last dst and top byte.  Data is handled in whole groups of 8
  ; bytes, and the 8 bytes inside a group are produced one after another
  ; because each needs the previous result as its l.
  ;
  ; Register use inside a group (the low byte is the current position):
  ;   mm0 = t - tl   mm1 = t   mm2 = diff   mm3 = l / result
  ;   mm7 = collects the 8 results, shifted in from the top byte
  INIT_MMX mmxext
  cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
      ; w is a 32-bit int but is used as a 64-bit index below; sign-extend it
      ; because the upper half of its register is undefined on x86-64.
      movsxdifnidn wq, wd
      movq    mm0, [topq]
      movq    mm2, mm0
      movd    mm4, [left_topq]
      psllq   mm2, 8
      movq    mm1, mm0
      por     mm4, mm2                ; tl for the group: left_top, then top << 8
      movd    mm3, [leftq]
      psubb   mm0, mm4 ; t-tl
      add     dstq, wq                ; point at the ends, index with -w .. 0
      add     topq, wq
      add     diffq, wq
      neg     wq
      jmp     .skip                   ; the first group's t / tl are set up above
  .loop:
      movq    mm4, [topq+wq]
      movq    mm0, mm4
      psllq   mm4, 8
      por     mm4, mm1                ; low byte of mm1 = last t of previous group
      movq    mm1, mm0 ; t
      psubb   mm0, mm4 ; t-tl
  .skip:
      movq    mm2, [diffq+wq]
  %assign i 0
  %rep 8
      movq    mm4, mm0
      paddb   mm4, mm3 ; t-tl+l
      movq    mm5, mm3
      pmaxub  mm3, mm1                ; max(l, t)
      pminub  mm5, mm1                ; min(l, t)
      pminub  mm3, mm4
      pmaxub  mm3, mm5 ; median
      paddb   mm3, mm2 ; +residual
      ; append the new byte at the top of mm7; after 8 steps the first
      ; result has moved down to the low byte
  %if i==0
      movq    mm7, mm3
      psllq   mm7, 56
  %else
      movq    mm6, mm3
      psrlq   mm7, 8
      psllq   mm6, 56
      por     mm7, mm6
  %endif
  %if i<7
      ; advance the inputs to the next byte position
      psrlq   mm0, 8
      psrlq   mm1, 8
      psrlq   mm2, 8
  %endif
  %assign i i+1
  %endrep
      movq    [dstq+wq], mm7
      add     wq, 8
      jl      .loop
      ; hand the last produced byte and the last top byte back to the caller
      ; (r2 held diff, which is no longer needed)
      movzx   r2d, byte [dstq-1]
      mov     [leftq], r2d
      movzx   r2d, byte [topq-1]
      mov     [left_topq], r2d
      RET
  ; ADD_HFYU_LEFT_LOOP dst_is_aligned, src_is_aligned
  ;
  ; Loop and epilogue shared by the add_hfyu_left_prediction variants: writes
  ; the running byte sum of src to dst, mmsize bytes per pass, and ends with
  ; RET.  Expected on entry:
  ;   m0          previous output byte in the top byte of the register
  ;   m3, m4, m6  prefix-sum shuffle masks (m6 is only used when mmsize == 16)
  ;   m5          mask that broadcasts the top byte of m0
  ; The low byte of eax on return is the last byte written for the requested
  ; width; the other bytes of eax are copies of byte 0 of the last output
  ; vector.
  %macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
      add     srcq, wq                ; point at the ends, index with -w .. 0
      add     dstq, wq
      neg     wq
  %%.loop:
  %if %2
      mova    m1, [srcq+wq]
  %else
      movu    m1, [srcq+wq]
  %endif
      ; prefix sum inside the register, doubling the group size per step
      mova    m2, m1
      psllw   m1, 8
      paddb   m1, m2                  ; pairs: odd byte += even byte
      mova    m2, m1
      pshufb  m1, m3
      paddb   m1, m2                  ; groups of 4
      mova    m2, m1
      pshufb  m1, m4
      paddb   m1, m2                  ; groups of 8
  %if mmsize == 16
      mova    m2, m1
      pshufb  m1, m6
      paddb   m1, m2                  ; all 16 bytes
  %endif
      ; add the carry-in: last byte of the previous output, broadcast
      pshufb  m0, m5
      paddb   m0, m1
  %if %1
      mova    [dstq+wq], m0
  %else
      movq    [dstq+wq], m0
      movhps  [dstq+wq+8], m0
  %endif
      add     wq, mmsize
      jl      %%.loop
      ; w now holds the overshoot past the requested width; pick the byte
      ; at index mmsize-1-w of the last output vector as the return value
      mov     eax, mmsize-1
      sub     eax, wd
      movd    m1, eax
      pshufb  m0, m1
      movd    eax, m0
      RET
  %endmacro
  ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
  ;
  ; 64-bit (MMX register) SSSE3 variant: 8 bytes per pass, src and dst accessed
  ; with mova.  Only the low byte of left is used as the initial carry-in.
  INIT_MMX ssse3
  cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
  .skip_prologue:
      ; w is a 32-bit int but is used as a 64-bit index by the loop; sign-extend
      ; it because the upper half of its register is undefined on x86-64.
      movsxdifnidn wq, wd
      mova    m5, [pb_7]              ; broadcast byte 7 (carry between passes)
      mova    m4, [pb_zzzz3333zzzzbbbb]
      mova    m3, [pb_zz11zz55zz99zzdd]
      movd    m0, leftm
      psllq   m0, 56                  ; low byte of left -> top byte of m0
      ADD_HFYU_LEFT_LOOP 1, 1
  ; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
  ;
  ; 128-bit variant: 16 bytes per pass.  Dispatches on pointer alignment to one
  ; of three copies of the loop: both aligned, only src aligned, or src
  ; unaligned (in which case dst is stored unaligned as well).  Only the low
  ; byte of left is used as the initial carry-in.
  INIT_XMM sse4
  cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
      ; w is a 32-bit int but is used as a 64-bit index by the loops; sign-extend
      ; it because the upper half of its register is undefined on x86-64.
      movsxdifnidn wq, wd
      mova    m5, [pb_f]              ; broadcast byte 15 (carry between passes)
      ; 16-byte load: picks up the 8 bytes of pb_7 stored right after the label
      mova    m6, [pb_zzzzzzzz77777777]
      mova    m4, [pb_zzzz3333zzzzbbbb]
      mova    m3, [pb_zz11zz55zz99zzdd]
      movd    m0, leftm
      pslldq  m0, 15                  ; low byte of left -> top byte of m0
      test    srcq, 15
      jnz     .src_unaligned
      test    dstq, 15
      jnz     .dst_unaligned
      ; every expansion below ends in RET, so the paths never fall through
      ADD_HFYU_LEFT_LOOP 1, 1
  .dst_unaligned:
      ADD_HFYU_LEFT_LOOP 0, 1
  .src_unaligned:
      ADD_HFYU_LEFT_LOOP 0, 0
  ;-----------------------------------------------------------------------------
  ; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
  ;                           int32_t max, unsigned int len)
  ;
  ; Clamps every element of src to [min, max] and stores it to dst.  src and dst
  ; are accessed with mova.  The loop body runs at least once and handles a
  ; fixed number of vectors per pass, so len is expected to be a non-zero
  ; multiple of that amount.
  ;-----------------------------------------------------------------------------
  ; %1 = number of xmm registers used
  ; %2 = number of inline load/process/store loops per asm loop
  ; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
  ; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
  ; %5 = suffix
  ;
  ; One asm loop pass therefore covers %2 * 4*(1+%3) vectors.  %%i is the index
  ; of the first vector of the current inline loop and advances by 4*(1+%3).
  ; (It used to be a multiplier starting at 1, which for %2 = 2 made the second
  ; inline loop touch vectors 0, 2, 4, 6 instead of 4..7.)
  %macro VECTOR_CLIP_INT32 4-5
  cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
  %if %4
      cvtsi2ss  m4, minm
      cvtsi2ss  m5, maxm
  %else
      movd      m4, minm
      movd      m5, maxm
  %endif
      SPLATD    m4                    ; m4 = min in every lane
      SPLATD    m5                    ; m5 = max in every lane
  .loop:
  %assign %%i 0
  %rep %2
      mova      m0,  [srcq+mmsize*(0+%%i)]
      mova      m1,  [srcq+mmsize*(1+%%i)]
      mova      m2,  [srcq+mmsize*(2+%%i)]
      mova      m3,  [srcq+mmsize*(3+%%i)]
  %if %3
      mova      m7,  [srcq+mmsize*(4+%%i)]
      mova      m8,  [srcq+mmsize*(5+%%i)]
      mova      m9,  [srcq+mmsize*(6+%%i)]
      mova      m10, [srcq+mmsize*(7+%%i)]
  %endif
      CLIPD     m0,  m4, m5, m6
      CLIPD     m1,  m4, m5, m6
      CLIPD     m2,  m4, m5, m6
      CLIPD     m3,  m4, m5, m6
  %if %3
      CLIPD     m7,  m4, m5, m6
      CLIPD     m8,  m4, m5, m6
      CLIPD     m9,  m4, m5, m6
      CLIPD     m10, m4, m5, m6
  %endif
      mova      [dstq+mmsize*(0+%%i)], m0
      mova      [dstq+mmsize*(1+%%i)], m1
      mova      [dstq+mmsize*(2+%%i)], m2
      mova      [dstq+mmsize*(3+%%i)], m3
  %if %3
      mova      [dstq+mmsize*(4+%%i)], m7
      mova      [dstq+mmsize*(5+%%i)], m8
      mova      [dstq+mmsize*(6+%%i)], m9
      mova      [dstq+mmsize*(7+%%i)], m10
  %endif
  %assign %%i %%i+4*(1+%3)
  %endrep
      ; advance by the bytes handled above; len counts 4-byte elements
      add       srcq, mmsize*4*%2*(1+%3)
      add       dstq, mmsize*4*%2*(1+%3)
      sub       lend, mmsize*%2*(1+%3)
      jg        .loop
      REP_RET
  %endmacro

  ; CLIPD selects the clamp helper used by the instantiations that follow it.
  INIT_MMX mmx
  %define CLIPD CLIPD_MMX
  VECTOR_CLIP_INT32 0, 1, 0, 0
  INIT_XMM sse2
  ; integer SSE2 variant, still using CLIPD_MMX
  VECTOR_CLIP_INT32 6, 1, 0, 0, _int
  ; float-compare SSE2 variant, two inline loops per pass
  %define CLIPD CLIPD_SSE2
  VECTOR_CLIP_INT32 6, 2, 0, 1
  INIT_XMM sse4
  %define CLIPD CLIPD_SSE41
  ; use the 8-vector form only when registers m8 and up exist
  %ifdef m8
  VECTOR_CLIP_INT32 11, 1, 1, 0
  %else
  VECTOR_CLIP_INT32 6, 1, 0, 0
  %endif
  ; BSWAP_LOOPS a|u
  ;
  ; Vector part of bswap32_buf.  %1 selects the load instruction (mova / movu)
  ; used for src; stores always use mova, so dst has to be 16-byte aligned.
  ;   r0 = dst, r1 = src, r2 = w (number of 32-bit words), r3 = scratch
  ;   m2 = pb_bswap32 mask in the ssse3 build, scratch otherwise
  ; Handles w / 8 blocks of 8 words, then one block of 4 words if bit 2 of w
  ; is set.  r0/r1 are advanced past what was processed and r2 holds w again
  ; when the macro is left, either by falling through or by jumping to .left.
  ;
  ; w is a 32-bit int, so only the 32-bit registers are used for it: the upper
  ; half of r2 is undefined on x86-64 and must not influence the trip count.
  %macro BSWAP_LOOPS 1
      mov     r3d, r2d                ; keep w for the tail decisions
      sar     r2d, 3                  ; number of 8-word blocks
      jz      .left4_%1
  .loop8_%1:
      mov%1   m0, [r1 +  0]
      mov%1   m1, [r1 + 16]
  %if cpuflag(ssse3)
      pshufb  m0, m2
      pshufb  m1, m2
      mova    [r0 +  0], m0
      mova    [r0 + 16], m1
  %else
      ; swap the two 16-bit halves of every dword ...
      pshuflw m0, m0, 10110001b
      pshuflw m1, m1, 10110001b
      pshufhw m0, m0, 10110001b
      pshufhw m1, m1, 10110001b
      ; ... then the two bytes of every word
      mova    m2, m0
      mova    m3, m1
      psllw   m0, 8
      psllw   m1, 8
      psrlw   m2, 8
      psrlw   m3, 8
      por     m2, m0
      por     m3, m1
      mova    [r0 +  0], m2
      mova    [r0 + 16], m3
  %endif
      add     r0, 32
      add     r1, 32
      dec     r2d
      jnz     .loop8_%1
  .left4_%1:
      mov     r2d, r3d                ; restore w for the code after the macro
      and     r3d, 4                  ; one more block of 4 words?
      jz      .left
      mov%1   m0, [r1]
  %if cpuflag(ssse3)
      pshufb  m0, m2
      mova    [r0], m0
  %else
      pshuflw m0, m0, 10110001b
      pshufhw m0, m0, 10110001b
      mova    m2, m0
      psllw   m0, 8
      psrlw   m2, 8
      por     m2, m0
      mova    [r0], m2
  %endif
      add     r1, 16
      add     r0, 16
  %endmacro
  ; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
  ;
  ; Byte-swaps w 32-bit words from src (r1) into dst (r0).  The bulk of the
  ; work is done by BSWAP_LOOPS, in an aligned-load or unaligned-load flavour
  ; depending on src; the remaining w & 3 words are handled here.
  %macro BSWAP32_BUF 0
  %if cpuflag(ssse3)
  cglobal bswap32_buf, 3,4,3
      mova    m2, [pb_bswap32]        ; pshufb mask, kept in m2 throughout
  %else
  cglobal bswap32_buf, 3,4,5
  %endif
      test    r1, 15                  ; is src 16-byte aligned?
      jz      .start_align
      BSWAP_LOOPS u
      jmp     .left
  .start_align:
      BSWAP_LOOPS a
  .left:
      ; here r2 = w again and r0/r1 point at the first unprocessed word
  %if cpuflag(ssse3)
      test    r2d, 2                  ; two words left?
      jz      .left1
      movq    m0, [r1]
      pshufb  m0, m2
      movq    [r0], m0
      add     r1, 8
      add     r0, 8
  .left1:
      test    r2d, 1                  ; one word left?
      jz      .end
      mov     r3d, [r1]
      bswap   r3d
      mov     [r0], r3d
  %else
      and     r2, 3                   ; 0..3 words left, one at a time
      jz      .end
  .loop2:
      mov     r3d, [r1]
      bswap   r3d
      mov     [r0], r3d
      add     r1, 4
      add     r0, 4
      dec     r2
      jnz     .loop2
  %endif
  .end:
      RET
  %endmacro

  INIT_XMM sse2
  BSWAP32_BUF
  INIT_XMM ssse3
  BSWAP32_BUF
  624. BSWAP32_BUF