;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
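; order counts int16 elements; convert it to a byte offset, point both
; vectors at their end and walk a negative offset up towards zero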
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
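; horizontally add the dword partial sums in m2 down to a single result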
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
    RET

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
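; computes the dot product of v1 and v2 while also doing v1 += mul * v3
; in the same pass; mul is broadcast to every word of m7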
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT

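; %1 = misalignment of v2 (and v3) in bytes; each pair of aligned loads is
; stitched back together with palignr so the inner loop never issues
; unaligned loads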
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
    %define t0 m8
    %define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
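; r4d = misalignment of v2; round v2/v3 down to 16-byte alignment and let
; the palignr loops shift the data back into place (the code relies on v3
; sharing v2's alignment offset)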
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches
; taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET

; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
;                                           const uint8_t *diff, int w,
;                                           int *left, int *left_top)
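; dst[i] = diff[i] + median(l, t, l+t-tl), where t = top[i], tl = top[i-1]
; and l is the previous output byte; bytes are processed serially because
; each result becomes the next l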
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET

%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
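; computes a byte-wise running sum within each mmsize-byte block using
; log2(mmsize) shift-and-add steps, then adds the carry from the previous
; block (broadcast into m0) on top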
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
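; return the last in-bounds output byte (the new 'left'): wd now holds the
; overshoot past the end of the buffer, so mmsize-1-wd indexes that byte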
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src,
;                                 int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------

; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd    m4, minm
    movd    m5, maxm
%endif
    SPLATD  m4
    SPLATD  m5
.loop:
%assign %%i 1
%rep %2
    mova    m0, [srcq+mmsize*0*%%i]
    mova    m1, [srcq+mmsize*1*%%i]
    mova    m2, [srcq+mmsize*2*%%i]
    mova    m3, [srcq+mmsize*3*%%i]
%if %3
    mova    m7, [srcq+mmsize*4*%%i]
    mova    m8, [srcq+mmsize*5*%%i]
    mova    m9, [srcq+mmsize*6*%%i]
    mova    m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD   m0, m4, m5, m6
    CLIPD   m1, m4, m5, m6
    CLIPD   m2, m4, m5, m6
    CLIPD   m3, m4, m5, m6
%if %3
    CLIPD   m7, m4, m5, m6
    CLIPD   m8, m4, m5, m6
    CLIPD   m9, m4, m5, m6
    CLIPD   m10, m4, m5, m6
%endif
    mova    [dstq+mmsize*0*%%i], m0
    mova    [dstq+mmsize*1*%%i], m1
    mova    [dstq+mmsize*2*%%i], m2
    mova    [dstq+mmsize*3*%%i], m3
%if %3
    mova    [dstq+mmsize*4*%%i], m7
    mova    [dstq+mmsize*5*%%i], m8
    mova    [dstq+mmsize*6*%%i], m9
    mova    [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add     srcq, mmsize*4*(%2+%3)
    add     dstq, mmsize*4*(%2+%3)
    sub     lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
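; m8-m10 exist only on x86-64, so only build the 8*mmsize-per-iteration
; variant there; otherwise fall back to the 4*mmsize version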
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif

; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov     r3, r2
    sar     r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1   m0, [r1 + 0]
    mov%1   m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb  m0, m2
    pshufb  m1, m2
    mov%1   [r0 + 0], m0
    mov%1   [r0 + 16], m1
%else
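; no pshufb: swap the two 16-bit halves of each dword with pshuflw/pshufhw,
; then swap the bytes within each word using shifts and por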
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova    m2, m0
    mova    m3, m1
    psllw   m0, 8
    psllw   m1, 8
    psrlw   m2, 8
    psrlw   m3, 8
    por     m2, m0
    por     m3, m1
    mov%1   [r0 + 0], m2
    mov%1   [r0 + 16], m3
%endif
    add     r0, 32
    add     r1, 32
    dec     r2
    jnz .loop8_%1
.left4_%1:
    mov     r2, r3
    and     r3, 4
    jz .left
    mov%1   m0, [r1]
%if cpuflag(ssse3)
    pshufb  m0, m2
    mov%1   [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova    m2, m0
    psllw   m0, 8
    psrlw   m2, 8
    por     m2, m0
    mov%1   [r0], m2
%endif
    add     r1, 16
    add     r0, 16
%endmacro

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov     r3, r1
    mova    m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov     r3, r1
%endif
    and     r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov     r3, r2
    and     r2, 2
    jz .left1
    movq    m0, [r1]
    pshufb  m0, m2
    movq    [r0], m0
    add     r1, 8
    add     r0, 8
.left1:
    and     r3, 1
    jz .end
    mov     r2d, [r1]
    bswap   r2d
    mov     [r0], r2d
%else
    and     r2, 3
    jz .end
.loop2:
    mov     r3d, [r1]
    bswap   r3d
    mov     [r0], r3d
    add     r1, 4
    add     r0, 4
    dec     r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF