;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA
pb_f:                times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7:                times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1, 3, 3, 3, 3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1, 1, 1,-1,-1, 5, 5,-1,-1, 9, 9,-1,-1,13,13
pb_bswap32:          db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12

SECTION_TEXT

%macro SCALARPRODUCT 0
; int ff_scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
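; A minimal C sketch of what this kernel computes (illustrative only, written
; for this annotation; the unrolled loop handles mmsize int16 elements per
; iteration, so order is assumed to be a multiple of that, and the 32-bit
; accumulator is assumed not to overflow):
;
;   #include <stdint.h>
;   static int32_t scalarproduct_int16_ref(const int16_t *v1, const int16_t *v2,
;                                          int order)
;   {
;       int32_t res = 0;
;       while (order--)
;           res += *v1++ * *v2++;   /* pmaddwd accumulates pairs of products */
;       return res;
;   }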
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
    shl     orderq, 1
    add     v1q, orderq
    add     v2q, orderq
    neg     orderq
    pxor    m2, m2
.loop:
    movu    m0, [v1q + orderq]
    movu    m1, [v1q + orderq + mmsize]
    pmaddwd m0, [v2q + orderq]
    pmaddwd m1, [v2q + orderq + mmsize]
    paddd   m2, m0
    paddd   m2, m1
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m2
    paddd   m2, m0
    pshuflw m0, m2, 0x4e
%else
    pshufw  m0, m2, 0x4e
%endif
    paddd   m2, m0
    movd    eax, m2
%if mmsize == 8
    emms
%endif
    RET

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
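; A minimal C sketch of the combined operation (illustrative only; v1 is both
; read and updated, the return value uses the pre-update v1 values, and the
; madd part wraps in 16 bits just like pmullw/paddw):
;
;   #include <stdint.h>
;   static int32_t scalarproduct_and_madd_int16_ref(int16_t *v1, const int16_t *v2,
;                                                   const int16_t *v3, int order,
;                                                   int mul)
;   {
;       int32_t res = 0;
;       while (order--) {
;           res   += *v1 * *v2++;               /* dot-product term      */
;           *v1++ += (int16_t)(mul * *v3++);    /* v1[i] += mul * v3[i]  */
;       }
;       return res;
;   }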
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
%if mmsize == 16
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
%else
    pshufw  m7, m7, 0
%endif
    pxor    m6, m6
    add     v1q, orderq
    add     v2q, orderq
    add     v3q, orderq
    neg     orderq
.loop:
    movu    m0, [v2q + orderq]
    movu    m1, [v2q + orderq + mmsize]
    mova    m4, [v1q + orderq]
    mova    m5, [v1q + orderq + mmsize]
    movu    m2, [v3q + orderq]
    movu    m3, [v3q + orderq + mmsize]
    pmaddwd m0, m4
    pmaddwd m1, m5
    pmullw  m2, m7
    pmullw  m3, m7
    paddd   m6, m0
    paddd   m6, m1
    paddw   m2, m4
    paddw   m3, m5
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    add     orderq, mmsize*2
    jl .loop
%if mmsize == 16
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
%else
    pshufw  m0, m6, 0x4e
%endif
    paddd   m6, m0
    movd    eax, m6
    RET
%endmacro

INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
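
; Helper for the ssse3 version below: v2 and v3 are rounded down to a 16-byte
; boundary and their (shared, constant per call) misalignment %1 is undone with
; palignr on pairs of aligned loads, so the inner loop only issues aligned
; accesses. One copy of the loop is emitted for each possible byte offset
; (0..14, even only, since the data is int16).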
%macro SCALARPRODUCT_LOOP 1
align 16
.loop%1:
    sub     orderq, mmsize*2
%if %1
    mova    m1, m4
    mova    m4, [v2q + orderq]
    mova    m0, [v2q + orderq + mmsize]
    palignr m1, m0, %1
    palignr m0, m4, %1
    mova    m3, m5
    mova    m5, [v3q + orderq]
    mova    m2, [v3q + orderq + mmsize]
    palignr m3, m2, %1
    palignr m2, m5, %1
%else
    mova    m0, [v2q + orderq]
    mova    m1, [v2q + orderq + mmsize]
    mova    m2, [v3q + orderq]
    mova    m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
    mova    m8, t0
    mova    m9, t1
%define t0 m8
%define t1 m9
%endif
    pmaddwd m0, t0
    pmaddwd m1, t1
    pmullw  m2, m7
    pmullw  m3, m7
    paddw   m2, t0
    paddw   m3, t1
    paddd   m6, m0
    paddd   m6, m1
    mova    [v1q + orderq], m2
    mova    [v1q + orderq + mmsize], m3
    jg .loop%1
%if %1
    jmp .end
%endif
%endmacro

; int ff_scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3,
;                                     int order, int mul)
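; ssse3 version: v1 must be 16-byte aligned (it is accessed with mova); v2 and
; v3 may be misaligned, but they appear to be assumed to share the same
; misalignment, which is measured once (v2 & 15) and dispatched to the matching
; SCALARPRODUCT_LOOP instance.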
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
    shl     orderq, 1
    movd    m7, mulm
    pshuflw m7, m7, 0
    punpcklqdq m7, m7
    pxor    m6, m6
    mov     r4d, v2d
    and     r4d, 15
    and     v2q, ~15
    and     v3q, ~15
    mova    m4, [v2q + orderq]
    mova    m5, [v3q + orderq]
    ; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
    cmp     r4d, 0
    je .loop0
    cmp     r4d, 2
    je .loop2
    cmp     r4d, 4
    je .loop4
    cmp     r4d, 6
    je .loop6
    cmp     r4d, 8
    je .loop8
    cmp     r4d, 10
    je .loop10
    cmp     r4d, 12
    je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
    movhlps m0, m6
    paddd   m6, m0
    pshuflw m0, m6, 0x4e
    paddd   m6, m0
    movd    eax, m6
    RET

; void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
;                                           const uint8_t *diff, int w,
;                                           int *left, int *left_top)
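; Roughly the scalar behaviour, inferred from the asm (illustrative only;
; mid_pred_ref is a hypothetical helper returning the median of three values,
; and all per-pixel arithmetic wraps modulo 256, matching paddb/psubb):
;
;   #include <stdint.h>
;   static int mid_pred_ref(int a, int b, int c)
;   {
;       if (a > b) { int t = a; a = b; b = t; }    /* now a <= b          */
;       return c < a ? a : (c > b ? b : c);        /* clamp c into [a, b] */
;   }
;   static void add_hfyu_median_prediction_ref(uint8_t *dst, const uint8_t *top,
;                                              const uint8_t *diff, int w,
;                                              int *left, int *left_top)
;   {
;       int l = *left, lt = *left_top;
;       for (int i = 0; i < w; i++) {
;           int pred = mid_pred_ref(l, top[i], (l + top[i] - lt) & 0xFF);
;           l        = (pred + diff[i]) & 0xFF;    /* add residual */
;           lt       = top[i];
;           dst[i]   = l;
;       }
;       *left     = l;
;       *left_top = lt;
;   }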
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
    movq    mm0, [topq]
    movq    mm2, mm0
    movd    mm4, [left_topq]
    psllq   mm2, 8
    movq    mm1, mm0
    por     mm4, mm2
    movd    mm3, [leftq]
    psubb   mm0, mm4 ; t-tl
    add     dstq, wq
    add     topq, wq
    add     diffq, wq
    neg     wq
    jmp .skip
.loop:
    movq    mm4, [topq+wq]
    movq    mm0, mm4
    psllq   mm4, 8
    por     mm4, mm1
    movq    mm1, mm0 ; t
    psubb   mm0, mm4 ; t-tl
.skip:
    movq    mm2, [diffq+wq]
%assign i 0
%rep 8
    movq    mm4, mm0
    paddb   mm4, mm3 ; t-tl+l
    movq    mm5, mm3
    pmaxub  mm3, mm1
    pminub  mm5, mm1
    pminub  mm3, mm4
    pmaxub  mm3, mm5 ; median
    paddb   mm3, mm2 ; +residual
%if i==0
    movq    mm7, mm3
    psllq   mm7, 56
%else
    movq    mm6, mm3
    psrlq   mm7, 8
    psllq   mm6, 56
    por     mm7, mm6
%endif
%if i<7
    psrlq   mm0, 8
    psrlq   mm1, 8
    psrlq   mm2, 8
%endif
%assign i i+1
%endrep
    movq    [dstq+wq], mm7
    add     wq, 8
    jl .loop
    movzx   r2d, byte [dstq-1]
    mov     [leftq], r2d
    movzx   r2d, byte [topq-1]
    mov     [left_topq], r2d
    RET
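
; ADD_HFYU_LEFT_LOOP computes a byte-wise running (prefix) sum: within each
; register the partial sums are built in log2 steps (a byte shift plus paddb,
; then pshufb broadcasts of the accumulated lanes), and m0 carries the last
; summed byte of the previous register into the next iteration via pshufb.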
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
    add     srcq, wq
    add     dstq, wq
    neg     wq
%%.loop:
%if %2
    mova    m1, [srcq+wq]
%else
    movu    m1, [srcq+wq]
%endif
    mova    m2, m1
    psllw   m1, 8
    paddb   m1, m2
    mova    m2, m1
    pshufb  m1, m3
    paddb   m1, m2
    pshufb  m0, m5
    mova    m2, m1
    pshufb  m1, m4
    paddb   m1, m2
%if mmsize == 16
    mova    m2, m1
    pshufb  m1, m6
    paddb   m1, m2
%endif
    paddb   m0, m1
%if %1
    mova    [dstq+wq], m0
%else
    movq    [dstq+wq], m0
    movhps  [dstq+wq+8], m0
%endif
    add     wq, mmsize
    jl %%.loop
    mov     eax, mmsize-1
    sub     eax, wd
    movd    m1, eax
    pshufb  m0, m1
    movd    eax, m0
    RET
%endmacro

; int ff_add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src,
;                                 int w, int left)
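; A minimal C sketch of the operation (illustrative only; the running sum is
; modulo 256, matching paddb, and the last value is returned for the next call):
;
;   #include <stdint.h>
;   static int add_hfyu_left_prediction_ref(uint8_t *dst, const uint8_t *src,
;                                           int w, int left)
;   {
;       int acc = left;
;       for (int i = 0; i < w; i++) {
;           acc    = (acc + src[i]) & 0xFF;   /* running left-neighbour sum */
;           dst[i] = acc;
;       }
;       return acc;
;   }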
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
    mova    m5, [pb_7]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    psllq   m0, 56
    ADD_HFYU_LEFT_LOOP 1, 1

INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
    mova    m5, [pb_f]
    mova    m6, [pb_zzzzzzzz77777777]
    mova    m4, [pb_zzzz3333zzzzbbbb]
    mova    m3, [pb_zz11zz55zz99zzdd]
    movd    m0, leftm
    pslldq  m0, 15
    test    srcq, 15
    jnz .src_unaligned
    test    dstq, 15
    jnz .dst_unaligned
    ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
    ADD_HFYU_LEFT_LOOP 0, 0

;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
;                           int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
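; A minimal C sketch of the operation (illustrative only; the SIMD versions
; additionally require aligned buffers and a len that is a multiple of the
; per-iteration element count):
;
;   #include <stdint.h>
;   static void vector_clip_int32_ref(int32_t *dst, const int32_t *src,
;                                     int32_t min, int32_t max, unsigned int len)
;   {
;       for (unsigned int i = 0; i < len; i++) {
;           int32_t v = src[i];
;           dst[i] = v < min ? min : v > max ? max : v;
;       }
;   }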
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
    cvtsi2ss m4, minm
    cvtsi2ss m5, maxm
%else
    movd     m4, minm
    movd     m5, maxm
%endif
    SPLATD   m4
    SPLATD   m5
.loop:
%assign %%i 1
%rep %2
    mova     m0, [srcq+mmsize*0*%%i]
    mova     m1, [srcq+mmsize*1*%%i]
    mova     m2, [srcq+mmsize*2*%%i]
    mova     m3, [srcq+mmsize*3*%%i]
%if %3
    mova     m7, [srcq+mmsize*4*%%i]
    mova     m8, [srcq+mmsize*5*%%i]
    mova     m9, [srcq+mmsize*6*%%i]
    mova     m10, [srcq+mmsize*7*%%i]
%endif
    CLIPD    m0, m4, m5, m6
    CLIPD    m1, m4, m5, m6
    CLIPD    m2, m4, m5, m6
    CLIPD    m3, m4, m5, m6
%if %3
    CLIPD    m7, m4, m5, m6
    CLIPD    m8, m4, m5, m6
    CLIPD    m9, m4, m5, m6
    CLIPD    m10, m4, m5, m6
%endif
    mova     [dstq+mmsize*0*%%i], m0
    mova     [dstq+mmsize*1*%%i], m1
    mova     [dstq+mmsize*2*%%i], m2
    mova     [dstq+mmsize*3*%%i], m3
%if %3
    mova     [dstq+mmsize*4*%%i], m7
    mova     [dstq+mmsize*5*%%i], m8
    mova     [dstq+mmsize*6*%%i], m9
    mova     [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
    add      srcq, mmsize*4*(%2+%3)
    add      dstq, mmsize*4*(%2+%3)
    sub      lend, mmsize*(%2+%3)
    jg .loop
    REP_RET
%endmacro

INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
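
; Byte-swaps 32-bit words: the ssse3 path does it with a single pshufb against
; pb_bswap32; the sse2 path first swaps the 16-bit halves of each dword with
; pshuflw/pshufhw and then the bytes within each half with shifts and por.
; The macro handles 8 dwords per iteration, then a 4-dword tail.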
; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
    mov     r3, r2
    sar     r2, 3
    jz .left4_%1
.loop8_%1:
    mov%1   m0, [r1 +  0]
    mov%1   m1, [r1 + 16]
%if cpuflag(ssse3)
    pshufb  m0, m2
    pshufb  m1, m2
    mov%1   [r0 +  0], m0
    mov%1   [r0 + 16], m1
%else
    pshuflw m0, m0, 10110001b
    pshuflw m1, m1, 10110001b
    pshufhw m0, m0, 10110001b
    pshufhw m1, m1, 10110001b
    mova    m2, m0
    mova    m3, m1
    psllw   m0, 8
    psllw   m1, 8
    psrlw   m2, 8
    psrlw   m3, 8
    por     m2, m0
    por     m3, m1
    mov%1   [r0 +  0], m2
    mov%1   [r0 + 16], m3
%endif
    add     r0, 32
    add     r1, 32
    dec     r2
    jnz .loop8_%1
.left4_%1:
    mov     r2, r3
    and     r3, 4
    jz .left
    mov%1   m0, [r1]
%if cpuflag(ssse3)
    pshufb  m0, m2
    mov%1   [r0], m0
%else
    pshuflw m0, m0, 10110001b
    pshufhw m0, m0, 10110001b
    mova    m2, m0
    psllw   m0, 8
    psrlw   m2, 8
    por     m2, m0
    mov%1   [r0], m2
%endif
    add     r1, 16
    add     r0, 16
%endmacro

; void ff_bswap_buf(uint32_t *dst, const uint32_t *src, int w);
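; A minimal C sketch of the operation (illustrative only):
;
;   #include <stdint.h>
;   static void bswap_buf_ref(uint32_t *dst, const uint32_t *src, int w)
;   {
;       for (int i = 0; i < w; i++) {
;           uint32_t x = src[i];
;           dst[i] = (x >> 24) | ((x >> 8) & 0x0000ff00) |
;                    ((x << 8) & 0x00ff0000) | (x << 24);
;       }
;   }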
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
    mov     r3, r1
    mova    m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
    mov     r3, r1
%endif
    or      r3, r0
    and     r3, 15
    jz .start_align
    BSWAP_LOOPS u
    jmp .left
.start_align:
    BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
    mov     r3, r2
    and     r2, 2
    jz .left1
    movq    m0, [r1]
    pshufb  m0, m2
    movq    [r0], m0
    add     r1, 8
    add     r0, 8
.left1:
    and     r3, 1
    jz .end
    mov     r2d, [r1]
    bswap   r2d
    mov     [r0], r2d
%else
    and     r2, 3
    jz .end
.loop2:
    mov     r3d, [r1]
    bswap   r3d
    mov     [r0], r3d
    add     r1, 4
    add     r0, 4
    dec     r2
    jnz .loop2
%endif
.end:
    RET
%endmacro

INIT_XMM sse2
BSWAP32_BUF
INIT_XMM ssse3
BSWAP32_BUF