;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*                    Kieran Kunhya <kieran@kunhya.com>
;* (c) 2020 Nelson Gomez <nelson.gomez@microsoft.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

minshort:           times 8 dw 0x8000
yuv2yuvX_16_start:  times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start:  times 4 dd 0x10000
yuv2yuvX_9_start:   times 4 dd 0x20000
yuv2yuvX_10_upper:  times 8 dw 0x3ff
yuv2yuvX_9_upper:   times 8 dw 0x1ff
pd_4:               times 4 dd 4
pd_4min0x40000:     times 4 dd 4 - (0x40000)
pw_16:              times 8 dw 16
pw_32:              times 8 dw 32
pd_255:             times 8 dd 255
pw_512:             times 8 dw 512
pw_1024:            times 8 dw 1024
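
; Byte-shuffle masks used by the AVX2 yuv2nv12cX functions below: within each
; 128-bit lane, they pick the low byte of every dword (the clipped chroma
; samples) and pack those into the bottom 4 bytes of the lane; the NV21 mask
; swaps the U/V order. The permute mask then moves the two lanes' 4-byte
; results next to each other.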
yuv2nv12_shuffle_mask: times 2 db  0,  4,  8, 12, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1
yuv2nv21_shuffle_mask: times 2 db  4,  0, 12,  8, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1, \
                                  -1, -1, -1, -1
yuv2nv12_permute_mask: dd 0, 4, 1, 2, 3, 5, 6, 7
SECTION .text

;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
;                                     const int16_t **src, uint8_t *dst, int dstW,
;                                     const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15 bits in int16_t if $output_size is [8,10] and 19 bits in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
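
; As a guide to the arithmetic below, here is a scalar C sketch of the 8-bit
; yuv2planeX case (an illustration matching this file's shifts, not a quote of
; the C reference; av_clip_uint8 is FFmpeg's clip-to-[0,255] helper):
;
;     for (int i = 0; i < dstW; i++) {
;         int val = dither[(i + offset) & 7] << 12;  // dither, biased to 27 bits
;         for (int j = 0; j < filterSize; j++)
;             val += src[j][i] * filter[j];          // 15-bit x 12-bit products
;         dst[i] = av_clip_uint8(val >> 19);         // 27 -> 8 bits
;     }
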
%macro yuv2planeX_mainloop 2
.pixelloop_%2:
%assign %%i 0
    ; the rep here is for the 8-bit output MMX case, where dither covers
    ; 8 pixels but we can only handle 2 pixels per register, and thus 4
    ; pixels per iteration. In order to not have to keep track of where
    ; we are w.r.t. dithering, we unroll the MMX/8-bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif

%rep %%repcnt

%if %1 == 8
%if ARCH_X86_32
    mova            m2, [rsp+mmsize*(0+%%i)]
    mova            m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
    mova            m2, m8
    mova            m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
    mova            m1, [yuv2yuvX_%1_start]
    mova            m2, m1
%endif ; %1 == 8/9/10/16
    movsx           cntr_reg, fltsizem
.filterloop_%2_ %+ %%i:
    ; input pixels
    mov             r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
    mova            m3, [r6+r5*4]
    mova            m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
    mov             r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
    mova            m4, [r6+r5*4]
    mova            m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
    mova            m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16

    ; coefficients
    movd            m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
    pshuflw         m7, m0, 0                  ; coeff[0]
    pshuflw         m0, m0, 0x55               ; coeff[1]
    pmovsxwd        m7, m7                     ; word -> dword
    pmovsxwd        m0, m0                     ; word -> dword
    pmulld          m3, m7
    pmulld          m5, m7
    pmulld          m4, m0
    pmulld          m6, m0
    paddd           m2, m3
    paddd           m1, m5
    paddd           m2, m4
    paddd           m1, m6
%else ; %1 == 10/9/8
    punpcklwd       m5, m3, m4
    punpckhwd       m3, m4
    SPLATD          m0
    pmaddwd         m5, m0
    pmaddwd         m3, m0
    paddd           m2, m5
    paddd           m1, m3
%endif ; %1 == 8/9/10/16
    sub             cntr_reg, 2
    jg              .filterloop_%2_ %+ %%i

%if %1 == 16
    psrad           m2, 31 - %1
    psrad           m1, 31 - %1
%else ; %1 == 10/9/8
    psrad           m2, 27 - %1
    psrad           m1, 27 - %1
%endif ; %1 == 8/9/10/16

%if %1 == 8
    packssdw        m2, m1
    packuswb        m2, m2
    movh            [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
    packssdw        m2, m1
    paddw           m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
    packusdw        m2, m1
%else ; mmxext/sse2
    packssdw        m2, m1
    pmaxsw          m2, m6
%endif ; mmxext/sse2/sse4/avx
    pminsw          m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
    mov%2           [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16

    add             r5, mmsize/2
    sub             wd, mmsize/2
%assign %%i %%i+2
%endrep
    jg              .pixelloop_%2
%endmacro
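
; %1=output bit depth, %2=number of XMM registers for cglobal, %3=number of
; named arguments to load (dither/offset are only consumed by the 8-bit case)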
%macro yuv2planeX_fn 3
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif

cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
    pxor            m6, m6
%endif ; %1 == 8/9/10

%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
    SUB             rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32

    ; create registers holding dither
    movq            m_dith, [ditherq] ; dither
    test            offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m_dith, m_dith
%endif ; mmsize == 16
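    ; $offset is either 0 or 3 here; rotate the 8 dither bytes by 3 positions
    ; so the dither phase matches the output offset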
    PALIGNR         m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
    punpcklbw       m_dith, m6
%if ARCH_X86_64
    punpcklwd       m8, m_dith, m6
    pslld           m8, 12
%else ; x86-32
    punpcklwd       m5, m_dith, m6
    pslld           m5, 12
%endif ; x86-32/64
    punpckhwd       m_dith, m6
    pslld           m_dith, 12
%if ARCH_X86_32
    mova            [rsp+ 0], m5
    mova            [rsp+16], m_dith
%endif
%else ; mmsize == 8
    punpcklbw       m5, m_dith, m6
    punpckhbw       m_dith, m6
    punpcklwd       m4, m5, m6
    punpckhwd       m5, m6
    punpcklwd       m3, m_dith, m6
    punpckhwd       m_dith, m6
    pslld           m4, 12
    pslld           m5, 12
    pslld           m3, 12
    pslld           m_dith, 12
    mova            [rsp+ 0], m4
    mova            [rsp+ 8], m5
    mova            [rsp+16], m3
    mova            [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8

    xor             r5, r5

%if mmsize == 8 || %1 == 8
    yuv2planeX_mainloop %1, a
%else ; mmsize == 16
    test            dstq, 15
    jnz             .unaligned
    yuv2planeX_mainloop %1, a
    REP_RET
.unaligned:
    yuv2planeX_mainloop %1, u
%endif ; mmsize == 8/16

%if %1 == 8
%if ARCH_X86_32
    ADD             rsp, pad
    RET
%else ; x86-64
    REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
    REP_RET
%endif ; %1 == 8/9/10/16
%endmacro

%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn  8,  0, 7
yuv2planeX_fn  9,  0, 5
yuv2planeX_fn 10,  0, 5
%endif

INIT_XMM sse2
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5

INIT_XMM sse4
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
yuv2planeX_fn 16,  8, 5

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn  8, 10, 7
yuv2planeX_fn  9,  7, 5
yuv2planeX_fn 10,  7, 5
%endif

; %1=output-bpc, %2=alignment (u/a)
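; As a scalar guide (matching the 8-bit shift below, not quoted from the C
; reference), yuv2plane1 computes roughly:
;
;     for (int i = 0; i < dstW; i++)
;         dst[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7);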
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m3, [srcq+wq*2+mmsize*1]
    psraw           m0, 7
    psraw           m1, 7
    packuswb        m0, m1
    mov%2           [dstq+wq], m0
%elif %1 == 16
    paddd           m0, m4, [srcq+wq*4+mmsize*0]
    paddd           m1, m4, [srcq+wq*4+mmsize*1]
    paddd           m2, m4, [srcq+wq*4+mmsize*2]
    paddd           m3, m4, [srcq+wq*4+mmsize*3]
    psrad           m0, 3
    psrad           m1, 3
    psrad           m2, 3
    psrad           m3, 3
%if cpuflag(sse4) ; avx/sse4
    packusdw        m0, m1
    packusdw        m2, m3
%else ; mmx/sse2
    packssdw        m0, m1
    packssdw        m2, m3
    paddw           m0, m5
    paddw           m2, m5
%endif ; mmx/sse2/sse4/avx
    mov%2           [dstq+wq*2+mmsize*0], m0
    mov%2           [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
    paddsw          m0, m2, [srcq+wq*2+mmsize*0]
    paddsw          m1, m2, [srcq+wq*2+mmsize*1]
    psraw           m0, 15 - %1
    psraw           m1, 15 - %1
    pmaxsw          m0, m4
    pmaxsw          m1, m4
    pminsw          m0, m3
    pminsw          m1, m3
    mov%2           [dstq+wq*2+mmsize*0], m0
    mov%2           [dstq+wq*2+mmsize*1], m1
%endif
    add             wq, mmsize
    jl              .loop_%2
%endmacro
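
; %1=output bit depth, %2=number of XMM registers, %3=number of arguments
; (also the GPR count passed to cglobal)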
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
    movsxdifnidn    wq, wd
    add             wq, mmsize - 1
    and             wq, ~(mmsize - 1)
%if %1 == 8
    add             dstq, wq
%else ; %1 != 8
    lea             dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
    lea             srcq, [srcq+wq*4]
%else ; %1 != 16
    lea             srcq, [srcq+wq*2]
%endif ; %1 == 16
    neg             wq

%if %1 == 8
    pxor            m4, m4            ; zero
    ; create registers holding dither
    movq            m3, [ditherq]     ; dither
    test            offsetd, offsetd
    jz              .no_rot
%if mmsize == 16
    punpcklqdq      m3, m3
%endif ; mmsize == 16
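    ; as in yuv2planeX_fn above, rotate the dither bytes by 3 when offset
    ; (0 or 3) is non-zero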
    PALIGNR         m3, m3, 3, m2
.no_rot:
%if mmsize == 8
    mova            m2, m3
    punpckhbw       m3, m4            ; byte->word
    punpcklbw       m2, m4            ; byte->word
%else
    punpcklbw       m3, m4
    mova            m2, m3
%endif
%elif %1 == 9
    pxor            m4, m4
    mova            m3, [pw_512]
    mova            m2, [pw_32]
%elif %1 == 10
    pxor            m4, m4
    mova            m3, [pw_1024]
    mova            m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
    mova            m4, [pd_4]
%else ; mmx/sse2
    mova            m4, [pd_4min0x40000]
    mova            m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..

    ; actual pixel scaling
%if mmsize == 8
    yuv2plane1_mainloop %1, a
%else ; mmsize == 16
    test            dstq, 15
    jnz             .unaligned
    yuv2plane1_mainloop %1, a
    REP_RET
.unaligned:
    yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
    REP_RET
%endmacro

%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn  8, 0, 5
yuv2plane1_fn 16, 0, 3

INIT_MMX mmxext
yuv2plane1_fn  9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif

INIT_XMM sse2
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3

INIT_XMM sse4
yuv2plane1_fn 16, 5, 3

%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn  8, 5, 5
yuv2plane1_fn  9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif

%undef movsx

;-----------------------------------------------------------------------------
; AVX2 yuv2nv12cX implementation
;
; void ff_yuv2nv12cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;
; void ff_yuv2nv21cX_avx2(enum AVPixelFormat format, const uint8_t *dither,
;                         const int16_t *filter, int filterSize,
;                         const int16_t **u, const int16_t **v,
;                         uint8_t *dst, int dstWidth)
;-----------------------------------------------------------------------------
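
; A scalar sketch of the loop below (illustrative only; dither_u()/dither_v()
; are stand-ins for the 8-byte dither pattern spread across ditherLo/ditherHi,
; and av_clip_uint8 clips to [0,255]):
;
;     for (int i = 0; i < dstWidth; i++) {
;         int U = dither_u(i) << 12, V = dither_v(i) << 12;
;         for (int j = 0; j < filterSize; j++) {
;             U += u[j][i] * filter[j];
;             V += v[j][i] * filter[j];
;         }
;         dst[2 * i]     = av_clip_uint8(U >> 19);  // NV21 swaps U and V
;         dst[2 * i + 1] = av_clip_uint8(V >> 19);
;     }
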
%if ARCH_X86_64
%macro yuv2nv12cX_fn 1
cglobal %1cX, 8, 11, 13, tmp1, dither, filter, filterSize, u, v, dst, dstWidth
    mov             tmp1q, qword [ditherq]
    movq            xm0, tmp1q
    ror             tmp1q, 24
    movq            xm1, tmp1q

    pmovzxbd        m0, xm0
    pslld           m0, m0, 12        ; ditherLo
    pmovzxbd        m1, xm1
    pslld           m1, m1, 12        ; ditherHi

    pxor            m9, m9            ; uint8_min dwords
    mova            m10, [pd_255]     ; uint8_max dwords
    mova            m11, [%1_shuffle_mask]       ; shuffle_mask
    mova            m12, [yuv2nv12_permute_mask] ; permute mask

    DEFINE_ARGS tmp1, tmp2, filter, filterSize, u, v, dst, dstWidth

    xor             r8q, r8q

nv12_outer_%1:
    mova            m2, m0            ; resultLo
    mova            m3, m1            ; resultHi
    xor             r9q, r9q

nv12_inner_%1:
    movsx           r10d, word [filterq + (2 * r9q)]
    movd            xm4, r10d
    vpbroadcastd    m4, xm4           ; filter

    mov             tmp1q, [uq + (gprsize * r9q)]
    mova            xm7, oword [tmp1q + 2 * r8q]
    mov             tmp2q, [vq + (gprsize * r9q)]
    mova            xm8, oword [tmp2q + 2 * r8q]

    punpcklwd       xm5, xm7, xm8
    pmovsxwd        m5, xm5           ; multiplicandsLo
    punpckhwd       xm6, xm7, xm8
    pmovsxwd        m6, xm6           ; multiplicandsHi

    pmulld          m7, m5, m4        ; mulResultLo
    pmulld          m8, m6, m4        ; mulResultHi
    paddd           m2, m2, m7        ; resultLo += mulResultLo
    paddd           m3, m3, m8        ; resultHi += mulResultHi

    inc             r9d
    cmp             r9d, filterSized
    jl              nv12_inner_%1
    ; end of inner loop

    psrad           m2, m2, 19
    psrad           m3, m3, 19

    ; Vectorized av_clip_uint8
    pmaxsd          m2, m2, m9
    pmaxsd          m3, m3, m9
    pminsd          m2, m2, m10
    pminsd          m3, m3, m10

    ; At this point we have clamped uint8s arranged in this order:
    ;     m2: u1  0  0  0  v1  0  0  0  [...]
    ;     m3: u5  0  0  0  v5  0  0  0  [...]
    ;
    ; First, we shuffle the bytes to make the bytes semi-contiguous.
    ; AVX-2 doesn't have cross-lane shuffling, so we'll end up with:
    ;     m2: u1 v1 u2 v2  0  0  0  0  0  0  0  0  u3 v3 u4 v4
    ;     m3: u5 v5 u6 v6  0  0  0  0  0  0  0  0  u7 v7 u8 v8
    pshufb          m2, m2, m11
    pshufb          m3, m3, m11

    ; To fix the cross-lane shuffling issue, we'll then use cross-lane
    ; permutation to combine the two segments
    vpermd          m2, m12, m2
    vpermd          m3, m12, m3

    ; Now we have the final results in the lower 8 bytes of each register
    movq            [dstq], xm2
    movq            [dstq + 8], xm3

    add             r8d, 8
    add             dstq, 16
    cmp             r8d, dstWidthd
    jl              nv12_outer_%1
    RET
%endmacro

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
yuv2nv12cX_fn yuv2nv12
yuv2nv12cX_fn yuv2nv21
%endif
%endif ; ARCH_X86_64