You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

852 lines
18KB

  1. ;*****************************************************************************
  2. ;* x86-optimized HEVC MC
  3. ;* Copyright 2015 Anton Khirnov
  4. ;*
  5. ;* This file is part of Libav.
  6. ;*
  7. ;* Libav is free software; you can redistribute it and/or
  8. ;* modify it under the terms of the GNU Lesser General Public
  9. ;* License as published by the Free Software Foundation; either
  10. ;* version 2.1 of the License, or (at your option) any later version.
  11. ;*
  12. ;* Libav is distributed in the hope that it will be useful,
  13. ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. ;* Lesser General Public License for more details.
  16. ;*
  17. ;* You should have received a copy of the GNU Lesser General Public
  18. ;* License along with Libav; if not, write to the Free Software
  19. ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. ;******************************************************************************
  21. %include "libavutil/x86/x86util.asm"
  22. SECTION_RODATA
  23. pw_1023: times 8 dw 1023
  24. cextern hevc_qpel_coeffs
  25. cextern hevc_qpel_coeffs8
  26. cextern hevc_epel_coeffs
  27. cextern hevc_epel_coeffs8
  28. cextern pw_8
  29. cextern pw_16
  30. cextern pw_32
  31. cextern pw_64
  32. SECTION .text
; Per-width/per-depth definitions shared by all the MC macros below.
; %1: width
; %2: bit depth
%macro COMMON_DEFS 2
    ; one "block" is 8 pixels -- one XMM register's worth of int16 output
    %assign blocksize 8
    %assign nb_blocks ((%1 + blocksize - 1) / blocksize)
    ; set when the width is not a multiple of 8, so the last block
    ; carries only 4 valid pixels and must use half-register moves
    %define last_block_truncated (blocksize * nb_blocks > %1)
%if %2 > 8
    ; 16-bit pixels: a block is 16 bytes
    %define LOAD_BLOCK     movu
    %define LOAD_HALFBLOCK movq
    %assign pixelsize 2
%else
    ; 8-bit pixels: a block is 8 bytes
    %define LOAD_BLOCK     movq
    %define LOAD_HALFBLOCK movd
    %assign pixelsize 1
%endif
    ; output is always int16, written to the (aligned) intermediate buffer
    %define STORE_BLOCK     mova
    %define STORE_HALFBLOCK movq
%endmacro
; Select the LOAD/STORE helpers for one block: the last block of a
; truncated row moves only half a register so it never reads or writes
; past the end of the row.
; %1: block index
%macro BLOCK_DEFS 1
%if last_block_truncated && %1 == nb_blocks - 1
    %define block_truncated 1
    %define LOAD  LOAD_HALFBLOCK
    %define STORE STORE_HALFBLOCK
%else
    %define block_truncated 0
    %define LOAD  LOAD_BLOCK
    %define STORE STORE_BLOCK
%endif
%endmacro
; hevc_get_pixels_<w>_<d>(int16_t *dst, ptrdiff_t dststride,
;                         pixel *src, ptrdiff_t srcstride,
;                         int height, int mx, int my, int *mcbuffer)
; Fullpel copy into the int16 intermediate buffer: widen the pixels to
; 16 bits and shift left so every bit depth lands in the same 14-bit
; fixed-point format.
; %1: block width
; %2: bit depth
; %3: log2 of height unroll (the loop body handles 2^%3 rows)
%macro GET_PIXELS 3
cglobal hevc_get_pixels_ %+ %1 %+ _ %+ %2, 5, 5, 2, dst, dststride, src, srcstride, height ; rest of the args unused

%assign shift 14 - %2

COMMON_DEFS %1, %2

%if pixelsize == 1
    pxor            m0, m0              ; zero register for byte->word unpack
%endif

    shr             heightd, %3         ; iterations = height / 2^%3
.loop:
%assign i 0
%rep (1 << %3)
    %assign j 0
    %rep nb_blocks
        BLOCK_DEFS j

        LOAD            m1, [srcq + j * pixelsize * blocksize]
%if pixelsize == 1
        punpcklbw       m1, m0          ; zero-extend u8 pixels to words
%endif
        psllw           m1, shift       ; normalize to 14-bit precision
        STORE           [dstq + j * 2 * blocksize], m1

        %assign j (j + 1)
    %endrep
    add             dstq, dststrideq
    add             srcq, srcstrideq
    %assign i (i + 1)
%endrep

    dec             heightd
    jg .loop
    RET
%endmacro
INIT_XMM sse2
; 8-bit
GET_PIXELS  4,  8, 1
GET_PIXELS  8,  8, 1
GET_PIXELS 12,  8, 3
GET_PIXELS 16,  8, 2
GET_PIXELS 24,  8, 3
GET_PIXELS 32,  8, 3
GET_PIXELS 48,  8, 3
GET_PIXELS 64,  8, 3
; 10-bit
GET_PIXELS  4, 10, 1
GET_PIXELS  8, 10, 1
GET_PIXELS 12, 10, 3
GET_PIXELS 16, 10, 2
GET_PIXELS 24, 10, 3
GET_PIXELS 32, 10, 3
GET_PIXELS 48, 10, 3
GET_PIXELS 64, 10, 3
; hevc_qpel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)
; 8-bit qpel interpolation: an 8-tap filter over taps [-3, +4].
; Adjacent taps are interleaved with punpcklbw so each pair is handled
; by one pmaddubsw against a broadcast coefficient pair.
; %1: block width
; %2: 0 - horizontal; 1 - vertical
%macro QPEL_8 2
%if %2
    %define postfix    v
    %define mvfrac     myq
    %define coeffsaddr r5q          ; aliases mx, unused in the vertical case
    %define pixstride  srcstrideq
    %define pixstride3 r5q          ; reuses r5 once the coeffs are loaded
    %define src_m3     r6q
%else
    %define postfix    h
    %define mvfrac     mxq
    %define coeffsaddr r6q          ; aliases my, unused in the horizontal case
    %define pixstride  1
    %define pixstride3 3
    %define src_m3     (srcq - 3)
%endif

COMMON_DEFS %1, 8

cglobal hevc_qpel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 7, dst, dststride, src, srcstride, height, mx, my
    and             mvfrac, 0x3         ; fractional MV position (1-3 here;
    dec             mvfrac              ; fullpel 0 uses get_pixels instead)
    shl             mvfrac, 4           ; 16 bytes of coefficients per position
    lea             coeffsaddr, [hevc_qpel_coeffs8]
    mova            m0, [coeffsaddr + mvfrac]
    ; broadcast each byte-pair of coefficients into its own register
    SPLATW          m1, m0, 1
    SPLATW          m2, m0, 2
    SPLATW          m3, m0, 3
    SPLATW          m0, m0, 0

%if %2
    lea             pixstride3, [srcstrideq + 2 * srcstrideq]
    mov             src_m3, srcq
    sub             src_m3, pixstride3  ; src_m3 = first (-3) filter tap
%endif

.loop:
    %assign i 0
    %rep nb_blocks
        BLOCK_DEFS i

        ; taps -3/-2
        LOAD            m4, [src_m3 + i * blocksize]
        LOAD            m5, [src_m3 + i * blocksize + 1 * pixstride]
        punpcklbw       m4, m5
        pmaddubsw       m4, m0
        ; taps -1/0
        LOAD            m5, [src_m3 + i * blocksize + 2 * pixstride]
        LOAD            m6, [srcq + i * blocksize]
        punpcklbw       m5, m6
        pmaddubsw       m5, m1
        paddsw          m4, m5
        ; taps +1/+2
        LOAD            m5, [srcq + i * blocksize + 1 * pixstride]
        LOAD            m6, [srcq + i * blocksize + 2 * pixstride]
        punpcklbw       m5, m6
        pmaddubsw       m5, m2
        paddsw          m4, m5
        ; taps +3/+4
        LOAD            m5, [srcq + i * blocksize + pixstride3]
        LOAD            m6, [srcq + i * blocksize + 4 * pixstride]
        punpcklbw       m5, m6
        pmaddubsw       m5, m3
        paddsw          m4, m5

        STORE           [dstq + i * 2 * blocksize], m4
        %assign i (i + 1)
    %endrep

    add             dstq, dststrideq
    add             srcq, srcstrideq
%if %2
    add             src_m3, srcstrideq
%endif
    dec             heightd
    jg .loop
    RET
%endmacro
INIT_XMM ssse3                          ; pmaddubsw requires SSSE3
; horizontal
QPEL_8  4, 0
QPEL_8  8, 0
QPEL_8 12, 0
QPEL_8 16, 0
QPEL_8 24, 0
QPEL_8 32, 0
QPEL_8 48, 0
QPEL_8 64, 0
; vertical
QPEL_8  4, 1
QPEL_8  8, 1
QPEL_8 12, 1
QPEL_8 16, 1
QPEL_8 24, 1
QPEL_8 32, 1
QPEL_8 48, 1
QPEL_8 64, 1
; 16-bit qpel interpolation: 8-tap filter over int16 input, accumulated
; in 32 bits with pmaddwd. Shared by the 10-bit h/v functions and by the
; second (vertical) pass of hv. Uses xmm8-xmm13 and named registers
; (coeffsreg, and sstride3/srcm3 for the vertical case) that the calling
; wrapper macro declares via cglobal -- x86-64 only.
; %1: block width
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
%macro QPEL_16 3
%if %3
    %define mvfrac     myq
    %define pixstride  srcstrideq
    %define pixstride3 sstride3q
    %define src_m3     srcm3q
%else
    %define mvfrac     mxq
    %define pixstride  2
    %define pixstride3 6
    %define src_m3     (srcq - 6)
%endif

COMMON_DEFS %1, 16

    and             mvfrac, 0x3         ; fractional MV position
    dec             mvfrac
    shl             mvfrac, 4           ; 16 bytes of coefficients per position
    lea             coeffsregq, [hevc_qpel_coeffs]
    mova            m0, [coeffsregq + mvfrac]
    ; broadcast each word-pair of coefficients into its own register
    pshufd          m1, m0, 0x55
    pshufd          m2, m0, 0xaa
    pshufd          m3, m0, 0xff
    pshufd          m0, m0, 0x00

%if %3
    lea             sstride3q, [srcstrideq + 2 * srcstrideq]
    mov             srcm3q, srcq
    sub             srcm3q, sstride3q   ; src_m3 = first (-3) filter tap
%endif

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i

    ; load all 8 taps (-3 .. +4)
    LOAD            m4,  [src_m3 + i * 2 * blocksize]
    LOAD            m5,  [src_m3 + i * 2 * blocksize + 1 * pixstride]
    LOAD            m6,  [src_m3 + i * 2 * blocksize + 2 * pixstride]
    LOAD            m7,  [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD            m8,  [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD            m9,  [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD            m10, [srcq + i * 2 * blocksize + pixstride3]
    LOAD            m11, [srcq + i * 2 * blocksize + 4 * pixstride]

    ; low halves: interleave tap pairs, multiply-accumulate in 32 bits
    punpcklwd       m12, m4, m5
    pmaddwd         m12, m0
    punpcklwd       m13, m6, m7
    pmaddwd         m13, m1
    paddd           m12, m13
    punpcklwd       m13, m8, m9
    pmaddwd         m13, m2
    paddd           m12, m13
    punpcklwd       m13, m10, m11
    pmaddwd         m13, m3
    paddd           m12, m13
    psrad           m12, %2

%if block_truncated == 0
    ; high halves, same computation
    punpckhwd       m4, m5
    pmaddwd         m4, m0
    punpckhwd       m6, m7
    pmaddwd         m6, m1
    paddd           m4, m6
    punpckhwd       m8, m9
    pmaddwd         m8, m2
    paddd           m4, m8
    punpckhwd       m10, m11
    pmaddwd         m10, m3
    paddd           m4, m10
    psrad           m4, %2
%endif
    ; truncated blocks store only the (valid) low half of the pack
    packssdw        m12, m4
    STORE           [dstq + i * 2 * blocksize], m12
    %assign i (i + 1)
%endrep

    add             dstq, dststrideq
    add             srcq, srcstrideq
%if %3
    add             srcm3q, srcstrideq
%endif
    dec             heightd
    jg .loop
    RET
%endmacro
%if ARCH_X86_64

; 10-bit horizontal qpel: declares the registers QPEL_16 expects
%macro QPEL_H_10 1
cglobal hevc_qpel_h_ %+ %1 %+ _10, 7, 9, 14, dst, dststride, src, srcstride, height, mx, my, mcbuffer, coeffsreg
QPEL_16 %1, 2, 0
%endmacro

INIT_XMM avx
QPEL_H_10  4
QPEL_H_10  8
QPEL_H_10 12
QPEL_H_10 16
QPEL_H_10 24
QPEL_H_10 32
QPEL_H_10 48
QPEL_H_10 64

; 10-bit vertical qpel
%macro QPEL_V_10 1
cglobal hevc_qpel_v_ %+ %1 %+ _10, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
QPEL_16 %1, 2, 1
%endmacro

INIT_XMM avx
QPEL_V_10  4
QPEL_V_10  8
QPEL_V_10 12
QPEL_V_10 16
QPEL_V_10 24
QPEL_V_10 32
QPEL_V_10 48
QPEL_V_10 64

; hevc_qpel_hv_<w>(int16_t *dst, ptrdiff_t dststride,
;                  uint8_t *src, ptrdiff_t srcstride,
;                  int height, int mx, int my, int *mcbuffer)
; second (vertical) pass of the 2-D interpolation; input is the int16
; output of the horizontal pass, hence the larger shift of 6
%macro QPEL_HV 1
cglobal hevc_qpel_hv_ %+ %1, 7, 10, 14, dst, dststride, src, srcstride, height, mx, my, sstride3, srcm3, coeffsreg
QPEL_16 %1, 6, 1
%endmacro

INIT_XMM avx
QPEL_HV  4
QPEL_HV  8
QPEL_HV 12
QPEL_HV 16
QPEL_HV 24
QPEL_HV 32
QPEL_HV 48
QPEL_HV 64

%endif ; ARCH_X86_64
; hevc_epel_h/v_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                     uint8_t *src, ptrdiff_t srcstride,
;                     int height, int mx, int my, int *mcbuffer)
; 8-bit epel (chroma) interpolation: a 4-tap filter over taps [-1, +2],
; tap pairs interleaved and handled with pmaddubsw.
; %1: block width
; %2: 0 - horizontal; 1 - vertical
%macro EPEL_8 2
%if %2
    %define postfix    v
    %define mvfrac     myq
    %define coeffsaddr r5q          ; aliases mx, unused in the vertical case
    %define pixstride  srcstrideq
    %define pixstride3 r5q          ; reuses r5 once the coeffs are loaded
%else
    %define postfix    h
    %define mvfrac     mxq
    %define coeffsaddr r6q          ; aliases my, unused in the horizontal case
    %define pixstride  1
    %define pixstride3 3
%endif

COMMON_DEFS %1, 8

cglobal hevc_epel_ %+ postfix %+ _ %+ %1 %+ _8, 7, 7, 6, dst, dststride, src, srcstride, height, mx, my
    and             mvfrac, 0x7         ; epel uses eighth-pel fractions
    dec             mvfrac
    shl             mvfrac, 4           ; 16 bytes of coefficients per position
    lea             coeffsaddr, [hevc_epel_coeffs8]
    movq            m0, [coeffsaddr + mvfrac]
    ; broadcast the two coefficient pairs
    SPLATW          m1, m0, 1
    SPLATW          m0, m0, 0

%if %2
    lea             pixstride3, [srcstrideq + 2 * srcstrideq]
%endif
    sub             srcq, pixstride     ; srcq now points at the -1 tap

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i

    LOAD            m2, [srcq + i * blocksize + 0 * pixstride]
    LOAD            m3, [srcq + i * blocksize + 1 * pixstride]
    LOAD            m4, [srcq + i * blocksize + 2 * pixstride]
    LOAD            m5, [srcq + i * blocksize + pixstride3]
    ; interleave tap pairs for pmaddubsw
    punpcklbw       m2, m3
    punpcklbw       m4, m5
    pmaddubsw       m2, m0
    pmaddubsw       m4, m1
    paddsw          m2, m4
    STORE           [dstq + i * 2 * blocksize], m2
    %assign i (i + 1)
%endrep

    add             dstq, dststrideq
    add             srcq, srcstrideq
    dec             heightd
    jg .loop
    RET
%endmacro
INIT_XMM ssse3                          ; pmaddubsw requires SSSE3
; horizontal (chroma widths only go up to 32)
EPEL_8  4, 0
EPEL_8  8, 0
EPEL_8 12, 0
EPEL_8 16, 0
EPEL_8 24, 0
EPEL_8 32, 0
; vertical
EPEL_8  4, 1
EPEL_8  8, 1
EPEL_8 12, 1
EPEL_8 16, 1
EPEL_8 24, 1
EPEL_8 32, 1
; 16-bit epel interpolation: 4-tap filter over int16 input, accumulated
; in 32 bits with pmaddwd. Shared by the 10-bit h/v functions and the
; second pass of hv; the calling wrapper declares the coeffsreg/sstride3
; registers via cglobal -- x86-64 only.
; %1: block width
; %2: shift applied to the result
; %3: 0 - horizontal; 1 - vertical
%macro EPEL_16 3
%if %3
    %define mvfrac     myq
    %define pixstride  srcstrideq
    %define pixstride3 sstride3q
%else
    %define mvfrac     mxq
    %define pixstride  2
    %define pixstride3 6
%endif

COMMON_DEFS %1, 16

    and             mvfrac, 0x7         ; eighth-pel fractions
    dec             mvfrac
    shl             mvfrac, 5           ; 32 bytes of dword coeffs per position
    lea             coeffsregq, [hevc_epel_coeffs]
    mova            m0, [coeffsregq + mvfrac]
    ; broadcast the two coefficient pairs
    pshufd          m1, m0, 0x55
    pshufd          m0, m0, 0x00

%if %3
    lea             sstride3q, [srcstrideq + 2 * srcstrideq]
%endif
    sub             srcq, pixstride     ; srcq now points at the -1 tap

.loop:
%assign i 0
%rep nb_blocks
    BLOCK_DEFS i

    LOAD            m2, [srcq + i * 2 * blocksize + 0 * pixstride]
    LOAD            m3, [srcq + i * 2 * blocksize + 1 * pixstride]
    LOAD            m4, [srcq + i * 2 * blocksize + 2 * pixstride]
    LOAD            m5, [srcq + i * 2 * blocksize + pixstride3]

    ; low halves: interleave tap pairs, multiply-accumulate in 32 bits
    punpcklwd       m6, m2, m3
    punpcklwd       m7, m4, m5
    pmaddwd         m6, m0
    pmaddwd         m7, m1
    paddd           m6, m7
    psrad           m6, %2
%if block_truncated == 0
    ; high halves, same computation
    punpckhwd       m2, m3
    punpckhwd       m4, m5
    pmaddwd         m2, m0
    pmaddwd         m4, m1
    paddd           m2, m4
    psrad           m2, %2
%endif
    ; truncated blocks store only the (valid) low half of the pack
    packssdw        m6, m2
    STORE           [dstq + i * 2 * blocksize], m6
    %assign i (i + 1)
%endrep

    add             dstq, dststrideq
    add             srcq, srcstrideq
    dec             heightd
    jg .loop
    RET
%endmacro
%if ARCH_X86_64

; 10-bit horizontal epel: declares the registers EPEL_16 expects
%macro EPEL_H_10 1
cglobal hevc_epel_h_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 2, 0
%endmacro

INIT_XMM avx
EPEL_H_10  4
EPEL_H_10  8
EPEL_H_10 12
EPEL_H_10 16
EPEL_H_10 24
EPEL_H_10 32

; 10-bit vertical epel
%macro EPEL_V_10 1
cglobal hevc_epel_v_ %+ %1 %+ _10, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 2, 1
%endmacro

INIT_XMM avx
EPEL_V_10  4
EPEL_V_10  8
EPEL_V_10 12
EPEL_V_10 16
EPEL_V_10 24
EPEL_V_10 32

; hevc_epel_hv_<w>_8(int16_t *dst, ptrdiff_t dststride,
;                    int16_t *src, ptrdiff_t srcstride,
;                    int height, int mx, int my, int *mcbuffer)
; second (vertical) pass of the 2-D interpolation; input is the int16
; output of the horizontal pass, hence the larger shift of 6
%macro EPEL_HV 1
cglobal hevc_epel_hv_ %+ %1, 8, 9, 8, dst, dststride, src, srcstride, height, mx, my, sstride3, coeffsreg
EPEL_16 %1, 6, 1
%endmacro

INIT_XMM avx
EPEL_HV  4
EPEL_HV  8
EPEL_HV 12
EPEL_HV 16
EPEL_HV 24
EPEL_HV 32

%endif ; ARCH_X86_64
; hevc_put_unweighted_pred_<w>_<d>(pixel *dst, ptrdiff_t dststride,
;                                  int16_t *src, ptrdiff_t srcstride,
;                                  int height)

; Optionally (when %3 is set) add the second source %2 into %1 with
; signed saturation. When only 4 samples remain in the row (%4 == 4)
; the memory operand covers just half a register, so it must be loaded
; with movq through the temporary %5 instead of being used directly
; (a full-width paddsw would read past the end of the buffer).
; %1: accumulator register
; %2: second-source memory operand
; %3: 0 - one source (no-op); 1 - two sources
; %4: number of samples remaining in the row
; %5: temporary register
%macro AVG 5
%if %3
%if %4 == 4
    movq            %5, %2
    paddsw          %1, %5
%else
    paddsw          %1, %2
%endif
%endif
%endmacro
  505. ; %1: 0 - one source; 1 - two sources
  506. ; %2: width
  507. ; %3: bit depth
  508. %macro PUT_PRED 3
  509. %if %1
  510. cglobal hevc_put_unweighted_pred_avg_ %+ %2 %+ _ %+ %3, 6, 6, 4, dst, dststride, src, src2, srcstride, height
  511. %else
  512. cglobal hevc_put_unweighted_pred_ %+ %2 %+ _ %+ %3, 5, 5, 4, dst, dststride, src, srcstride, height
  513. %endif
  514. %assign shift 14 + %1 - %3
  515. %assign offset (1 << (shift - 1))
  516. %define offset_data pw_ %+ offset
  517. mova m0, [offset_data]
  518. %if %3 > 8
  519. %define STORE_BLOCK movu
  520. %define STORE_HALF movq
  521. %assign pixel_max ((1 << %3) - 1)
  522. %define pw_pixel_max pw_ %+ pixel_max
  523. pxor m1, m1
  524. mova m2, [pw_pixel_max]
  525. %else
  526. %define STORE_BLOCK movq
  527. %define STORE_HALF movd
  528. %endif
  529. .loop:
  530. %assign i 0
  531. %rep (%2 + 7) / 8
  532. %if (i + 1) * 8 > %2
  533. %define LOAD movq
  534. %define STORE STORE_HALF
  535. %else
  536. %define LOAD mova
  537. %define STORE STORE_BLOCK
  538. %endif
  539. LOAD m3, [srcq + 16 * i]
  540. AVG m3, [src2q + 16 * i], %1, %3 - i * 8, m4
  541. paddsw m3, m0
  542. psraw m3, shift
  543. %if %3 == 8
  544. packuswb m3, m3
  545. STORE [dstq + 8 * i], m3
  546. %else
  547. CLIPW m3, m1, m2
  548. STORE [dstq + 16 * i], m3
  549. %endif
  550. %assign i (i + 1)
  551. %endrep
  552. add dstq, dststrideq
  553. add srcq, srcstrideq
  554. %if %1
  555. add src2q, srcstrideq
  556. %endif
  557. dec heightd
  558. jg .loop
  559. RET
  560. %endmacro
INIT_XMM sse2
; 8-bit
PUT_PRED 0,  4, 8
PUT_PRED 1,  4, 8
PUT_PRED 0,  8, 8
PUT_PRED 1,  8, 8
PUT_PRED 0, 12, 8
PUT_PRED 1, 12, 8
PUT_PRED 0, 16, 8
PUT_PRED 1, 16, 8
PUT_PRED 0, 24, 8
PUT_PRED 1, 24, 8
PUT_PRED 0, 32, 8
PUT_PRED 1, 32, 8
PUT_PRED 0, 48, 8
PUT_PRED 1, 48, 8
PUT_PRED 0, 64, 8
PUT_PRED 1, 64, 8
; 10-bit
PUT_PRED 0,  4, 10
PUT_PRED 1,  4, 10
PUT_PRED 0,  8, 10
PUT_PRED 1,  8, 10
PUT_PRED 0, 12, 10
PUT_PRED 1, 12, 10
PUT_PRED 0, 16, 10
PUT_PRED 1, 16, 10
PUT_PRED 0, 24, 10
PUT_PRED 1, 24, 10
PUT_PRED 0, 32, 10
PUT_PRED 1, 32, 10
PUT_PRED 0, 48, 10
PUT_PRED 1, 48, 10
PUT_PRED 0, 64, 10
PUT_PRED 1, 64, 10
; Weighted prediction:
;   dst = clip((src0 * w0 [+ src1 * w1] + combined offset) >> shift)
; where shift = denom + 14 + %1 - bit depth. Processes 4 samples per
; iteration in 32-bit precision (pmovsxwd/pmulld need SSE4); the bipred
; variant uses 11 GPRs, so these are instantiated for x86-64 only.
; %1: 0 - one source; 1 - two sources
; %2: width
; %3: bit depth
%macro PUT_WEIGHTED_PRED 3
%if %1
cglobal hevc_put_weighted_pred_avg_ %+ %2 %+ _ %+ %3, 11, 11, 8, denom, weight0, weight1, offset0, offset1, dst, dststride, src0, src1, srcstride, height
%else
cglobal hevc_put_weighted_pred_ %+ %2 %+ _ %+ %3, 8, 8, 8, denom, weight0, offset0, dst, dststride, src0, srcstride, height
%endif
    and             denomd, 0xff        ; denom arrives as an 8-bit value
    ; weights and offsets are int16 -- sign-extend to 32 bits
    movsx           weight0d, weight0w
    movsx           offset0d, offset0w
%if %1
    movsx           weight1d, weight1w
    movsx           offset1d, offset1w
%endif

    add             denomd, 14 + %1 - %3 ; total right shift
    movd            m0, denomd           ; shift count for pslld/psrad

%if %3 > 8
    %assign pixel_max ((1 << %3) - 1)
    %define pw_pixel_max pw_ %+ pixel_max
    pxor            m4, m4              ; clip minimum
    mova            m5, [pw_pixel_max]  ; clip maximum
    ; scale the 8-bit-range offsets up to the actual bit depth
    shl             offset0d, %3 - 8
%if %1
    shl             offset1d, %3 - 8
%endif
%endif

    ; fold the offset(s) and the rounding bias into one constant:
    ; m1 = ((sum of offsets + 1) << shift) >> 1
    ;    = offsets scaled by 2^shift (halved for bipred) + 2^(shift-1)
%if %1
    lea             offset0d, [offset0d + offset1d + 1]
%else
    lea             offset0d, [2 * offset0d + 1]
%endif
    movd            m1, offset0d
    SPLATD          m1
    pslld           m1, m0
    psrad           m1, 1

    movd            m2, weight0d
    SPLATD          m2
%if %1
    movd            m3, weight1d
    SPLATD          m3
%endif

.loop:
%assign i 0
%rep (%2 + 3) / 4
    ; 4 samples at a time, widened to 32 bits
    pmovsxwd        m6, [src0q + 8 * i]
    pmulld          m6, m2
%if %1
    pmovsxwd        m7, [src1q + 8 * i]
    pmulld          m7, m3
    paddd           m6, m7
%endif
    paddd           m6, m1              ; add combined offset + rounding
    psrad           m6, m0
    packssdw        m6, m6
%if %3 > 8
    CLIPW           m6, m4, m5
    movq            [dstq + 8 * i], m6
%else
    packuswb        m6, m6              ; clips to [0, 255]
    movd            [dstq + 4 * i], m6
%endif
    %assign i (i + 1)
%endrep

    add             dstq, dststrideq
    add             src0q, srcstrideq
%if %1
    add             src1q, srcstrideq
%endif
    dec             heightd
    jg .loop
    RET
%endmacro
%if ARCH_X86_64
INIT_XMM sse4                           ; pmovsxwd/pmulld require SSE4
; 8-bit
PUT_WEIGHTED_PRED 0,  4, 8
PUT_WEIGHTED_PRED 1,  4, 8
PUT_WEIGHTED_PRED 0,  8, 8
PUT_WEIGHTED_PRED 1,  8, 8
PUT_WEIGHTED_PRED 0, 12, 8
PUT_WEIGHTED_PRED 1, 12, 8
PUT_WEIGHTED_PRED 0, 16, 8
PUT_WEIGHTED_PRED 1, 16, 8
PUT_WEIGHTED_PRED 0, 24, 8
PUT_WEIGHTED_PRED 1, 24, 8
PUT_WEIGHTED_PRED 0, 32, 8
PUT_WEIGHTED_PRED 1, 32, 8
PUT_WEIGHTED_PRED 0, 48, 8
PUT_WEIGHTED_PRED 1, 48, 8
PUT_WEIGHTED_PRED 0, 64, 8
PUT_WEIGHTED_PRED 1, 64, 8
; 10-bit
PUT_WEIGHTED_PRED 0,  4, 10
PUT_WEIGHTED_PRED 1,  4, 10
PUT_WEIGHTED_PRED 0,  8, 10
PUT_WEIGHTED_PRED 1,  8, 10
PUT_WEIGHTED_PRED 0, 12, 10
PUT_WEIGHTED_PRED 1, 12, 10
PUT_WEIGHTED_PRED 0, 16, 10
PUT_WEIGHTED_PRED 1, 16, 10
PUT_WEIGHTED_PRED 0, 24, 10
PUT_WEIGHTED_PRED 1, 24, 10
PUT_WEIGHTED_PRED 0, 32, 10
PUT_WEIGHTED_PRED 1, 32, 10
PUT_WEIGHTED_PRED 0, 48, 10
PUT_WEIGHTED_PRED 1, 48, 10
PUT_WEIGHTED_PRED 0, 64, 10
PUT_WEIGHTED_PRED 1, 64, 10
%endif ; ARCH_X86_64