; /*
; * Provide SSE luma and chroma mc functions for HEVC decoding
; * Copyright (c) 2013 Pierre-Edouard LEPERE
; *
; * This file is part of FFmpeg.
; *
; * FFmpeg is free software; you can redistribute it and/or
; * modify it under the terms of the GNU Lesser General Public
; * License as published by the Free Software Foundation; either
; * version 2.1 of the License, or (at your option) any later version.
; *
; * FFmpeg is distributed in the hope that it will be useful,
; * but WITHOUT ANY WARRANTY; without even the implied warranty of
; * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
; * Lesser General Public License for more details.
; *
; * You should have received a copy of the GNU Lesser General Public
; * License along with FFmpeg; if not, write to the Free Software
; * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
; */
%include "libavutil/x86/x86util.asm"

SECTION_RODATA
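; The pw_* words below are pmulhrsw multipliers: pmulhrsw computes
; (x*c + (1 << 14)) >> 15, so c = 1 << (15-s) yields a rounded right shift
; by s. pw_<bitd> = 1 << (bitd+1) shifts by 14-bitd (uni output rounding);
; pw_bi_<bitd> = 1 << bitd shifts by 14-bitd+1, the extra bit accounting
; for the addition of the two sources in the bi path.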
pw_8:          times 8 dw (1 <<  9)
pw_10:         times 8 dw (1 << 11)
pw_12:         times 8 dw (1 << 13)
pw_bi_8:       times 8 dw (1 <<  8)
pw_bi_10:      times 8 dw (1 << 10)
pw_bi_12:      times 8 dw (1 << 12)
max_pixels_10: times 8 dw ((1 << 10)-1)
max_pixels_12: times 8 dw ((1 << 12)-1)
zero:          times 4 dd 0
one_per_32:    times 4 dd 1

SECTION .text
%macro EPEL_TABLE 4
hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
                        times %2 d%3 10, -2
                        times %2 d%3 -4, 54
                        times %2 d%3 16, -2
                        times %2 d%3 -6, 46
                        times %2 d%3 28, -4
                        times %2 d%3 -4, 36
                        times %2 d%3 36, -4
                        times %2 d%3 -4, 28
                        times %2 d%3 46, -6
                        times %2 d%3 -2, 16
                        times %2 d%3 54, -4
                        times %2 d%3 -2, 10
                        times %2 d%3 58, -2
%endmacro

EPEL_TABLE  8, 8, b, sse4
EPEL_TABLE 10, 4, w, sse4
EPEL_TABLE 12, 4, w, sse4
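; Each EPEL table row holds one 4-tap chroma filter split into interleaved
; coefficient pairs: (c1,c2) broadcast across one 16-byte vector and (c3,c4)
; across the next, so a single pmaddubsw/pmaddwd against pairwise-interleaved
; pixels produces x1*c1+x2*c2 (resp. x3*c3+x4*c4) in each lane. One
; fractional position therefore occupies 32 bytes.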
%macro QPEL_TABLE 4
hevc_qpel_filters_%4_%1 times %2 d%3 -1, 4
                        times %2 d%3 -10, 58
                        times %2 d%3 17, -5
                        times %2 d%3 1, 0
                        times %2 d%3 -1, 4
                        times %2 d%3 -11, 40
                        times %2 d%3 40, -11
                        times %2 d%3 4, -1
                        times %2 d%3 0, 1
                        times %2 d%3 -5, 17
                        times %2 d%3 58, -10
                        times %2 d%3 4, -1
%endmacro

QPEL_TABLE  8, 8, b, sse4
QPEL_TABLE 10, 4, w, sse4
QPEL_TABLE 12, 4, w, sse4

%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
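; The QPEL tables use the same pair-interleaved layout with four pairs per
; 8-tap filter (64 bytes per fractional position); the third filter mirrors
; the first. The _14 alias presumably exists because the second pass of the
; hv path operates on 14-bit intermediates and can reuse the 10-bit word
; coefficients unchanged.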
%if ARCH_X86_64

%macro SIMPLE_BILOAD 4   ; width, tab, r1, r2
%if %1 <= 4
    movd       %3, [%2]      ; load data from source2
%elif %1 <= 8
    movdqa     %3, [%2]      ; load data from source2
%elif %1 <= 12
    movdqa     %3, [%2]      ; load data from source2
    movq       %4, [%2+16]   ; load data from source2
%else
    movdqa     %3, [%2]      ; load data from source2
    movdqa     %4, [%2+16]   ; load data from source2
%endif
%endmacro

%macro SIMPLE_LOAD 4     ; width, bitd, tab, r1
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movd       %4, [%3]      ; load data from source
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movq       %4, [%3]      ; load data from source
%else
    movdqu     %4, [%3]      ; load data from source
%endif
%endmacro

%macro SIMPLE_8LOAD 5    ; width, bitd, tab, r1, r2
%if %1 == 2 || (%2 == 8 && %1 <= 4)
    movq       %4, [%3]      ; load data from source2
%elif %1 == 4 || (%2 == 8 && %1 <= 8)
    movdqa     %4, [%3]      ; load data from source2
%elif %1 <= 12
    movdqa     %4, [%3]      ; load data from source2
    movq       %5, [%3+16]   ; load data from source2
%else
    movdqa     %4, [%3]      ; load data from source2
    movdqa     %5, [%3+16]   ; load data from source2
%endif
%endmacro
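; These load helpers pick the narrowest load that covers width elements:
; src2/dst-style buffers are int16_t (2*width bytes, spilling into a second
; register past 8 elements), while 8-bit sources need only width bytes.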
%macro EPEL_FILTER 2-4   ; bit depth, filter index[, coeff reg1, coeff reg2]
%ifdef PIC
    lea        rfilterq, [hevc_epel_filters_sse4_%1]
%else
    %define rfilterq hevc_epel_filters_sse4_%1
%endif
    sub        %2q, 1
    shl        %2q, 5                      ; multiply by 32
%if %0 == 2
    movdqa     m14, [rfilterq + %2q]       ; get first 2 filter coefficients
    movdqa     m15, [rfilterq + %2q+16]    ; get last 2 filter coefficients
%else
    movdqa     %3, [rfilterq + %2q]        ; get first 2 filter coefficients
    movdqa     %4, [rfilterq + %2q+16]     ; get last 2 filter coefficients
%endif
%endmacro
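; mx/my run from 1 to 7, so (idx-1) << 5 addresses one 32-byte EPEL table
; row (two 16-byte coefficient vectors).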
%macro EPEL_HV_FILTER 1
%ifdef PIC
    lea        rfilterq, [hevc_epel_filters_sse4_%1]
%else
    %define rfilterq hevc_epel_filters_sse4_%1
%endif
    sub        mxq, 1
    sub        myq, 1
    shl        mxq, 5                      ; multiply by 32
    shl        myq, 5                      ; multiply by 32
    movdqa     m14, [rfilterq + mxq]       ; get first 2 filter coefficients
    movdqa     m15, [rfilterq + mxq+16]    ; get last 2 filter coefficients
    lea        r3srcq, [srcstrideq*3]
%ifdef PIC
    lea        rfilterq, [hevc_epel_filters_sse4_10]
%else
    %define rfilterq hevc_epel_filters_sse4_10
%endif
    movdqa     m12, [rfilterq + myq]       ; get first 2 filter coefficients
    movdqa     m13, [rfilterq + myq+16]    ; get last 2 filter coefficients
%endmacro
%macro QPEL_FILTER 2
%ifdef PIC
    lea        rfilterq, [hevc_qpel_filters_sse4_%1]
%else
    %define rfilterq hevc_qpel_filters_sse4_%1
%endif
    lea        %2q, [%2q*8-8]
    movdqa     m12, [rfilterq + %2q*8]        ; get filter coefficients 1-2
    movdqa     m13, [rfilterq + %2q*8 + 16]   ; get filter coefficients 3-4
    movdqa     m14, [rfilterq + %2q*8 + 32]   ; get filter coefficients 5-6
    movdqa     m15, [rfilterq + %2q*8 + 48]   ; get filter coefficients 7-8
%endmacro
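; mx/my run from 1 to 3 here; [idx*8-8] scaled by 8 again in the addressing
; gives (idx-1)*64, one 64-byte QPEL table row (four coefficient vectors).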
%macro EPEL_LOAD 4
%ifdef PIC
    lea        rfilterq, [%2]
%else
    %define rfilterq %2
%endif
    movdqu     m0, [rfilterq]           ; load 128 bits of x
%ifnum %3
    movdqu     m1, [rfilterq+  %3]      ; load 128 bits of x+stride
    movdqu     m2, [rfilterq+2*%3]      ; load 128 bits of x+2*stride
    movdqu     m3, [rfilterq+3*%3]      ; load 128 bits of x+3*stride
%else
    movdqu     m1, [rfilterq+  %3q]     ; load 128 bits of x+stride
    movdqu     m2, [rfilterq+2*%3q]     ; load 128 bits of x+2*stride
    movdqu     m3, [rfilterq+r3srcq]    ; load 128 bits of x+3*stride
%endif
%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 10
    SBUTTERFLY bw, 2, 3, 10
%else
    punpcklbw  m0, m1
    punpcklbw  m2, m3
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 10
    SBUTTERFLY wd, 2, 3, 10
%else
    punpcklwd  m0, m1
    punpcklwd  m2, m3
%endif
%endif
%endmacro
%macro QPEL_H_LOAD 4
%assign %%stride (%1+7)/8
%if %1 == 8
%if %3 <= 4
    %define %%load movd
%elif %3 == 8
    %define %%load movq
%else
    %define %%load movdqu
%endif
%else
%if %3 == 2
    %define %%load movd
%elif %3 == 4
    %define %%load movq
%else
    %define %%load movdqu
%endif
%endif
    %%load     m0, [%2-3*%%stride]   ; load data from source
    %%load     m1, [%2-2*%%stride]
    %%load     m2, [%2-%%stride]
    %%load     m3, [%2]
    %%load     m4, [%2+%%stride]
    %%load     m5, [%2+2*%%stride]
    %%load     m6, [%2+3*%%stride]
    %%load     m7, [%2+4*%%stride]
%if %1 == 8
%if %3 > 8
    SBUTTERFLY wd, 0, 1, %4
    SBUTTERFLY wd, 2, 3, %4
    SBUTTERFLY wd, 4, 5, %4
    SBUTTERFLY wd, 6, 7, %4
%else
    punpcklwd  m0, m1
    punpcklwd  m2, m3
    punpcklwd  m4, m5
    punpcklwd  m6, m7
%endif
%else
%if %3 > 4
    SBUTTERFLY dq, 0, 1, %4
    SBUTTERFLY dq, 2, 3, %4
    SBUTTERFLY dq, 4, 5, %4
    SBUTTERFLY dq, 6, 7, %4
%else
    punpckldq  m0, m1
    punpckldq  m2, m3
    punpckldq  m4, m5
    punpckldq  m6, m7
%endif
%endif
%endmacro

%macro QPEL_V_LOAD 4
    lea        r12q, [%2]
    sub        r12q, r3srcq
    movdqu     m0, [r12]          ; load x - 3*srcstride
    movdqu     m1, [r12+  %3q]    ; load x - 2*srcstride
    movdqu     m2, [r12+2*%3q]    ; load x - srcstride
    movdqu     m3, [%2]           ; load x
    movdqu     m4, [%2+  %3q]     ; load x + srcstride
    movdqu     m5, [%2+2*%3q]     ; load x + 2*srcstride
    movdqu     m6, [%2+r3srcq]    ; load x + 3*srcstride
    movdqu     m7, [%2+4*%3q]     ; load x + 4*srcstride
%if %1 == 8
%if %4 > 8
    SBUTTERFLY bw, 0, 1, 8
    SBUTTERFLY bw, 2, 3, 8
    SBUTTERFLY bw, 4, 5, 8
    SBUTTERFLY bw, 6, 7, 8
%else
    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m7
%endif
%else
%if %4 > 4
    SBUTTERFLY wd, 0, 1, 8
    SBUTTERFLY wd, 2, 3, 8
    SBUTTERFLY wd, 4, 5, 8
    SBUTTERFLY wd, 6, 7, 8
%else
    punpcklwd  m0, m1
    punpcklwd  m2, m3
    punpcklwd  m4, m5
    punpcklwd  m6, m7
%endif
%endif
%endmacro
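; Both load macros finish by interleaving adjacent taps pairwise: punpckl*
; when the low halves suffice, SBUTTERFLY for wider blocks (it leaves the
; low interleave in the even register and the high interleave in the odd
; one), so the compute macros can accumulate two taps per pmadd instruction.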
%macro PEL_12STORE2 3
    movd       [%1], %2
%endmacro
%macro PEL_12STORE4 3
    movq       [%1], %2
%endmacro
%macro PEL_12STORE6 3
    movq       [%1], %2
    psrldq     %2, 8
    movd       [%1+8], %2
%endmacro
%macro PEL_12STORE8 3
    movdqa     [%1], %2
%endmacro
%macro PEL_12STORE12 3
    movdqa     [%1], %2
    movq       [%1+16], %3
%endmacro
%macro PEL_12STORE16 3
    PEL_12STORE8 %1, %2, %3
    movdqa     [%1+16], %3
%endmacro

%macro PEL_10STORE2 3
    movd       [%1], %2
%endmacro
%macro PEL_10STORE4 3
    movq       [%1], %2
%endmacro
%macro PEL_10STORE6 3
    movq       [%1], %2
    psrldq     %2, 8
    movd       [%1+8], %2
%endmacro
%macro PEL_10STORE8 3
    movdqa     [%1], %2
%endmacro
%macro PEL_10STORE12 3
    movdqa     [%1], %2
    movq       [%1+16], %3
%endmacro
%macro PEL_10STORE16 3
    PEL_10STORE8 %1, %2, %3
    movdqa     [%1+16], %3
%endmacro

%macro PEL_8STORE2 3
    pextrw     [%1], %2, 0
%endmacro
%macro PEL_8STORE4 3
    movd       [%1], %2
%endmacro
%macro PEL_8STORE6 3
    movd       [%1], %2
    pextrw     [%1+4], %2, 2
%endmacro
%macro PEL_8STORE8 3
    movq       [%1], %2
%endmacro
%macro PEL_8STORE12 3
    movq       [%1], %2
    psrldq     %2, 8
    movd       [%1+8], %2
%endmacro
%macro PEL_8STORE16 3
    movdqa     [%1], %2
%endmacro

%macro LOOP_END 4
    lea        %1q, [%1q+2*%2q]   ; dst += dststride (dst is int16_t)
    lea        %3q, [%3q+%4q]     ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
%endmacro
%macro MC_PIXEL_COMPUTE 2   ; width, bitdepth
%if %2 == 8
%if %1 > 8
    punpckhbw  m1, m0, m2
    psllw      m1, 14-%2
%endif
    punpcklbw  m0, m2
%endif
    psllw      m0, 14-%2
%endmacro
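; Full-pel copy: no filtering, just a left shift to the common 14-bit
; intermediate scale (6 bits at 8-bit depth, 4 at 10-bit, 2 at 12-bit).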
%macro EPEL_COMPUTE 4   ; bitdepth, width, filter1, filter2
%if %1 == 8
    pmaddubsw  m0, %3   ; x1*c1 + x2*c2
    pmaddubsw  m2, %4   ; x3*c3 + x4*c4
    paddw      m0, m2
%if %2 > 8
    pmaddubsw  m1, %3
    pmaddubsw  m3, %4
    paddw      m1, m3
%endif
%else
    pmaddwd    m0, %3
    pmaddwd    m2, %4
    paddd      m0, m2
%if %2 > 4
    pmaddwd    m1, %3
    pmaddwd    m3, %4
    paddd      m1, m3
%endif
%if %1 != 8
    psrad      m0, %1-8
    psrad      m1, %1-8
%endif
    packssdw   m0, m1
%endif
%endmacro
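; At 8-bit, pmaddubsw multiplies unsigned pixels by the signed byte
; coefficients directly. At higher depths, pmaddwd keeps full precision in
; dwords; psrad by (bitd-8) then drops the extra input bits so the packssdw
; result lands back on the 14-bit intermediate scale.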
%macro QPEL_HV_COMPUTE 4   ; width, bitdepth, filter idx, pack suffix (ackssdw/ackusdw)
%ifdef PIC
    lea        rfilterq, [hevc_qpel_filters_sse4_%2]
%else
    %define rfilterq hevc_qpel_filters_sse4_%2
%endif
%if %2 == 8
    pmaddubsw  m0, [rfilterq + %3q*8]      ; x1*c1 + x2*c2
    pmaddubsw  m2, [rfilterq + %3q*8+16]   ; x3*c3 + x4*c4
    pmaddubsw  m4, [rfilterq + %3q*8+32]   ; x5*c5 + x6*c6
    pmaddubsw  m6, [rfilterq + %3q*8+48]   ; x7*c7 + x8*c8
    paddw      m0, m2
    paddw      m4, m6
    paddw      m0, m4
%else
    pmaddwd    m0, [rfilterq + %3q*8]
    pmaddwd    m2, [rfilterq + %3q*8+16]
    pmaddwd    m4, [rfilterq + %3q*8+32]
    pmaddwd    m6, [rfilterq + %3q*8+48]
    paddd      m0, m2
    paddd      m4, m6
    paddd      m0, m4
%if %2 != 8
    psrad      m0, %2-8
%endif
%if %1 > 4
    pmaddwd    m1, [rfilterq + %3q*8]
    pmaddwd    m3, [rfilterq + %3q*8+16]
    pmaddwd    m5, [rfilterq + %3q*8+32]
    pmaddwd    m7, [rfilterq + %3q*8+48]
    paddd      m1, m3
    paddd      m5, m7
    paddd      m1, m5
%if %2 != 8
    psrad      m1, %2-8
%endif
%endif
    p%4        m0, m1
%endif
%endmacro
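; The trailing "p%4" pastes the fourth argument (ackssdw or ackusdw) onto
; "p", forming packssdw or packusdw, so callers choose signed or unsigned
; saturation for the final dword-to-word pack.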
%macro QPEL_COMPUTE 2   ; width, bitdepth
%if %2 == 8
    pmaddubsw  m0, m12   ; x1*c1 + x2*c2
    pmaddubsw  m2, m13   ; x3*c3 + x4*c4
    pmaddubsw  m4, m14   ; x5*c5 + x6*c6
    pmaddubsw  m6, m15   ; x7*c7 + x8*c8
    paddw      m0, m2
    paddw      m4, m6
    paddw      m0, m4
%if %1 > 8
    pmaddubsw  m1, m12
    pmaddubsw  m3, m13
    pmaddubsw  m5, m14
    pmaddubsw  m7, m15
    paddw      m1, m3
    paddw      m5, m7
    paddw      m1, m5
%endif
%else
    pmaddwd    m0, m12
    pmaddwd    m2, m13
    pmaddwd    m4, m14
    pmaddwd    m6, m15
    paddd      m0, m2
    paddd      m4, m6
    paddd      m0, m4
%if %2 != 8
    psrad      m0, %2-8
%endif
%if %1 > 4
    pmaddwd    m1, m12
    pmaddwd    m3, m13
    pmaddwd    m5, m14
    pmaddwd    m7, m15
    paddd      m1, m3
    paddd      m5, m7
    paddd      m1, m5
%if %2 != 8
    psrad      m1, %2-8
%endif
%endif
%endif
%endmacro
%macro BI_COMPUTE 7   ; width, bitd, src1l, src1h, src2l, src2h, pw
    paddsw     %3, %5
%if %1 > 8
    paddsw     %4, %6
%endif
    UNI_COMPUTE %1, %2, %3, %4, %7
%endmacro
%macro UNI_COMPUTE 5
    pmulhrsw   %3, %5
%if %1 > 8 || (%2 > 8 && %1 > 4)
    pmulhrsw   %4, %5
%endif
%if %2 == 8
    packuswb   %3, %4
%else
    pminsw     %3, [max_pixels_%2]
    pmaxsw     %3, [zero]
%if %1 > 8
    pminsw     %4, [max_pixels_%2]
    pmaxsw     %4, [zero]
%endif
%endif
%endmacro
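; pmulhrsw against the pw_* constant performs the rounded downshift from
; the 14-bit intermediate scale (see the note above SECTION_RODATA); 8-bit
; output then saturates via packuswb, higher depths clamp to
; [0, max_pixels_<bitd>] with pminsw/pmaxsw.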
INIT_XMM sse4   ; adds ff_ and _sse4 to function name

; ******************************
; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
;                         uint8_t *_src, ptrdiff_t _srcstride,
;                         int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_PEL_PIXELS 2
cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride, height
    pxor       m2, m2
.loop:
    SIMPLE_LOAD %1, %2, srcq, m0
    MC_PIXEL_COMPUTE %1, %2
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride, height
    pxor       m2, m2
.loop:
    SIMPLE_LOAD %1, %2, srcq, m0
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride, height
    pxor       m2, m2
    movdqa     m5, [pw_bi_%2]
.loop:
    SIMPLE_LOAD %1, %2, srcq, m0
    SIMPLE_BILOAD %1, src2q, m3, m4
    MC_PIXEL_COMPUTE %1, %2
    BI_COMPUTE %1, %2, m0, m1, m3, m4, m5
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET
%endmacro
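; Each MC kernel in this file comes in three flavours: put_* stores the
; 14-bit intermediate into the int16_t dst, uni_* rounds directly to
; pixels, and bi_* first adds the second int16_t source (src2) with paddsw
; before the shared rounding in BI_COMPUTE/UNI_COMPUTE.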
; ******************************
; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my,
;                       int16_t* mcbuffer)
; ******************************

%macro HEVC_PUT_HEVC_EPEL 2
cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    EPEL_FILTER %2, mx, m4, m5
.loop:
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
%assign %%stride ((%2 + 7)/8)
    movdqa     m6, [pw_%2]
    EPEL_FILTER %2, mx, m4, m5
.loop:
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    UNI_COMPUTE %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
    movdqa     m6, [pw_bi_%2]
    EPEL_FILTER %2, mx, m4, m5
.loop:
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET

; ******************************
; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
;                      uint8_t *_src, ptrdiff_t _srcstride,
;                      int width, int height, int mx, int my,
;                      int16_t* mcbuffer)
; ******************************

cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea        r3srcq, [srcstrideq*3]
    sub        srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop:
    EPEL_LOAD  %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea        r3srcq, [srcstrideq*3]
    movdqa     m6, [pw_%2]
    sub        srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop:
    EPEL_LOAD  %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    UNI_COMPUTE %1, %2, m0, m1, m6
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
    lea        r3srcq, [srcstrideq*3]
    movdqa     m6, [pw_bi_%2]
    sub        srcq, srcstrideq
    EPEL_FILTER %2, my, m4, m5
.loop:
    EPEL_LOAD  %2, srcq, srcstride, %1
    EPEL_COMPUTE %2, %1, m4, m5
    SIMPLE_BILOAD %1, src2q, m2, m3
    BI_COMPUTE %1, %2, m0, m1, m2, m3, m6
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET
%endmacro
; ******************************
; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
;                       uint8_t *_src, ptrdiff_t _srcstride,
;                       int width, int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_EPEL_HV 2
cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub        srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m4, m0
    add        srcq, srcstrideq
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m5, m0
    add        srcq, srcstrideq
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m6, m0
    add        srcq, srcstrideq
.loop:
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m7, m0
    punpcklwd  m0, m4, m5
    punpcklwd  m2, m6, m7
%if %1 > 4
    punpckhwd  m1, m4, m5
    punpckhwd  m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    PEL_10STORE%1 dstq, m0, m1
    movdqa     m4, m5
    movdqa     m5, m6
    movdqa     m6, m7
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub        srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m4, m0
    add        srcq, srcstrideq
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m5, m0
    add        srcq, srcstrideq
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m6, m0
    add        srcq, srcstrideq
.loop:
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m7, m0
    punpcklwd  m0, m4, m5
    punpcklwd  m2, m6, m7
%if %1 > 4
    punpckhwd  m1, m4, m5
    punpckhwd  m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1 dstq, m0, m1
    movdqa     m4, m5
    movdqa     m5, m6
    movdqa     m6, m7
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
%assign %%stride ((%2 + 7)/8)
    sub        srcq, srcstrideq
    EPEL_HV_FILTER %2
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m4, m0
    add        srcq, srcstrideq
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m5, m0
    add        srcq, srcstrideq
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m6, m0
    add        srcq, srcstrideq
.loop:
    EPEL_LOAD  %2, srcq-%%stride, %%stride, %1
    EPEL_COMPUTE %2, %1, m14, m15
    SWAP       m7, m0
    punpcklwd  m0, m4, m5
    punpcklwd  m2, m6, m7
%if %1 > 4
    punpckhwd  m1, m4, m5
    punpckhwd  m3, m6, m7
%endif
    EPEL_COMPUTE 14, %1, m12, m13
    SIMPLE_BILOAD %1, src2q, m8, m9
    BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1 dstq, m0, m1
    movdqa     m4, m5
    movdqa     m5, m6
    movdqa     m6, m7
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET
%endmacro
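; The hv kernels prime the vertical filter with three horizontally filtered
; rows in m4-m6, then run a rolling window: each iteration filters one new
; row, applies the vertical pass on the 14-bit data (EPEL_COMPUTE 14 with
; the word filters in m12/m13) and shifts the window down one register.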
; ******************************
; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_QPEL 2
cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15, dst, dststride, src, srcstride, height, mx, rfilter
    QPEL_FILTER %2, mx
.loop:
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw   m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15, dst, dststride, src, srcstride, height, mx, rfilter
    movdqa     m9, [pw_%2]
    QPEL_FILTER %2, mx
.loop:
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw   m0, m1
%endif
    UNI_COMPUTE %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
    movdqa     m9, [pw_bi_%2]
    QPEL_FILTER %2, mx
.loop:
    QPEL_H_LOAD %2, srcq, %1, 10
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw   m0, m1
%endif
    SIMPLE_BILOAD %1, src2q, m10, m11
    BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET

; ******************************
; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
;                           uint8_t *_src, ptrdiff_t _srcstride,
;                           int width, int height, int mx, int my)
; ******************************

cglobal hevc_put_hevc_qpel_v%1_%2, 7, 14, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
    lea        r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop:
    QPEL_V_LOAD %2, srcq, srcstride, %1
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw   m0, m1
%endif
    PEL_10STORE%1 dstq, m0, m1
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 14, 15, dst, dststride, src, srcstride, height, r3src, my, rfilter
    movdqa     m9, [pw_%2]
    lea        r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop:
    QPEL_V_LOAD %2, srcq, srcstride, %1
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packusdw   m0, m1
%endif
    UNI_COMPUTE %1, %2, m0, m1, m9
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 14, 16, dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
    movdqa     m9, [pw_bi_%2]
    lea        r3srcq, [srcstrideq*3]
    QPEL_FILTER %2, my
.loop:
    SIMPLE_BILOAD %1, src2q, m10, m11
    QPEL_V_LOAD %2, srcq, srcstride, %1
    QPEL_COMPUTE %1, %2
%if %2 > 8
    packssdw   m0, m1
%endif
    BI_COMPUTE %1, %2, m0, m1, m10, m11, m9
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET
%endmacro
; ******************************
; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
;                          uint8_t *_src, ptrdiff_t _srcstride,
;                          int height, int mx, int my)
; ******************************

%macro HEVC_PUT_HEVC_QPEL_HV 2
cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
    lea        mxq, [mxq*8-8]
    lea        myq, [myq*8-8]
    lea        r3srcq, [srcstrideq*3]
    sub        srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m8, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m9, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m10, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m11, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m12, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m13, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m14, m0
    add        srcq, srcstrideq
.loop:
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m15, m0
    punpcklwd  m0, m8, m9
    punpcklwd  m2, m10, m11
    punpcklwd  m4, m12, m13
    punpcklwd  m6, m14, m15
%if %1 > 4
    punpckhwd  m1, m8, m9
    punpckhwd  m3, m10, m11
    punpckhwd  m5, m12, m13
    punpckhwd  m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    PEL_10STORE%1 dstq, m0, m1
%if %1 <= 4
    movq       m8, m9
    movq       m9, m10
    movq       m10, m11
    movq       m11, m12
    movq       m12, m13
    movq       m13, m14
    movq       m14, m15
%else
    movdqa     m8, m9
    movdqa     m9, m10
    movdqa     m10, m11
    movdqa     m11, m12
    movdqa     m12, m13
    movdqa     m13, m14
    movdqa     m14, m15
%endif
    LOOP_END   dst, dststride, src, srcstride
    RET

cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12, dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
    lea        mxq, [mxq*8-8]
    lea        myq, [myq*8-8]
    lea        r3srcq, [srcstrideq*3]
    sub        srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m8, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m9, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m10, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m11, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m12, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m13, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m14, m0
    add        srcq, srcstrideq
.loop:
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m15, m0
    punpcklwd  m0, m8, m9
    punpcklwd  m2, m10, m11
    punpcklwd  m4, m12, m13
    punpcklwd  m6, m14, m15
%if %1 > 4
    punpckhwd  m1, m8, m9
    punpckhwd  m3, m10, m11
    punpckhwd  m5, m12, m13
    punpckhwd  m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackusdw
    UNI_COMPUTE %1, %2, m0, m1, [pw_%2]
    PEL_%2STORE%1 dstq, m0, m1
%if %1 <= 4
    movq       m8, m9
    movq       m9, m10
    movq       m10, m11
    movq       m11, m12
    movq       m12, m13
    movq       m13, m14
    movq       m14, m15
%else
    movdqa     m8, m9
    movdqa     m9, m10
    movdqa     m10, m11
    movdqa     m11, m12
    movdqa     m12, m13
    movdqa     m13, m14
    movdqa     m14, m15
%endif
    add        dstq, dststrideq   ; dst += dststride
    add        srcq, srcstrideq   ; src += srcstride
    dec        heightd            ; cmp height
    jnz        .loop              ; height loop
    RET

cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
    lea        mxq, [mxq*8-8]
    lea        myq, [myq*8-8]
    lea        r3srcq, [srcstrideq*3]
    sub        srcq, r3srcq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m8, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m9, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m10, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m11, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m12, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m13, m0
    add        srcq, srcstrideq
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m14, m0
    add        srcq, srcstrideq
.loop:
    QPEL_H_LOAD %2, srcq, %1, 15
    QPEL_HV_COMPUTE %1, %2, mx, ackssdw
    SWAP       m15, m0
    punpcklwd  m0, m8, m9
    punpcklwd  m2, m10, m11
    punpcklwd  m4, m12, m13
    punpcklwd  m6, m14, m15
%if %1 > 4
    punpckhwd  m1, m8, m9
    punpckhwd  m3, m10, m11
    punpckhwd  m5, m12, m13
    punpckhwd  m7, m14, m15
%endif
    QPEL_HV_COMPUTE %1, 14, my, ackssdw
    SIMPLE_BILOAD %1, src2q, m8, m9   ; m9 not used in this case
    BI_COMPUTE %1, %2, m0, m1, m8, m9, [pw_bi_%2]
    PEL_%2STORE%1 dstq, m0, m1
%if %1 <= 4
    movq       m8, m9
    movq       m9, m10
    movq       m10, m11
    movq       m11, m12
    movq       m12, m13
    movq       m13, m14
    movq       m14, m15
%else
    movdqa     m8, m9
    movdqa     m9, m10
    movdqa     m10, m11
    movdqa     m11, m12
    movdqa     m12, m13
    movdqa     m13, m14
    movdqa     m14, m15
%endif
    add        dstq, dststrideq               ; dst += dststride
    add        srcq, srcstrideq               ; src += srcstride
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        heightd                        ; cmp height
    jnz        .loop                          ; height loop
    RET
%endmacro
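; Same rolling-window idea as the epel hv kernels, but the 8-tap filter
; needs seven primed rows (m8-m14) and rotates eight registers per line;
; for widths up to 4 a movq rotation is presumably enough since only the
; low 8 bytes of each row are live.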
%macro WEIGHTING_FUNCS 2
%if WIN64 || ARCH_X86_32
cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox
    mov        r4d, denomm
    %define SHIFT r4d
%else
cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox
    %define SHIFT denomd
%endif
    lea        SHIFT, [SHIFT+14-%2]   ; shift = 14 - bitd + denom
    movd       m2, wxm                ; WX
    movd       m4, SHIFT              ; shift
    punpcklwd  m2, m2
    dec        SHIFT
    movdqu     m5, [one_per_32]
    movd       m6, SHIFT
    pshufd     m2, m2, 0
    mov        SHIFT, oxm
    pslld      m5, m6
%if %2 != 8
    shl        SHIFT, %2-8            ; ox << (bitd - 8)
%endif
    movd       m3, SHIFT              ; OX
    pshufd     m3, m3, 0
%if WIN64 || ARCH_X86_32
    mov        SHIFT, heightm
%endif
.loop:
    SIMPLE_LOAD %1, 10, srcq, m0
    pmulhw     m6, m0, m2
    pmullw     m0, m2
    punpckhwd  m1, m0, m6
    punpcklwd  m0, m6
    paddd      m0, m5
    paddd      m1, m5
    psrad      m0, m4
    psrad      m1, m4
    paddd      m0, m3
    paddd      m1, m3
    packusdw   m0, m1
%if %2 == 8
    packuswb   m0, m0
%else
    pminsw     m0, [max_pixels_%2]
%endif
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq            ; dst += dststride
    lea        srcq, [srcq+2*srcstrideq]   ; src += srcstride (int16_t)
    dec        heightd                     ; cmp height
    jnz        .loop                       ; height loop
    RET

cglobal hevc_put_hevc_bi_w%1_%2, 6, 7, 10, dst, dststride, src, srcstride, src2, src2stride, height, denom, wx0, wx1, ox0, ox1
    mov        r6d, denomm
    movd       m2, wx0m               ; WX0
    lea        r6d, [r6d+14-%2]       ; shift = 14 - bitd + denom
    movd       m3, wx1m               ; WX1
    movd       m0, r6d                ; shift
    punpcklwd  m2, m2
    inc        r6d
    punpcklwd  m3, m3
    movd       m5, r6d                ; shift+1
    pshufd     m2, m2, 0
    mov        r6d, ox0m
    pshufd     m3, m3, 0
    add        r6d, ox1m
%if %2 != 8
    shl        r6d, %2-8              ; ox << (bitd - 8)
%endif
    inc        r6d
    movd       m4, r6d                ; offset
    pshufd     m4, m4, 0
    mov        r6d, heightm
    pslld      m4, m0
.loop:
    SIMPLE_LOAD %1, 10, srcq, m0
    SIMPLE_LOAD %1, 10, src2q, m8
    pmulhw     m6, m0, m3
    pmullw     m0, m3
    pmulhw     m7, m8, m2
    pmullw     m8, m2
    punpckhwd  m1, m0, m6
    punpcklwd  m0, m6
    punpckhwd  m9, m8, m7
    punpcklwd  m8, m7
    paddd      m0, m8
    paddd      m1, m9
    paddd      m0, m4
    paddd      m1, m4
    psrad      m0, m5
    psrad      m1, m5
    packusdw   m0, m1
%if %2 == 8
    packuswb   m0, m0
%else
    pminsw     m0, [max_pixels_%2]
%endif
    PEL_%2STORE%1 dstq, m0, m1
    add        dstq, dststrideq               ; dst += dststride
    lea        srcq, [srcq+2*srcstrideq]      ; src += srcstride (int16_t)
    lea        src2q, [src2q+2*src2strideq]   ; src2 += src2stride (int16_t)
    dec        r6d                            ; cmp height
    jnz        .loop                          ; height loop
    RET
%endmacro
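; Explicit weighted prediction, with shift = 14 - bitd + denom and ox scaled
; by (bitd - 8) above 8-bit:
;   uni: dst = clip(((src * wx + (1 << (shift-1))) >> shift) + ox)
;   bi:  dst = clip((src*wx1 + src2*wx0 + ((ox0+ox1+1) << shift)) >> (shift+1))
; The pmullw/pmulhw pairs rebuild full 32-bit products from the 16-bit
; inputs before the rounding and clipping.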
WEIGHTING_FUNCS 2, 8
WEIGHTING_FUNCS 4, 8
WEIGHTING_FUNCS 6, 8
WEIGHTING_FUNCS 8, 8

WEIGHTING_FUNCS 2, 10
WEIGHTING_FUNCS 4, 10
WEIGHTING_FUNCS 6, 10
WEIGHTING_FUNCS 8, 10

WEIGHTING_FUNCS 2, 12
WEIGHTING_FUNCS 4, 12
WEIGHTING_FUNCS 6, 12
WEIGHTING_FUNCS 8, 12

HEVC_PUT_HEVC_PEL_PIXELS  2, 8
HEVC_PUT_HEVC_PEL_PIXELS  4, 8
HEVC_PUT_HEVC_PEL_PIXELS  6, 8
HEVC_PUT_HEVC_PEL_PIXELS  8, 8
HEVC_PUT_HEVC_PEL_PIXELS 12, 8
HEVC_PUT_HEVC_PEL_PIXELS 16, 8

HEVC_PUT_HEVC_PEL_PIXELS 2, 10
HEVC_PUT_HEVC_PEL_PIXELS 4, 10
HEVC_PUT_HEVC_PEL_PIXELS 6, 10
HEVC_PUT_HEVC_PEL_PIXELS 8, 10

HEVC_PUT_HEVC_PEL_PIXELS 2, 12
HEVC_PUT_HEVC_PEL_PIXELS 4, 12
HEVC_PUT_HEVC_PEL_PIXELS 6, 12
HEVC_PUT_HEVC_PEL_PIXELS 8, 12

HEVC_PUT_HEVC_EPEL  2, 8
HEVC_PUT_HEVC_EPEL  4, 8
HEVC_PUT_HEVC_EPEL  6, 8
HEVC_PUT_HEVC_EPEL  8, 8
HEVC_PUT_HEVC_EPEL 12, 8
HEVC_PUT_HEVC_EPEL 16, 8

HEVC_PUT_HEVC_EPEL 2, 10
HEVC_PUT_HEVC_EPEL 4, 10
HEVC_PUT_HEVC_EPEL 6, 10
HEVC_PUT_HEVC_EPEL 8, 10

HEVC_PUT_HEVC_EPEL 2, 12
HEVC_PUT_HEVC_EPEL 4, 12
HEVC_PUT_HEVC_EPEL 6, 12
HEVC_PUT_HEVC_EPEL 8, 12

HEVC_PUT_HEVC_EPEL_HV 2, 8
HEVC_PUT_HEVC_EPEL_HV 4, 8
HEVC_PUT_HEVC_EPEL_HV 6, 8
HEVC_PUT_HEVC_EPEL_HV 8, 8

HEVC_PUT_HEVC_EPEL_HV 2, 10
HEVC_PUT_HEVC_EPEL_HV 4, 10
HEVC_PUT_HEVC_EPEL_HV 6, 10
HEVC_PUT_HEVC_EPEL_HV 8, 10

HEVC_PUT_HEVC_EPEL_HV 2, 12
HEVC_PUT_HEVC_EPEL_HV 4, 12
HEVC_PUT_HEVC_EPEL_HV 6, 12
HEVC_PUT_HEVC_EPEL_HV 8, 12

HEVC_PUT_HEVC_QPEL  4, 8
HEVC_PUT_HEVC_QPEL  8, 8
HEVC_PUT_HEVC_QPEL 12, 8
HEVC_PUT_HEVC_QPEL 16, 8

HEVC_PUT_HEVC_QPEL 4, 10
HEVC_PUT_HEVC_QPEL 8, 10

HEVC_PUT_HEVC_QPEL 4, 12
HEVC_PUT_HEVC_QPEL 8, 12

HEVC_PUT_HEVC_QPEL_HV 2, 8
HEVC_PUT_HEVC_QPEL_HV 4, 8
HEVC_PUT_HEVC_QPEL_HV 6, 8
HEVC_PUT_HEVC_QPEL_HV 8, 8

HEVC_PUT_HEVC_QPEL_HV 2, 10
HEVC_PUT_HEVC_QPEL_HV 4, 10
HEVC_PUT_HEVC_QPEL_HV 6, 10
HEVC_PUT_HEVC_QPEL_HV 8, 10

HEVC_PUT_HEVC_QPEL_HV 2, 12
HEVC_PUT_HEVC_QPEL_HV 4, 12
HEVC_PUT_HEVC_QPEL_HV 6, 12
HEVC_PUT_HEVC_QPEL_HV 8, 12

%endif ; ARCH_X86_64