;******************************************************************************
;* VP9 motion compensation SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_256
cextern pw_64
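
; Each 8-tap filter is stored in two layouts: the SSSE3 variant packs
; adjacent tap pairs as signed bytes (operands for pmaddubsw), while the
; SSE2 variant broadcasts every tap to eight words (operands for pmullw).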
%macro F8_SSSE3_TAPS 8
times 16 db %1, %2
times 16 db %3, %4
times 16 db %5, %6
times 16 db %7, %8
%endmacro

%macro F8_SSE2_TAPS 8
times 8 dw %1
times 8 dw %2
times 8 dw %3
times 8 dw %4
times 8 dw %5
times 8 dw %6
times 8 dw %7
times 8 dw %8
%endmacro
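
; VP9 sub-pel filter coefficients: three banks (smooth, regular, sharp) of
; 15 fractional phases each. Phase 0 (full-pel) is handled by the plain
; copy/avg functions at the end of this file.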
%macro FILTER 1
const filters_%1 ; smooth
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0
F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0
F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0
F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0
F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0
F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1
F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1
F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1
F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1
F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2
F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2
F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2
F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2
F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3
; regular
F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0
F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0
F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1
F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1
F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1
F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1
F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1
F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1
F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1
F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1
F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1
F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1
F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1
F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1
F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0
; sharp
F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0
F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1
F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2
F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2
F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3
F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3
F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4
F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4
F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4
F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4
F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4
F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4
F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3
F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1
%endmacro

%define F8_TAPS F8_SSSE3_TAPS
; int8_t ff_filters_ssse3[3][15][4][32]
FILTER ssse3
%define F8_TAPS F8_SSE2_TAPS
; int16_t ff_filters_sse2[3][15][8][8]
FILTER sse2

SECTION .text
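
; Horizontal 8-tap filter over mmsize/2 pixels per row for the MMXEXT/SSE2
; paths: source bytes are widened to words, multiplied by the word-sized taps
; with pmullw (held in registers where enough are available, re-loaded from
; memory otherwise) and rounded with (sum + 64) >> 7.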
%macro filter_sse2_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 15, dst, dstride, src, sstride, h, filtery
    pxor        m5, m5
    mova        m6, [pw_64]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova        m10, [filteryq+ 48]
    mova        m11, [filteryq+ 64]
    mova        m12, [filteryq+ 80]
    mova        m13, [filteryq+ 96]
    mova        m14, [filteryq+112]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [srcq+2]
    movh        m3, [srcq+3]
    movh        m4, [srcq+4]
    add         srcq, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_h_fn put
filter_sse2_h_fn avg

INIT_XMM sse2
filter_sse2_h_fn put
filter_sse2_h_fn avg
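
; SSSE3 horizontal 8-tap filter over mmsize/2 pixels per row: neighbouring
; source bytes are interleaved and multiplied against the packed byte taps
; with pmaddubsw; pmulhrsw by pw_256 then performs the (sum + 64) >> 7
; rounding.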
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
    mova        m6, [pw_256]
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova        m10, [filteryq+96]
%endif
.loop:
    movh        m0, [srcq-3]
    movh        m1, [srcq-2]
    movh        m2, [srcq-1]
    movh        m3, [srcq+0]
    movh        m4, [srcq+1]
    movh        m5, [srcq+2]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [srcq+3]
    movh        m3, [srcq+4]
    add         srcq, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg

INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg
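
; Full-register-width (mmsize pixels per row) horizontal filter; x86-64 only,
; since it needs more than 8 SIMD registers.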
%if ARCH_X86_64
%macro filter_hx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_h_ %+ %%px, 6, 6, 14, dst, dstride, src, sstride, h, filtery
    mova        m13, [pw_256]
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova        m10, [filteryq+64]
    mova        m11, [filteryq+96]
.loop:
    movu        m0, [srcq-3]
    movu        m1, [srcq-2]
    movu        m2, [srcq-1]
    movu        m3, [srcq+0]
    movu        m4, [srcq+1]
    movu        m5, [srcq+2]
    movu        m6, [srcq+3]
    movu        m7, [srcq+4]
    add         srcq, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_hx2_fn put
filter_hx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_hx2_fn put
filter_hx2_fn avg
%endif

%endif ; ARCH_X86_64
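
; Vertical 8-tap filter (MMXEXT/SSE2): same arithmetic as the horizontal
; version, with the eight input rows addressed through two base pointers,
; srcq (rows -3..0) and src4q (rows +1..+4), plus stride multiples.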
%macro filter_sse2_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 15, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 15, dst, dstride, src, sstride, filtery, src4, sstride3
    mov         filteryq, r5mp
%define hd r4mp
%endif
    pxor        m5, m5
    mova        m6, [pw_64]
    lea         sstride3q, [sstrideq*3]
    lea         src4q, [srcq+sstrideq]
    sub         srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+ 16]
    mova        m9, [filteryq+ 32]
    mova        m10, [filteryq+ 48]
    mova        m11, [filteryq+ 64]
    mova        m12, [filteryq+ 80]
    mova        m13, [filteryq+ 96]
    mova        m14, [filteryq+112]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    add         srcq, sstrideq
    movh        m4, [src4q]
    punpcklbw   m0, m5
    punpcklbw   m1, m5
    punpcklbw   m2, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
    pmullw      m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m8
    pmullw      m2, m9
    pmullw      m3, m10
    pmullw      m4, m11
%else
    pmullw      m1, [filteryq+ 16]
    pmullw      m2, [filteryq+ 32]
    pmullw      m3, [filteryq+ 48]
    pmullw      m4, [filteryq+ 64]
%endif
    paddw       m0, m1
    paddw       m2, m3
    paddw       m0, m4
    movh        m1, [src4q+sstrideq]
    movh        m3, [src4q+sstrideq*2]
    movh        m4, [src4q+sstride3q]
    add         src4q, sstrideq
    punpcklbw   m1, m5
    punpcklbw   m3, m5
    punpcklbw   m4, m5
%if ARCH_X86_64 && mmsize > 8
    pmullw      m1, m12
    pmullw      m3, m13
    pmullw      m4, m14
%else
    pmullw      m1, [filteryq+ 80]
    pmullw      m3, [filteryq+ 96]
    pmullw      m4, [filteryq+112]
%endif
    paddw       m0, m1
    paddw       m3, m4
    paddw       m0, m6
    paddw       m2, m3
    paddsw      m0, m2
    psraw       m0, 7
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX mmxext
filter_sse2_v_fn put
filter_sse2_v_fn avg

INIT_XMM sse2
filter_sse2_v_fn put
filter_sse2_v_fn avg
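
; SSSE3 vertical 8-tap filter over mmsize/2 pixels per row, using pmaddubsw
; on byte-interleaved row pairs and pmulhrsw rounding, as in the horizontal
; case.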
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
    mov         filteryq, r5mp
%define hd r4mp
%endif
    mova        m6, [pw_256]
    lea         sstride3q, [sstrideq*3]
    lea         src4q, [srcq+sstrideq]
    sub         srcq, sstride3q
    mova        m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
    mova        m8, [filteryq+32]
    mova        m9, [filteryq+64]
    mova        m10, [filteryq+96]
%endif
.loop:
    ; FIXME maybe reuse loads from previous rows, or just more generally
    ; unroll this to prevent multiple loads of the same data?
    movh        m0, [srcq]
    movh        m1, [srcq+sstrideq]
    movh        m2, [srcq+sstrideq*2]
    movh        m3, [srcq+sstride3q]
    movh        m4, [src4q]
    movh        m5, [src4q+sstrideq]
    punpcklbw   m0, m1
    punpcklbw   m2, m3
    movh        m1, [src4q+sstrideq*2]
    movh        m3, [src4q+sstride3q]
    add         srcq, sstrideq
    add         src4q, sstrideq
    punpcklbw   m4, m5
    punpcklbw   m1, m3
    pmaddubsw   m0, m7
%if ARCH_X86_64 && mmsize > 8
    pmaddubsw   m2, m8
    pmaddubsw   m4, m9
    pmaddubsw   m1, m10
%else
    pmaddubsw   m2, [filteryq+32]
    pmaddubsw   m4, [filteryq+64]
    pmaddubsw   m1, [filteryq+96]
%endif
    paddw       m0, m4
    paddw       m2, m1
    paddsw      m0, m2
    pmulhrsw    m0, m6
%ifidn %1, avg
    movh        m1, [dstq]
%endif
    packuswb    m0, m0
%ifidn %1, avg
    pavgb       m0, m1
%endif
    movh        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg

INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg
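
; Full-register-width vertical filter (SSSE3/AVX2); x86-64 only for the same
; register-count reason as the horizontal variant.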
%if ARCH_X86_64
%macro filter_vx2_fn 1
%assign %%px mmsize
cglobal vp9_%1_8tap_1d_v_ %+ %%px, 6, 8, 14, dst, dstride, src, sstride, h, filtery, src4, sstride3
    mova        m13, [pw_256]
    lea         sstride3q, [sstrideq*3]
    lea         src4q, [srcq+sstrideq]
    sub         srcq, sstride3q
    mova        m8, [filteryq+ 0]
    mova        m9, [filteryq+32]
    mova        m10, [filteryq+64]
    mova        m11, [filteryq+96]
.loop:
    ; FIXME maybe reuse loads from previous rows, or just
    ; more generally unroll this to prevent multiple loads of
    ; the same data?
    movu        m0, [srcq]
    movu        m1, [srcq+sstrideq]
    movu        m2, [srcq+sstrideq*2]
    movu        m3, [srcq+sstride3q]
    movu        m4, [src4q]
    movu        m5, [src4q+sstrideq]
    movu        m6, [src4q+sstrideq*2]
    movu        m7, [src4q+sstride3q]
    add         srcq, sstrideq
    add         src4q, sstrideq
    SBUTTERFLY  bw, 0, 1, 12
    SBUTTERFLY  bw, 2, 3, 12
    SBUTTERFLY  bw, 4, 5, 12
    SBUTTERFLY  bw, 6, 7, 12
    pmaddubsw   m0, m8
    pmaddubsw   m1, m8
    pmaddubsw   m2, m9
    pmaddubsw   m3, m9
    pmaddubsw   m4, m10
    pmaddubsw   m5, m10
    pmaddubsw   m6, m11
    pmaddubsw   m7, m11
    paddw       m0, m4
    paddw       m1, m5
    paddw       m2, m6
    paddw       m3, m7
    paddsw      m0, m2
    paddsw      m1, m3
    pmulhrsw    m0, m13
    pmulhrsw    m1, m13
    packuswb    m0, m1
%ifidn %1, avg
    pavgb       m0, [dstq]
%endif
    mova        [dstq], m0
    add         dstq, dstrideq
    dec         hd
    jg .loop
    RET
%endmacro

INIT_XMM ssse3
filter_vx2_fn put
filter_vx2_fn avg

%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
filter_vx2_fn put
filter_vx2_fn avg
%endif

%endif ; ARCH_X86_64
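
; Full-pel copy/average block functions: %1 is the operation (put/avg),
; %2 the block width in pixels, %3-%5 the offsets of the second, third and
; fourth load/store per iteration (stride multiples or register-width steps,
; prefixed with "s"/"d" below to pick the source or destination stride), and
; %6 the number of rows handled per loop iteration.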
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif
%if %2 <= mmsize
cglobal vp9_%1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
    lea         sstride3q, [sstrideq*3]
    lea         dstride3q, [dstrideq*3]
%else
cglobal vp9_%1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
    %%srcfn     m0, [srcq]
    %%srcfn     m1, [srcq+s%3]
    %%srcfn     m2, [srcq+s%4]
    %%srcfn     m3, [srcq+s%5]
    lea         srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
    pavgb       m0, [dstq]
    pavgb       m1, [dstq+d%3]
    pavgb       m2, [dstq+d%4]
    pavgb       m3, [dstq+d%5]
%endif
    %%dstfn     [dstq], m0
    %%dstfn     [dstq+d%3], m1
    %%dstfn     [dstq+d%4], m2
    %%dstfn     [dstq+d%5], m3
    lea         dstq, [dstq+dstrideq*%6]
    sub         hd, %6
    jnz .loop
    RET
%endmacro

%define d16 16
%define s16 16
%define d32 32
%define s32 32
INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX mmxext
fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
INIT_YMM avx
fpel_fn put, 32, strideq, strideq*2, stride3q, 4
fpel_fn put, 64, mmsize, strideq, strideq+mmsize, 2
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
fpel_fn avg, 32, strideq, strideq*2, stride3q, 4
fpel_fn avg, 64, mmsize, strideq, strideq+mmsize, 2
%endif
%undef s16
%undef d16
%undef s32
%undef d32