;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of Libav.
;*
;* Libav is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* Libav is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with Libav; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA 32

cextern pw_16
cextern pw_5
cextern pb_0

SECTION .text
  31. %macro op_avgh 3
  32. movh %3, %2
  33. pavgb %1, %3
  34. movh %2, %1
  35. %endmacro
  36. %macro op_avg 2-3
  37. pavgb %1, %2
  38. mova %2, %1
  39. %endmacro
  40. %macro op_puth 2-3
  41. movh %2, %1
  42. %endmacro
  43. %macro op_put 2-3
  44. mova %2, %1
  45. %endmacro
  46. %macro QPEL4_H_LOWPASS_OP 1
  47. cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
  48. movsxdifnidn r2, r2d
  49. movsxdifnidn r3, r3d
  50. pxor m7, m7
  51. mova m4, [pw_5]
  52. mova m5, [pw_16]
  53. mov r4d, 4
  54. .loop:
  55. movh m1, [r1-1]
  56. movh m2, [r1+0]
  57. movh m3, [r1+1]
  58. movh m0, [r1+2]
  59. punpcklbw m1, m7
  60. punpcklbw m2, m7
  61. punpcklbw m3, m7
  62. punpcklbw m0, m7
  63. paddw m1, m0
  64. paddw m2, m3
  65. movh m0, [r1-2]
  66. movh m3, [r1+3]
  67. punpcklbw m0, m7
  68. punpcklbw m3, m7
  69. paddw m0, m3
  70. psllw m2, 2
  71. psubw m2, m1
  72. pmullw m2, m4
  73. paddw m0, m5
  74. paddw m0, m2
  75. psraw m0, 5
  76. packuswb m0, m0
  77. op_%1h m0, [r0], m6
  78. add r0, r2
  79. add r1, r3
  80. dec r4d
  81. jg .loop
  82. REP_RET
  83. %endmacro
  84. INIT_MMX mmxext
  85. QPEL4_H_LOWPASS_OP put
  86. QPEL4_H_LOWPASS_OP avg
  87. %macro QPEL8_H_LOWPASS_OP 1
  88. cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
  89. movsxdifnidn r2, r2d
  90. movsxdifnidn r3, r3d
  91. mov r4d, 8
  92. pxor m7, m7
  93. mova m6, [pw_5]
  94. .loop:
  95. mova m0, [r1]
  96. mova m2, [r1+1]
  97. mova m1, m0
  98. mova m3, m2
  99. punpcklbw m0, m7
  100. punpckhbw m1, m7
  101. punpcklbw m2, m7
  102. punpckhbw m3, m7
  103. paddw m0, m2
  104. paddw m1, m3
  105. psllw m0, 2
  106. psllw m1, 2
  107. mova m2, [r1-1]
  108. mova m4, [r1+2]
  109. mova m3, m2
  110. mova m5, m4
  111. punpcklbw m2, m7
  112. punpckhbw m3, m7
  113. punpcklbw m4, m7
  114. punpckhbw m5, m7
  115. paddw m2, m4
  116. paddw m5, m3
  117. psubw m0, m2
  118. psubw m1, m5
  119. pmullw m0, m6
  120. pmullw m1, m6
  121. movd m2, [r1-2]
  122. movd m5, [r1+7]
  123. punpcklbw m2, m7
  124. punpcklbw m5, m7
  125. paddw m2, m3
  126. paddw m4, m5
  127. mova m5, [pw_16]
  128. paddw m2, m5
  129. paddw m4, m5
  130. paddw m0, m2
  131. paddw m1, m4
  132. psraw m0, 5
  133. psraw m1, 5
  134. packuswb m0, m1
  135. op_%1 m0, [r0], m4
  136. add r0, r2
  137. add r1, r3
  138. dec r4d
  139. jg .loop
  140. REP_RET
  141. %endmacro
  142. INIT_MMX mmxext
  143. QPEL8_H_LOWPASS_OP put
  144. QPEL8_H_LOWPASS_OP avg
  145. %macro QPEL8_H_LOWPASS_OP_XMM 1
  146. cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
  147. movsxdifnidn r2, r2d
  148. movsxdifnidn r3, r3d
  149. mov r4d, 8
  150. pxor m7, m7
  151. mova m6, [pw_5]
  152. .loop:
  153. movu m1, [r1-2]
  154. mova m0, m1
  155. punpckhbw m1, m7
  156. punpcklbw m0, m7
  157. mova m2, m1
  158. mova m3, m1
  159. mova m4, m1
  160. mova m5, m1
  161. palignr m4, m0, 2
  162. palignr m3, m0, 4
  163. palignr m2, m0, 6
  164. palignr m1, m0, 8
  165. palignr m5, m0, 10
  166. paddw m0, m5
  167. paddw m2, m3
  168. paddw m1, m4
  169. psllw m2, 2
  170. psubw m2, m1
  171. paddw m0, [pw_16]
  172. pmullw m2, m6
  173. paddw m2, m0
  174. psraw m2, 5
  175. packuswb m2, m2
  176. op_%1h m2, [r0], m4
  177. add r1, r3
  178. add r0, r2
  179. dec r4d
  180. jne .loop
  181. REP_RET
  182. %endmacro
  183. INIT_XMM ssse3
  184. QPEL8_H_LOWPASS_OP_XMM put
  185. QPEL8_H_LOWPASS_OP_XMM avg
  186. %macro QPEL4_H_LOWPASS_L2_OP 1
  187. cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
  188. movsxdifnidn r3, r3d
  189. movsxdifnidn r4, r4d
  190. pxor m7, m7
  191. mova m4, [pw_5]
  192. mova m5, [pw_16]
  193. mov r5d, 4
  194. .loop:
  195. movh m1, [r1-1]
  196. movh m2, [r1+0]
  197. movh m3, [r1+1]
  198. movh m0, [r1+2]
  199. punpcklbw m1, m7
  200. punpcklbw m2, m7
  201. punpcklbw m3, m7
  202. punpcklbw m0, m7
  203. paddw m1, m0
  204. paddw m2, m3
  205. movh m0, [r1-2]
  206. movh m3, [r1+3]
  207. punpcklbw m0, m7
  208. punpcklbw m3, m7
  209. paddw m0, m3
  210. psllw m2, 2
  211. psubw m2, m1
  212. pmullw m2, m4
  213. paddw m0, m5
  214. paddw m0, m2
  215. movh m3, [r2]
  216. psraw m0, 5
  217. packuswb m0, m0
  218. pavgb m0, m3
  219. op_%1h m0, [r0], m6
  220. add r0, r3
  221. add r1, r3
  222. add r2, r4
  223. dec r5d
  224. jg .loop
  225. REP_RET
  226. %endmacro
  227. INIT_MMX mmxext
  228. QPEL4_H_LOWPASS_L2_OP put
  229. QPEL4_H_LOWPASS_L2_OP avg
  230. %macro QPEL8_H_LOWPASS_L2_OP 1
  231. cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
  232. movsxdifnidn r3, r3d
  233. movsxdifnidn r4, r4d
  234. mov r5d, 8
  235. pxor m7, m7
  236. mova m6, [pw_5]
  237. .loop:
  238. mova m0, [r1]
  239. mova m2, [r1+1]
  240. mova m1, m0
  241. mova m3, m2
  242. punpcklbw m0, m7
  243. punpckhbw m1, m7
  244. punpcklbw m2, m7
  245. punpckhbw m3, m7
  246. paddw m0, m2
  247. paddw m1, m3
  248. psllw m0, 2
  249. psllw m1, 2
  250. mova m2, [r1-1]
  251. mova m4, [r1+2]
  252. mova m3, m2
  253. mova m5, m4
  254. punpcklbw m2, m7
  255. punpckhbw m3, m7
  256. punpcklbw m4, m7
  257. punpckhbw m5, m7
  258. paddw m2, m4
  259. paddw m5, m3
  260. psubw m0, m2
  261. psubw m1, m5
  262. pmullw m0, m6
  263. pmullw m1, m6
  264. movd m2, [r1-2]
  265. movd m5, [r1+7]
  266. punpcklbw m2, m7
  267. punpcklbw m5, m7
  268. paddw m2, m3
  269. paddw m4, m5
  270. mova m5, [pw_16]
  271. paddw m2, m5
  272. paddw m4, m5
  273. paddw m0, m2
  274. paddw m1, m4
  275. psraw m0, 5
  276. psraw m1, 5
  277. mova m4, [r2]
  278. packuswb m0, m1
  279. pavgb m0, m4
  280. op_%1 m0, [r0], m4
  281. add r0, r3
  282. add r1, r3
  283. add r2, r4
  284. dec r5d
  285. jg .loop
  286. REP_RET
  287. %endmacro
  288. INIT_MMX mmxext
  289. QPEL8_H_LOWPASS_L2_OP put
  290. QPEL8_H_LOWPASS_L2_OP avg
  291. %macro QPEL8_H_LOWPASS_L2_OP_XMM 1
  292. cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
  293. movsxdifnidn r3, r3d
  294. movsxdifnidn r4, r4d
  295. mov r5d, 8
  296. pxor m7, m7
  297. mova m6, [pw_5]
  298. .loop:
  299. lddqu m1, [r1-2]
  300. mova m0, m1
  301. punpckhbw m1, m7
  302. punpcklbw m0, m7
  303. mova m2, m1
  304. mova m3, m1
  305. mova m4, m1
  306. mova m5, m1
  307. palignr m4, m0, 2
  308. palignr m3, m0, 4
  309. palignr m2, m0, 6
  310. palignr m1, m0, 8
  311. palignr m5, m0, 10
  312. paddw m0, m5
  313. paddw m2, m3
  314. paddw m1, m4
  315. psllw m2, 2
  316. movh m3, [r2]
  317. psubw m2, m1
  318. paddw m0, [pw_16]
  319. pmullw m2, m6
  320. paddw m2, m0
  321. psraw m2, 5
  322. packuswb m2, m2
  323. pavgb m2, m3
  324. op_%1h m2, [r0], m4
  325. add r1, r3
  326. add r0, r3
  327. add r2, r4
  328. dec r5d
  329. jg .loop
  330. REP_RET
  331. %endmacro
  332. INIT_XMM ssse3
  333. QPEL8_H_LOWPASS_L2_OP_XMM put
  334. QPEL8_H_LOWPASS_L2_OP_XMM avg
  335. ; All functions that call this are required to have function arguments of
  336. ; dst, src, dstStride, srcStride
  337. %macro FILT_V 1
  338. mova m6, m2
  339. movh m5, [r1]
  340. paddw m6, m3
  341. psllw m6, 2
  342. psubw m6, m1
  343. psubw m6, m4
  344. punpcklbw m5, m7
  345. pmullw m6, [pw_5]
  346. paddw m0, [pw_16]
  347. add r1, r3
  348. paddw m0, m5
  349. paddw m6, m0
  350. psraw m6, 5
  351. packuswb m6, m6
  352. op_%1h m6, [r0], m0 ; 1
  353. add r0, r2
  354. SWAP 0, 1, 2, 3, 4, 5
  355. %endmacro
  356. %macro QPEL4_V_LOWPASS_OP 1
  357. cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
  358. movsxdifnidn r2, r2d
  359. movsxdifnidn r3, r3d
  360. sub r1, r3
  361. sub r1, r3
  362. pxor m7, m7
  363. movh m0, [r1]
  364. movh m1, [r1+r3]
  365. lea r1, [r1+2*r3]
  366. movh m2, [r1]
  367. movh m3, [r1+r3]
  368. lea r1, [r1+2*r3]
  369. movh m4, [r1]
  370. add r1, r3
  371. punpcklbw m0, m7
  372. punpcklbw m1, m7
  373. punpcklbw m2, m7
  374. punpcklbw m3, m7
  375. punpcklbw m4, m7
  376. FILT_V %1
  377. FILT_V %1
  378. FILT_V %1
  379. FILT_V %1
  380. RET
  381. %endmacro
  382. INIT_MMX mmxext
  383. QPEL4_V_LOWPASS_OP put
  384. QPEL4_V_LOWPASS_OP avg
  385. %macro QPEL8OR16_V_LOWPASS_OP 1
  386. %if cpuflag(sse2)
  387. cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
  388. movsxdifnidn r2, r2d
  389. movsxdifnidn r3, r3d
  390. sub r1, r3
  391. sub r1, r3
  392. %else
  393. cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
  394. movsxdifnidn r2, r2d
  395. movsxdifnidn r3, r3d
  396. %endif
  397. pxor m7, m7
  398. movh m0, [r1]
  399. movh m1, [r1+r3]
  400. lea r1, [r1+2*r3]
  401. movh m2, [r1]
  402. movh m3, [r1+r3]
  403. lea r1, [r1+2*r3]
  404. movh m4, [r1]
  405. add r1, r3
  406. punpcklbw m0, m7
  407. punpcklbw m1, m7
  408. punpcklbw m2, m7
  409. punpcklbw m3, m7
  410. punpcklbw m4, m7
  411. FILT_V %1
  412. FILT_V %1
  413. FILT_V %1
  414. FILT_V %1
  415. FILT_V %1
  416. FILT_V %1
  417. FILT_V %1
  418. FILT_V %1
  419. cmp r4d, 16
  420. jne .end
  421. FILT_V %1
  422. FILT_V %1
  423. FILT_V %1
  424. FILT_V %1
  425. FILT_V %1
  426. FILT_V %1
  427. FILT_V %1
  428. FILT_V %1
  429. .end:
  430. REP_RET
  431. %endmacro
  432. INIT_MMX mmxext
  433. QPEL8OR16_V_LOWPASS_OP put
  434. QPEL8OR16_V_LOWPASS_OP avg
  435. INIT_XMM sse2
  436. QPEL8OR16_V_LOWPASS_OP put
  437. QPEL8OR16_V_LOWPASS_OP avg
  438. ; All functions that use this are required to have args:
  439. ; src, tmp, srcSize
  440. %macro FILT_HV 1 ; offset
  441. mova m6, m2
  442. movh m5, [r0]
  443. paddw m6, m3
  444. psllw m6, 2
  445. paddw m0, [pw_16]
  446. psubw m6, m1
  447. psubw m6, m4
  448. punpcklbw m5, m7
  449. pmullw m6, [pw_5]
  450. paddw m0, m5
  451. add r0, r2
  452. paddw m6, m0
  453. mova [r1+%1], m6
  454. SWAP 0, 1, 2, 3, 4, 5
  455. %endmacro
  456. %macro QPEL4_HV1_LOWPASS_OP 1
  457. cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
  458. movsxdifnidn r2, r2d
  459. pxor m7, m7
  460. movh m0, [r0]
  461. movh m1, [r0+r2]
  462. lea r0, [r0+2*r2]
  463. movh m2, [r0]
  464. movh m3, [r0+r2]
  465. lea r0, [r0+2*r2]
  466. movh m4, [r0]
  467. add r0, r2
  468. punpcklbw m0, m7
  469. punpcklbw m1, m7
  470. punpcklbw m2, m7
  471. punpcklbw m3, m7
  472. punpcklbw m4, m7
  473. FILT_HV 0*24
  474. FILT_HV 1*24
  475. FILT_HV 2*24
  476. FILT_HV 3*24
  477. RET
  478. cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
  479. movsxdifnidn r2, r2d
  480. mov r3d, 4
  481. .loop:
  482. mova m0, [r0]
  483. paddw m0, [r0+10]
  484. mova m1, [r0+2]
  485. paddw m1, [r0+8]
  486. mova m2, [r0+4]
  487. paddw m2, [r0+6]
  488. psubw m0, m1
  489. psraw m0, 2
  490. psubw m0, m1
  491. paddsw m0, m2
  492. psraw m0, 2
  493. paddw m0, m2
  494. psraw m0, 6
  495. packuswb m0, m0
  496. op_%1h m0, [r1], m7
  497. add r0, 24
  498. add r1, r2
  499. dec r3d
  500. jnz .loop
  501. REP_RET
  502. %endmacro
  503. INIT_MMX mmxext
  504. QPEL4_HV1_LOWPASS_OP put
  505. QPEL4_HV1_LOWPASS_OP avg
  506. %macro QPEL8OR16_HV1_LOWPASS_OP 1
  507. cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
  508. movsxdifnidn r2, r2d
  509. pxor m7, m7
  510. movh m0, [r0]
  511. movh m1, [r0+r2]
  512. lea r0, [r0+2*r2]
  513. movh m2, [r0]
  514. movh m3, [r0+r2]
  515. lea r0, [r0+2*r2]
  516. movh m4, [r0]
  517. add r0, r2
  518. punpcklbw m0, m7
  519. punpcklbw m1, m7
  520. punpcklbw m2, m7
  521. punpcklbw m3, m7
  522. punpcklbw m4, m7
  523. FILT_HV 0*48
  524. FILT_HV 1*48
  525. FILT_HV 2*48
  526. FILT_HV 3*48
  527. FILT_HV 4*48
  528. FILT_HV 5*48
  529. FILT_HV 6*48
  530. FILT_HV 7*48
  531. cmp r3d, 16
  532. jne .end
  533. FILT_HV 8*48
  534. FILT_HV 9*48
  535. FILT_HV 10*48
  536. FILT_HV 11*48
  537. FILT_HV 12*48
  538. FILT_HV 13*48
  539. FILT_HV 14*48
  540. FILT_HV 15*48
  541. .end:
  542. REP_RET
  543. %endmacro
  544. INIT_MMX mmxext
  545. QPEL8OR16_HV1_LOWPASS_OP put
  546. QPEL8OR16_HV1_LOWPASS_OP avg
  547. INIT_XMM sse2
  548. QPEL8OR16_HV1_LOWPASS_OP put
  549. %macro QPEL8OR16_HV2_LOWPASS_OP 1
  550. ; unused is to match ssse3 and mmxext args
  551. cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
  552. movsxdifnidn r2, r2d
  553. .loop:
  554. mova m0, [r1]
  555. mova m3, [r1+8]
  556. mova m1, [r1+2]
  557. mova m4, [r1+10]
  558. paddw m0, m4
  559. paddw m1, m3
  560. paddw m3, [r1+18]
  561. paddw m4, [r1+16]
  562. mova m2, [r1+4]
  563. mova m5, [r1+12]
  564. paddw m2, [r1+6]
  565. paddw m5, [r1+14]
  566. psubw m0, m1
  567. psubw m3, m4
  568. psraw m0, 2
  569. psraw m3, 2
  570. psubw m0, m1
  571. psubw m3, m4
  572. paddsw m0, m2
  573. paddsw m3, m5
  574. psraw m0, 2
  575. psraw m3, 2
  576. paddw m0, m2
  577. paddw m3, m5
  578. psraw m0, 6
  579. psraw m3, 6
  580. packuswb m0, m3
  581. op_%1 m0, [r0], m7
  582. add r1, 48
  583. add r0, r2
  584. dec r4d
  585. jne .loop
  586. REP_RET
  587. %endmacro
  588. INIT_MMX mmxext
  589. QPEL8OR16_HV2_LOWPASS_OP put
  590. QPEL8OR16_HV2_LOWPASS_OP avg
  591. %macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
  592. cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
  593. movsxdifnidn r2, r2d
  594. movsxdifnidn r3, r3d
  595. cmp r4d, 16
  596. je .op16
  597. .loop8:
  598. mova m1, [r1+16]
  599. mova m0, [r1]
  600. mova m2, m1
  601. mova m3, m1
  602. mova m4, m1
  603. mova m5, m1
  604. palignr m5, m0, 10
  605. palignr m4, m0, 8
  606. palignr m3, m0, 6
  607. palignr m2, m0, 4
  608. palignr m1, m0, 2
  609. paddw m0, m5
  610. paddw m1, m4
  611. paddw m2, m3
  612. psubw m0, m1
  613. psraw m0, 2
  614. psubw m0, m1
  615. paddw m0, m2
  616. psraw m0, 2
  617. paddw m0, m2
  618. psraw m0, 6
  619. packuswb m0, m0
  620. op_%1h m0, [r0], m7
  621. add r1, 48
  622. add r0, r2
  623. dec r4d
  624. jne .loop8
  625. jmp .done
  626. .op16:
  627. mova m4, [r1+32]
  628. mova m5, [r1+16]
  629. mova m7, [r1]
  630. mova m3, m4
  631. mova m2, m4
  632. mova m1, m4
  633. mova m0, m4
  634. palignr m0, m5, 10
  635. palignr m1, m5, 8
  636. palignr m2, m5, 6
  637. palignr m3, m5, 4
  638. palignr m4, m5, 2
  639. paddw m0, m5
  640. paddw m1, m4
  641. paddw m2, m3
  642. mova m6, m5
  643. mova m4, m5
  644. mova m3, m5
  645. palignr m4, m7, 8
  646. palignr m6, m7, 2
  647. palignr m3, m7, 10
  648. paddw m4, m6
  649. mova m6, m5
  650. palignr m5, m7, 6
  651. palignr m6, m7, 4
  652. paddw m3, m7
  653. paddw m5, m6
  654. psubw m0, m1
  655. psubw m3, m4
  656. psraw m0, 2
  657. psraw m3, 2
  658. psubw m0, m1
  659. psubw m3, m4
  660. paddw m0, m2
  661. paddw m3, m5
  662. psraw m0, 2
  663. psraw m3, 2
  664. paddw m0, m2
  665. paddw m3, m5
  666. psraw m0, 6
  667. psraw m3, 6
  668. packuswb m3, m0
  669. op_%1 m3, [r0], m7
  670. add r1, 48
  671. add r0, r2
  672. dec r4d
  673. jne .op16
  674. .done:
  675. REP_RET
  676. %endmacro
  677. INIT_XMM ssse3
  678. QPEL8OR16_HV2_LOWPASS_OP_XMM put
  679. QPEL8OR16_HV2_LOWPASS_OP_XMM avg
  680. %macro PIXELS4_L2_SHIFT5 1
  681. cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
  682. movsxdifnidn r3, r3d
  683. movsxdifnidn r4, r4d
  684. mova m0, [r1]
  685. mova m1, [r1+24]
  686. psraw m0, 5
  687. psraw m1, 5
  688. packuswb m0, m0
  689. packuswb m1, m1
  690. pavgb m0, [r2]
  691. pavgb m1, [r2+r4]
  692. op_%1h m0, [r0], m4
  693. op_%1h m1, [r0+r3], m5
  694. lea r2, [r2+r4*2]
  695. lea r0, [r0+r3*2]
  696. mova m0, [r1+48]
  697. mova m1, [r1+72]
  698. psraw m0, 5
  699. psraw m1, 5
  700. packuswb m0, m0
  701. packuswb m1, m1
  702. pavgb m0, [r2]
  703. pavgb m1, [r2+r4]
  704. op_%1h m0, [r0], m4
  705. op_%1h m1, [r0+r3], m5
  706. RET
  707. %endmacro
  708. INIT_MMX mmxext
  709. PIXELS4_L2_SHIFT5 put
  710. PIXELS4_L2_SHIFT5 avg
  711. %macro PIXELS8_L2_SHIFT5 1
  712. cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
  713. movsxdifnidn r3, r3d
  714. movsxdifnidn r4, r4d
  715. .loop:
  716. mova m0, [r1]
  717. mova m1, [r1+8]
  718. mova m2, [r1+48]
  719. mova m3, [r1+48+8]
  720. psraw m0, 5
  721. psraw m1, 5
  722. psraw m2, 5
  723. psraw m3, 5
  724. packuswb m0, m1
  725. packuswb m2, m3
  726. pavgb m0, [r2]
  727. pavgb m2, [r2+r4]
  728. op_%1 m0, [r0], m4
  729. op_%1 m2, [r0+r3], m5
  730. lea r2, [r2+2*r4]
  731. add r1, 48*2
  732. lea r0, [r0+2*r3]
  733. sub r5d, 2
  734. jne .loop
  735. REP_RET
  736. %endmacro
  737. INIT_MMX mmxext
  738. PIXELS8_L2_SHIFT5 put
  739. PIXELS8_L2_SHIFT5 avg
  740. %if ARCH_X86_64
  741. %macro QPEL16_H_LOWPASS_L2_OP 1
  742. cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
  743. movsxdifnidn r3, r3d
  744. movsxdifnidn r4, r4d
  745. mov r5d, 16
  746. pxor m15, m15
  747. mova m14, [pw_5]
  748. mova m13, [pw_16]
  749. .loop:
  750. lddqu m1, [r1+6]
  751. lddqu m7, [r1-2]
  752. mova m0, m1
  753. punpckhbw m1, m15
  754. punpcklbw m0, m15
  755. punpcklbw m7, m15
  756. mova m2, m1
  757. mova m6, m0
  758. mova m3, m1
  759. mova m8, m0
  760. mova m4, m1
  761. mova m9, m0
  762. mova m12, m0
  763. mova m11, m1
  764. palignr m11, m0, 10
  765. palignr m12, m7, 10
  766. palignr m4, m0, 2
  767. palignr m9, m7, 2
  768. palignr m3, m0, 4
  769. palignr m8, m7, 4
  770. palignr m2, m0, 6
  771. palignr m6, m7, 6
  772. paddw m11, m0
  773. palignr m1, m0, 8
  774. palignr m0, m7, 8
  775. paddw m7, m12
  776. paddw m2, m3
  777. paddw m6, m8
  778. paddw m1, m4
  779. paddw m0, m9
  780. psllw m2, 2
  781. psllw m6, 2
  782. psubw m2, m1
  783. psubw m6, m0
  784. paddw m11, m13
  785. paddw m7, m13
  786. pmullw m2, m14
  787. pmullw m6, m14
  788. lddqu m3, [r2]
  789. paddw m2, m11
  790. paddw m6, m7
  791. psraw m2, 5
  792. psraw m6, 5
  793. packuswb m6, m2
  794. pavgb m6, m3
  795. op_%1 m6, [r0], m11
  796. add r1, r3
  797. add r0, r3
  798. add r2, r4
  799. dec r5d
  800. jg .loop
  801. REP_RET
  802. %endmacro
  803. INIT_XMM ssse3
  804. QPEL16_H_LOWPASS_L2_OP put
  805. QPEL16_H_LOWPASS_L2_OP avg
  806. %endif