You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

921 lines
33KB

  1. /*
  2. * Copyright (c) 2011 Janne Grunau <janne-libav@jannau.net>
  3. * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  4. *
  5. * This file is part of FFmpeg.
  6. *
  7. * FFmpeg is free software; you can redistribute it and/or
  8. * modify it under the terms of the GNU Lesser General Public
  9. * License as published by the Free Software Foundation; either
  10. * version 2.1 of the License, or (at your option) any later version.
  11. *
  12. * FFmpeg is distributed in the hope that it will be useful,
  13. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15. * Lesser General Public License for more details.
  16. *
  17. * You should have received a copy of the GNU Lesser General Public
  18. * License along with FFmpeg; if not, write to the Free Software
  19. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20. */
  21. #include "libavutil/arm/asm.S"
  22. #include "neon.S"
  23. .macro qpel_lowpass r0, r1, rc1, rc2, shift
  24. vext.8 d25, \r0, \r1, #1 @ src[-1]
  25. vext.8 d26, \r0, \r1, #4 @ src[ 2]
  26. vext.8 d24, \r0, \r1, #5 @ src[ 3]
  27. vaddl.u8 q9, d25, d26
  28. vaddl.u8 q8, \r0, d24
  29. vext.8 d27, \r0, \r1, #2 @ src[ 0]
  30. vshl.s16 q12, q9, #2
  31. vsub.s16 q8, q8, q9
  32. vext.8 d28, \r0, \r1, #3 @ src[ 1]
  33. vsub.s16 q8, q8, q12
  34. vmlal.u8 q8, d27, \rc1
  35. vmlal.u8 q8, d28, \rc2
  36. vqrshrun.s16 \r0, q8, #\shift
  37. .endm
@ Same 6-tap filter as qpel_lowpass, applied to two rows at once with the
@ two filter pipelines interleaved for better issue scheduling.
@ In:  \r0:\r1     16 source bytes of row A     (src[-2] in \r0 lane 0)
@      \r2:\r3     16 source bytes of row B
@      \rc1,\rc2   centre-tap weights, \shift final rounding shift
@ Out: \r0 = filtered row A, \r2 = filtered row B
@      (\r1 and \r3 are consumed; \r1 is clobbered as a temporary)
@ Clobbers: q8-q14, d22, d26-d29
.macro  qpel_lowpass_x2 r0, r1, r2, r3, rc1, rc2, shift
        vext.8          d25, \r0, \r1, #1       @ row A: src[-1]
        vext.8          d26, \r0, \r1, #4       @ row A: src[ 2]
        vext.8          d24, \r0, \r1, #5       @ row A: src[ 3]
        vaddl.u8        q9,  d25, d26
        vaddl.u8        q8,  \r0, d24
        vext.8          d29, \r0, \r1, #2       @ row A: src[ 0]
        vext.8          d28, \r0, \r1, #3       @ row A: src[ 1]
        vshl.s16        q10, q9,  #2
        vext.8          \r1, \r2, \r3, #1       @ row B: src[-1]
        vsub.s16        q8,  q8,  q9
        vext.8          d22, \r2, \r3, #4       @ row B: src[ 2]
        vext.8          \r0, \r2, \r3, #5       @ row B: src[ 3]
        vaddl.u8        q13, \r1, d22
        vaddl.u8        q12, \r2, \r0
        vsub.s16        q8,  q8,  q10           @ row A: - 5*(src[-1]+src[2])
        vshl.s16        q9,  q13, #2
        vsub.s16        q12, q12, q13
        vmlal.u8        q8,  d29, \rc1          @ row A: + rc1 * src[0]
        vmlal.u8        q8,  d28, \rc2          @ row A: + rc2 * src[1]
        vsub.s16        q12, q12, q9            @ row B: - 5*(src[-1]+src[2])
        vext.8          d26, \r2, \r3, #2       @ row B: src[ 0]
        vext.8          d27, \r2, \r3, #3       @ row B: src[ 1]
        vmlal.u8        q12, d26, \rc1          @ row B: + rc1 * src[0]
        vmlal.u8        q12, d27, \rc2          @ row B: + rc2 * src[1]
        vqrshrun.s16    \r0, q8,  #\shift       @ row A result
        vqrshrun.s16    \r2, q12, #\shift       @ row B result
.endm
@ Emits put_rv40_qpel8_h_lp_packed_s\shift\()_neon:
@ first (horizontal) pass of a 2-D qpel filter.  Filters r3+1 rows of
@ 8 pixels (r3 even: 2 rows per loop iteration plus one trailing row)
@ from src r1 / stride r2 and stores them back-to-back as packed 8-byte
@ rows into the scratch buffer at r12 (8-byte aligned, advanced).
@ d0/d1 hold the centre-tap coefficients; \shift is the rounding shift.
.macro  rv40_qpel8_h shift
function put_rv40_qpel8_h_lp_packed_s\shift\()_neon
1:
        vld1.8          {q2}, [r1], r2
        vld1.8          {q3}, [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0, d1, \shift
        vst1.8          {d4}, [r12,:64]!
        vst1.8          {d6}, [r12,:64]!
        subs            r3,  r3,  #2
        bgt             1b
        vld1.8          {q2}, [r1]              @ final odd row
        qpel_lowpass    d4,  d5,  d0, d1, \shift
        vst1.8          {d4}, [r12,:64]!
        bx              lr
endfunc
.endm
@ Emits \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon:
@ second (vertical) pass of a 2-D qpel filter.  Reads 13 packed 8-byte
@ rows from the scratch buffer at r1 (advanced), transposes 8x8 tiles so
@ the vertical filter can reuse the horizontal lowpass macro, transposes
@ back, and writes 8 output rows to r0 / stride r2.
@ For \type == avg the result is rounding-averaged with the existing dst.
@ d15/d30/d31 in the second transpose are don't-care columns (only 13 of
@ 16 input rows exist; the filter never reads the last 3 lanes).
.macro  rv40_qpel8_v shift, type
function \type\()_rv40_qpel8_v_lp_packed_s\shift\()_neon
        vld1.64         {d2},  [r1,:64]!
        vld1.64         {d3},  [r1,:64]!
        vld1.64         {d4},  [r1,:64]!
        vld1.64         {d5},  [r1,:64]!
        vld1.64         {d6},  [r1,:64]!
        vld1.64         {d7},  [r1,:64]!
        vld1.64         {d8},  [r1,:64]!
        vld1.64         {d9},  [r1,:64]!
        vld1.64         {d10}, [r1,:64]!
        vld1.64         {d11}, [r1,:64]!
        vld1.64         {d12}, [r1,:64]!
        vld1.64         {d13}, [r1,:64]!
        vld1.64         {d14}, [r1,:64]!        @ 13th (last) input row
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0, d1, \shift
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0, d1, \shift
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0, d1, \shift
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0, d1, \shift
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12, [r0,:64], r2       @ load existing dst rows
        vld1.64         d13, [r0,:64], r2
        vld1.64         d14, [r0,:64], r2
        vld1.64         d15, [r0,:64], r2
        vld1.64         d16, [r0,:64], r2
        vld1.64         d17, [r0,:64], r2
        vld1.64         d18, [r0,:64], r2
        vld1.64         d19, [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
        vrhadd.u8       q1,  q1,  q6            @ avg: (new + old + 1) >> 1
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,  [r0,:64], r2
        vst1.64         d3,  [r0,:64], r2
        vst1.64         d4,  [r0,:64], r2
        vst1.64         d5,  [r0,:64], r2
        vst1.64         d6,  [r0,:64], r2
        vst1.64         d7,  [r0,:64], r2
        vst1.64         d8,  [r0,:64], r2
        vst1.64         d9,  [r0,:64], r2
        bx              lr
endfunc
.endm
@ Instantiate the shared horizontal first-pass helpers for both rounding
@ shifts (5 = half-pel taps, 6 = quarter-pel taps).
rv40_qpel8_h    5
rv40_qpel8_h    6
@ Generates the complete set of RV40 quarter-pel motion-compensation
@ entry points for one output mode; \type is "put" or "avg".
@ Subpel coefficient selection (per axis, fraction x or y):
@   1/4 -> taps (52,20), shift 6;  1/2 -> (20,20), shift 5;
@   3/4 -> taps (20,52), shift 6.
@ d0/d1 always carry the two centre-tap coefficients for the next pass.
@ Two-pass (mcXY with both X,Y != 0) functions run the horizontal pass
@ into an 8-byte-aligned scratch buffer on the stack (r12), then the
@ packed vertical pass reads it back via r1.
.macro  rv40_qpel type
@ Horizontal-only lowpass: filter r3 rows of 8 pixels, src r1, dst r0,
@ both with stride r2.  For avg, r12 walks dst for the read-before-write.
function \type\()_rv40_qpel8_h_lowpass_neon
  .ifc \type,avg
        mov             r12, r0
  .endif
1:
        vld1.8          {q2}, [r1], r2
        vld1.8          {q3}, [r1], r2
        qpel_lowpass_x2 d4,  d5,  d6,  d7,  d0, d1, 6
  .ifc \type,avg
        vld1.8          {d3},  [r12,:64], r2
        vld1.8          {d16}, [r12,:64], r2
        vrhadd.u8       d4,  d4,  d3
        vrhadd.u8       d6,  d6,  d16
  .endif
        vst1.8          {d4}, [r0,:64], r2
        vst1.8          {d6}, [r0,:64], r2
        subs            r3,  r3,  #2
        bgt             1b
        bx              lr
endfunc
@ Vertical-only lowpass on an 8x8 block: loads 13 source rows from
@ r1/stride r2, transposes so the horizontal filter macro runs along
@ columns, transposes back, stores (or averages) 8 rows at r0/stride r2.
function \type\()_rv40_qpel8_v_lowpass_neon
        vld1.64         {d2},  [r1], r2
        vld1.64         {d3},  [r1], r2
        vld1.64         {d4},  [r1], r2
        vld1.64         {d5},  [r1], r2
        vld1.64         {d6},  [r1], r2
        vld1.64         {d7},  [r1], r2
        vld1.64         {d8},  [r1], r2
        vld1.64         {d9},  [r1], r2
        vld1.64         {d10}, [r1], r2
        vld1.64         {d11}, [r1], r2
        vld1.64         {d12}, [r1], r2
        vld1.64         {d13}, [r1], r2
        vld1.64         {d14}, [r1]             @ 13th (last) source row
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
        transpose_8x8   d10, d11, d12, d13, d14, d15, d30, d31
        qpel_lowpass_x2 d2,  d10, d3,  d11, d0, d1, 6
        qpel_lowpass_x2 d4,  d12, d5,  d13, d0, d1, 6
        qpel_lowpass_x2 d6,  d14, d7,  d15, d0, d1, 6
        qpel_lowpass_x2 d8,  d30, d9,  d31, d0, d1, 6
        transpose_8x8   d2,  d3,  d4,  d5,  d6,  d7,  d8,  d9
  .ifc \type,avg
        vld1.64         d12, [r0,:64], r2       @ existing dst for averaging
        vld1.64         d13, [r0,:64], r2
        vld1.64         d14, [r0,:64], r2
        vld1.64         d15, [r0,:64], r2
        vld1.64         d16, [r0,:64], r2
        vld1.64         d17, [r0,:64], r2
        vld1.64         d18, [r0,:64], r2
        vld1.64         d19, [r0,:64], r2
        sub             r0,  r0,  r2,  lsl #3   @ rewind dst 8 rows
        vrhadd.u8       q1,  q1,  q6
        vrhadd.u8       q2,  q2,  q7
        vrhadd.u8       q3,  q3,  q8
        vrhadd.u8       q4,  q4,  q9
  .endif
        vst1.64         d2,  [r0,:64], r2
        vst1.64         d3,  [r0,:64], r2
        vst1.64         d4,  [r0,:64], r2
        vst1.64         d5,  [r0,:64], r2
        vst1.64         d6,  [r0,:64], r2
        vst1.64         d7,  [r0,:64], r2
        vst1.64         d8,  [r0,:64], r2
        vst1.64         d9,  [r0,:64], r2
        bx              lr
endfunc
@ Packed vertical second-pass helpers for this \type.
rv40_qpel8_v    5, \type
rv40_qpel8_v    6, \type
@ (1,0): horizontal 1/4-pel only.
function ff_\type\()_rv40_qpel8_mc10_neon, export=1
        sub             r1,  r1,  #2            @ back up to src[-2]
        mov             r3,  #8
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc
@ (3,0): horizontal 3/4-pel only.
function ff_\type\()_rv40_qpel8_mc30_neon, export=1
        sub             r1,  r1,  #2
        mov             r3,  #8
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc
@ (0,1): vertical 1/4-pel only.
function ff_\type\()_rv40_qpel8_mc01_neon, export=1
        push            {r4, lr}                @ r4 only for stack alignment
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1   @ back up 2 source rows
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (1,1): h 1/4 then v 1/4, both (52,20) shift 6.
function ff_\type\()_rv40_qpel8_mc11_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8         @ scratch: 13 packed rows + pad
        add             r12, sp,  #7
        bic             r12, r12, #7            @ 8-byte align scratch ptr
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12                @ 13 rows: 8 output + 5 margin
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7            @ r1 = start of scratch
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (2,1): h 1/2 (20,20 s5) then v 1/4 (52,20 s6).
function ff_\type\()_rv40_qpel8_mc21_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #52                @ switch taps to (52,20)
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (3,1): h 3/4 (20,52 s6) then v 1/4 (52,20 s6).
function ff_\type\()_rv40_qpel8_mc31_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1                 @ (20,52) -> (52,20)
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (1,2): h 1/4 (52,20 s6) then v 1/2 (20,20 s5).
function ff_\type\()_rv40_qpel8_mc12_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d0,  #20                @ switch taps to (20,20)
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (2,2): h 1/2 then v 1/2, both (20,20) shift 5.
function ff_\type\()_rv40_qpel8_mc22_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (3,2): h 3/4 (20,52 s6) then v 1/2 (20,20 s5).
function ff_\type\()_rv40_qpel8_mc32_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #20                @ switch taps to (20,20)
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (0,3): vertical 3/4-pel only.
function ff_\type\()_rv40_qpel8_mc03_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             r1,  r1,  r2,  lsl #1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (3,3): RV40 uses a plain 2x2 bilinear average here.
function ff_\type\()_rv40_qpel8_mc33_neon, export=1
        mov             r3,  #8
        b               X(ff_\type\()_pixels8_xy2_neon)
endfunc
@ (1,3): h 1/4 (52,20 s6) then v 3/4 (20,52 s6).
function ff_\type\()_rv40_qpel8_mc13_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vswp            d0,  d1                 @ (52,20) -> (20,52)
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ (2,3): h 1/2 (20,20 s5) then v 3/4 (20,52 s6).
function ff_\type\()_rv40_qpel8_mc23_neon, export=1
        push            {r4, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #14*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        mov             r3,  #12
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        vmov.i8         d1,  #52                @ switch taps to (20,52)
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #14*8
        vpop            {d8-d15}
        pop             {r4, pc}
endfunc
@ 16x16 variants: run the 8-wide code on left/right (and top/bottom)
@ halves.  The saved r1 (original src) is reloaded from the stack to
@ address the right-hand half.
@ (1,0) 16-wide: two 16-row horizontal passes, 8 pixels apart.
function ff_\type\()_rv40_qpel16_mc10_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_h:
        push            {r1, lr}
        sub             r1,  r1,  #2
        mov             r3,  #16
        bl              \type\()_rv40_qpel8_h_lowpass_neon
        pop             {r1, lr}
        sub             r0,  r0,  r2,  lsl #4   @ rewind dst 16 rows
        add             r0,  r0,  #8            @ right half of dst
        add             r1,  r1,  #6            @ src + 8 - 2
        mov             r3,  #16
        b               \type\()_rv40_qpel8_h_lowpass_neon
endfunc
@ (3,0) 16-wide.
function ff_\type\()_rv40_qpel16_mc30_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_h
endfunc
@ (0,1) 16-wide: four 8x8 vertical passes (two per column half).
function ff_\type\()_rv40_qpel16_mc01_neon, export=1
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
.L\type\()_rv40_qpel16_v:
        sub             r1,  r1,  r2,  lsl #1
        push            {r1, lr}
        vpush           {d8-d15}
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2   @ re-overlap 4 rows (13 read, 8 used)
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        ldr             r1,  [sp, #64]          @ saved r1 (above d8-d15)
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8            @ right half
        add             r1,  r1,  #8
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        sub             r1,  r1,  r2,  lsl #2
        bl              \type\()_rv40_qpel8_v_lowpass_neon
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc
@ (1,1) 16-wide: two 21-row horizontal passes into scratch, then four
@ packed vertical passes.
function ff_\type\()_rv40_qpel16_mc11_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8         @ scratch: 2 x 21 packed rows + pad
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20                @ 21 rows: 16 output + 5 margin
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]         @ saved r1 (44*8 + 64 bytes up)
        add             r1,  r1,  #8            @ right source half
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
.L\type\()_rv40_qpel16_v_s6:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7            @ r1 = start of scratch
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40           @ back 5 rows: next block overlaps
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8            @ right half of dst
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s6_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc
@ (2,1) 16-wide: h 1/2 passes, then shared s6 vertical tail with (52,20).
function ff_\type\()_rv40_qpel16_mc21_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d0,  #52                @ vertical taps (52,20)
        b               .L\type\()_rv40_qpel16_v_s6
endfunc
@ (3,1) 16-wide.
function ff_\type\()_rv40_qpel16_mc31_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1                 @ (20,52) -> (52,20) for vertical
        b               .L\type\()_rv40_qpel16_v_s6
endfunc
@ (1,2) 16-wide: h 1/4, then shared s5 vertical tail with (20,20).
function ff_\type\()_rv40_qpel16_mc12_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d0,  #20                @ vertical taps (20,20)
.L\type\()_rv40_qpel16_v_s5:
        add             r1,  sp,  #7
        bic             r1,  r1,  #7
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r0,  r0,  r2,  lsl #4
        add             r0,  r0,  #8
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        sub             r1,  r1,  #40
        bl              \type\()_rv40_qpel8_v_lp_packed_s5_neon
        add             sp,  sp,  #44*8
        vpop            {d8-d15}
        pop             {r1, pc}
endfunc
@ (2,2) 16-wide: both passes 1/2-pel.
function ff_\type\()_rv40_qpel16_mc22_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        b               .L\type\()_rv40_qpel16_v_s5
endfunc
@ (3,2) 16-wide.
function ff_\type\()_rv40_qpel16_mc32_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vmov.i8         d1,  #20                @ vertical taps (20,20)
        b               .L\type\()_rv40_qpel16_v_s5
endfunc
@ (0,3) 16-wide: vertical 3/4 via the shared vertical tail.
function ff_\type\()_rv40_qpel16_mc03_neon, export=1
        vmov.i8         d0,  #20
        vmov.i8         d1,  #52
        b               .L\type\()_rv40_qpel16_v
endfunc
@ (1,3) 16-wide.
function ff_\type\()_rv40_qpel16_mc13_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #52
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s6_neon
        vswp            d0,  d1                 @ (52,20) -> (20,52) for vertical
        b               .L\type\()_rv40_qpel16_v_s6
endfunc
@ (2,3) 16-wide.
function ff_\type\()_rv40_qpel16_mc23_neon, export=1
        sub             r1,  r1,  r2,  lsl #1
        sub             r1,  r1,  #2
        push            {r1, lr}
        vpush           {d8-d15}
        sub             sp,  sp,  #44*8
        add             r12, sp,  #7
        bic             r12, r12, #7
        mov             r3,  #20
        vmov.i8         d0,  #20
        vmov.i8         d1,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        ldr             r1,  [sp, #416]
        add             r1,  r1,  #8
        mov             r3,  #20
        bl              put_rv40_qpel8_h_lp_packed_s5_neon
        vmov.i8         d1,  #52                @ vertical taps (20,52)
        b               .L\type\()_rv40_qpel16_v_s6
endfunc
@ (3,3) 16-wide: plain 2x2 bilinear average.
function ff_\type\()_rv40_qpel16_mc33_neon, export=1
        mov             r3,  #16
        b               X(ff_\type\()_pixels16_xy2_neon)
endfunc
.endm
@ Emit the full qpel function set for both output modes.
rv40_qpel       put
rv40_qpel       avg
@ RV40 B-frame weighted prediction on 16 pixels.
@ In:  q1 (d2,d3) = src1 pixels, q2 (d4,d5) = src2 pixels
@      d0[0] = w1, d0[2] = w2 (u16 lanes; set up by the callers below)
@ Out: q1 = ((w2*src1 >> 9) + (w1*src2 >> 9) + 16) >> 5, clipped to u8
@      (the two products are pre-shifted by 9, summed, then rounding-
@       shifted by 5; vrshrn adds the +16 rounding term)
@ Clobbers: q2-q3, q8-q15
.macro  rv40_weight
        vmovl.u8        q8,  d2                 @ widen src1 to u16
        vmovl.u8        q9,  d3
        vmovl.u8        q10, d4                 @ widen src2 to u16
        vmovl.u8        q11, d5
        vmull.u16       q2,  d16, d0[2]         @ src1 * w2
        vmull.u16       q3,  d17, d0[2]
        vmull.u16       q8,  d18, d0[2]
        vmull.u16       q9,  d19, d0[2]
        vmull.u16       q12, d20, d0[0]         @ src2 * w1
        vmull.u16       q13, d21, d0[0]
        vmull.u16       q14, d22, d0[0]
        vmull.u16       q15, d23, d0[0]
        vshrn.i32       d4,  q2,  #9            @ (src1*w2) >> 9
        vshrn.i32       d5,  q3,  #9
        vshrn.i32       d6,  q8,  #9
        vshrn.i32       d7,  q9,  #9
        vshrn.i32       d16, q12, #9            @ (src2*w1) >> 9
        vshrn.i32       d17, q13, #9
        vshrn.i32       d18, q14, #9
        vshrn.i32       d19, q15, #9
        vadd.u16        q2,  q2,  q8            @ sum the two halves
        vadd.u16        q3,  q3,  q9
        vrshrn.i16      d2,  q2,  #5            @ (+16) >> 5, narrow to u8
        vrshrn.i16      d3,  q3,  #5
.endm
  651. /* void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  652. int w1, int w2, int stride) */
@ void ff_rv40_weight_func_16_neon(uint8_t *dst, uint8_t *src1,
@                                  uint8_t *src2, int w1, int w2,
@                                  int stride)
@ r0 = dst, r1 = src1, r2 = src2, r3 = w1, [sp] = w2, [sp+4] = stride.
@ Weights 16 rows of 16 pixels; r3 is reused as the row counter after
@ its value is captured into d0.
function ff_rv40_weight_func_16_neon, export=1
        ldr             r12, [sp]               @ w2
        vmov            d0,  r3,  r12           @ d0 = {w1, w2}
        ldr             r12, [sp, #4]           @ stride
        mov             r3,  #16                @ 16 rows
1:
        vld1.8          {q1}, [r1,:128], r12
        vld1.8          {q2}, [r2,:128], r12
        rv40_weight
        vst1.8          {q1}, [r0,:128], r12
        subs            r3,  r3,  #1
        bne             1b
        bx              lr
endfunc
  667. /* void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1, uint8_t *src2,
  668. int w1, int w2, int stride) */
@ void ff_rv40_weight_func_8_neon(uint8_t *dst, uint8_t *src1,
@                                 uint8_t *src2, int w1, int w2,
@                                 int stride)
@ Same as the 16-wide version but processes two 8-pixel rows per
@ iteration so rv40_weight still operates on a full 16 pixels.
function ff_rv40_weight_func_8_neon, export=1
        ldr             r12, [sp]               @ w2
        vmov            d0,  r3,  r12           @ d0 = {w1, w2}
        ldr             r12, [sp, #4]           @ stride
        mov             r3,  #8                 @ 8 rows, 2 per iteration
1:
        vld1.8          {d2}, [r1,:64], r12
        vld1.8          {d3}, [r1,:64], r12
        vld1.8          {d4}, [r2,:64], r12
        vld1.8          {d5}, [r2,:64], r12
        rv40_weight
        vst1.8          {d2}, [r0,:64], r12
        vst1.8          {d3}, [r0,:64], r12
        subs            r3,  r3,  #2
        bne             1b
        bx              lr
endfunc
@ RV40 horizontal-edge loop-filter strength decision.
@ r0 = src (at the edge row), r1 = stride, r2 = beta, r3 = beta2,
@ [sp] = edge flag, [sp+4] = *p1, [sp+8] = *q1 (result pointers).
@ Writes the per-side strong-filter decisions through p1/q1 and returns
@ (in r0) whether any filtering is needed — early path: if the two rows
@ adjacent to the edge are bit-identical, both results are 0.
function ff_rv40_h_loop_filter_strength_neon, export=1
        pkhbt           r2,  r3,  r2,  lsl #18  @ r2 = (beta<<2):16 | beta2
        ldr             r3,  [r0]
        ldr_dpre        r12, r0,  r1            @ r12 = row above edge
        teq             r3,  r12
        beq             1f                      @ identical rows: nothing to do
        sub             r0,  r0,  r1,  lsl #1   @ rewind to row -3
        vld1.32         {d4[]},  [r0,:32], r1   @ -3
        vld1.32         {d0[]},  [r0,:32], r1   @ -2
        vld1.32         {d4[1]}, [r0,:32], r1   @ -1
        vld1.32         {d5[]},  [r0,:32], r1   @  0
        vld1.32         {d1[]},  [r0,:32], r1   @  1
        vld1.32         {d5[0]}, [r0,:32], r1   @  2
        vpaddl.u8       q8,  q0                 @ -2, -2, -2, -2,  1,  1,  1,  1
        vpaddl.u8       q9,  q2                 @ -3, -3, -1, -1,  2,  2,  0,  0
        vdup.32         d30, r2                 @ beta2, beta << 2
        vpadd.u16       d16, d16, d17           @ -2, -2,  1,  1
        vpadd.u16       d18, d18, d19           @ -3, -1,  2,  0
        vabd.u16        d16, d18, d16           @ row-sum differences
        vclt.u16        d16, d16, d30           @ compare against thresholds
        ldrd            r2,  r3,  [sp, #4]      @ p1, q1 result pointers
        vmovl.u16       q12, d16
        vtrn.16         d16, d17
        vshr.u32        q12, q12, #15           @ masks -> 0/1 flags
        ldr             r0,  [sp]               @ edge flag
        vst1.32         {d24[1]}, [r2,:32]      @ *p1
        vst1.32         {d25[1]}, [r3,:32]      @ *q1
        cmp             r0,  #0
        it              eq
        bxeq            lr                      @ non-edge: flags only
        vand            d18, d16, d17           @ combine all conditions
        vtrn.32         d18, d19
        vand            d18, d18, d19
        vmov.u16        r0,  d18[0]             @ return combined decision
        bx              lr
1:
        ldrd            r2,  r3,  [sp, #4]      @ identical rows: report 0/0
        mov             r0,  #0
        str             r0,  [r2]
        str             r0,  [r3]
        bx              lr
endfunc
@ RV40 vertical-edge loop-filter strength decision; same contract as the
@ horizontal variant but the six relevant pixel columns are gathered by
@ loading four 8-byte rows and summing them column-wise.
@ r0 = src, r1 = stride, r2 = beta, r3 = beta2,
@ [sp] = edge flag, [sp+4] = *p1, [sp+8] = *q1.
function ff_rv40_v_loop_filter_strength_neon, export=1
        sub             r0,  r0,  #3            @ start at column -3
        pkhbt           r2,  r3,  r2,  lsl #18  @ r2 = (beta<<2):16 | beta2
        vld1.8          {d0}, [r0], r1
        vld1.8          {d1}, [r0], r1
        vld1.8          {d2}, [r0], r1
        vld1.8          {d3}, [r0], r1
        vaddl.u8        q0,  d0,  d1            @ column sums over 4 rows
        vaddl.u8        q1,  d2,  d3
        vdup.32         q15, r2
        vadd.u16        q0,  q0,  q1            @ -3, -2, -1, 0, 1, 2
        vext.16         q1,  q0,  q0,  #1       @ -2, -1,  0, 1, 2
        vabd.u16        q0,  q1,  q0            @ neighbour-column differences
        vclt.u16        q0,  q0,  q15           @ compare against thresholds
        ldrd            r2,  r3,  [sp, #4]      @ p1, q1 result pointers
        vmovl.u16       q1,  d0
        vext.16         d1,  d0,  d1,  #3
        vshr.u32        q1,  q1,  #15           @ masks -> 0/1 flags
        ldr             r0,  [sp]               @ edge flag
        vst1.32         {d2[1]}, [r2,:32]       @ *p1
        vst1.32         {d3[1]}, [r3,:32]       @ *q1
        cmp             r0,  #0
        it              eq
        bxeq            lr                      @ non-edge: flags only
        vand            d0,  d0,  d1            @ combine all conditions
        vtrn.16         d0,  d1
        vand            d0,  d0,  d1
        vmov.u16        r0,  d0[0]              @ return combined decision
        bx              lr
endfunc
@ RV40 weak in-loop deblocking filter core, shared by the horizontal and
@ vertical entry points below.  Operates on 4 pixel positions across the
@ edge, pre-arranged by the caller as:
@   d4 = {-3, -1}, d5 = {0, 2} lanes  (edge-adjacent and outer pixels)
@   d0 = {-2}, d1 = {1}
@ Scalar parameters: r2 = filter_p1, r3 = filter_q1,
@   [sp] = alpha, [sp+4] = beta, [sp+8] = lim_p0q0,
@   [sp+12] = lim_q1, [sp+16] = lim_p1.
@ Result: filtered pixels in d4 ({-1, 0}) and d5 ({-2, 1}).
@ Clobbers nearly all NEON registers; callers preserve d8-d15 as needed.
.macro  rv40_weak_loop_filter
        vdup.16         d30, r2                 @ filter_p1
        vdup.16         d31, r3                 @ filter_q1
        ldrd            r2,  r3,  [sp]
        vdup.16         d28, r2                 @ alpha
        vdup.16         d29, r3                 @ beta
        ldr             r12, [sp, #8]
        vdup.16         d25, r12                @ lim_p0q0
        ldrd            r2,  r3,  [sp, #12]     @ r2 = lim_q1, r3 = lim_p1
        vsubl.u8        q9,  d5,  d4            @ x, t        (src[0]-src[-1])
        vabdl.u8        q8,  d5,  d4            @ x, abs(t)
        vneg.s16        q15, q15
        vceq.i16        d16, d19, #0            @ !t
        vshl.s16        d19, d19, #2            @ t << 2
        vmul.u16        d18, d17, d28           @ alpha * abs(t)
        vand            d24, d30, d31           @ filter_p1 & filter_q1
        vsubl.u8        q1,  d0,  d4            @ p1p2, p1p0
        vsubl.u8        q3,  d1,  d5            @ q1q2, q1q0
        vmov.i16        d22, #3
        vshr.u16        d18, d18, #7
        vadd.i16        d22, d22, d24           @ 3 - (filter_p1 & filter_q1)
        vsubl.u8        q10, d0,  d1            @ src[-2] - src[1]
        vcle.u16        d18, d18, d22           @ alpha*abs(t)>>7 <= u
        vand            d20, d20, d24
        vneg.s16        d23, d25                @ -lim_p0q0
        vadd.s16        d19, d19, d20
        vbic            d16, d18, d16           @ t && u <= 3 - (fp1 & fq1)
        vtrn.32         d4,  d5                 @ -3, 2, -1, 0
        vrshr.s16       d19, d19, #3
        vmov            d28, d29                @ beta
        vswp            d3,  d6                 @ q1q2, p1p0
        vmin.s16        d19, d19, d25           @ clamp diff to +/- lim_p0q0
        vand            d30, d30, d16
        vand            d31, d31, d16
        vadd.s16        q10, q1,  q3            @ p1p2 + p1p0, q1q2 + q1q0
        vmax.s16        d19, d19, d23           @ diff
        vabs.s16        q1,  q1                 @ abs(p1p2), abs(q1q2)
        vand            d18, d19, d16           @ diff (gated by filter mask)
        vcle.u16        q1,  q1,  q14           @ |p1p2|,|q1q2| <= beta
        vneg.s16        d19, d18                @ -diff
        vdup.16         d26, r3                 @ lim_p1
        vaddw.u8        q2,  q9,  d5            @ src[-1]+diff, src[0]-diff
        vhsub.s16       q11, q10, q9
        vand            q1,  q1,  q15
        vqmovun.s16     d4,  q2                 @ -1, 0
        vand            q9,  q11, q1
        vdup.16         d27, r2                 @ lim_q1
        vneg.s16        q9,  q9
        vneg.s16        q14, q13
        vmin.s16        q9,  q9,  q13           @ clamp p1/q1 adjustment
        vtrn.32         d0,  d1                 @ -2, 1, -2, 1
        vmax.s16        q9,  q9,  q14
        vaddw.u8        q3,  q9,  d0
        vqmovun.s16     d5,  q3                 @ -2, 1
.endm
@ RV40 weak loop filter across a horizontal edge (4 pixels wide).
@ r0 = src at the edge row, r1 = stride; remaining parameters are read
@ from the stack by rv40_weak_loop_filter.  Loads rows -3..2 into the
@ lane layout the filter core expects, then writes rows -2..1 back.
function ff_rv40_h_weak_loop_filter_neon, export=1
        sub             r0,  r0,  r1,  lsl #1
        sub             r0,  r0,  r1            @ r0 -> row -3
        vld1.32         {d4[]},  [r0,:32], r1   @ -3
        vld1.32         {d0[]},  [r0,:32], r1   @ -2
        vld1.32         {d4[1]}, [r0,:32], r1   @ -1
        vld1.32         {d5[]},  [r0,:32], r1   @  0
        vld1.32         {d1[]},  [r0,:32], r1   @  1
        vld1.32         {d5[0]}, [r0,:32]       @  2
        sub             r0,  r0,  r1,  lsl #2   @ rewind to row -2
        rv40_weak_loop_filter
        vst1.32         {d5[0]}, [r0,:32], r1   @ -2
        vst1.32         {d4[0]}, [r0,:32], r1   @ -1
        vst1.32         {d4[1]}, [r0,:32], r1   @  0
        vst1.32         {d5[1]}, [r0,:32], r1   @  1
        bx              lr
endfunc
@ RV40 weak loop filter across a vertical edge (4 rows tall).
@ r0 = src at the edge column, r1 = stride.  Loads 4 rows starting at
@ column -3, transposes them into the lane layout rv40_weak_loop_filter
@ expects, filters, then scatters columns -2..1 back with vst4.
function ff_rv40_v_weak_loop_filter_neon, export=1
        sub             r12, r0,  #3            @ load pointer: column -3
        sub             r0,  r0,  #2            @ store pointer: column -2
        vld1.8          {d4}, [r12], r1
        vld1.8          {d5}, [r12], r1
        vld1.8          {d2}, [r12], r1
        vld1.8          {d3}, [r12], r1
        vtrn.16         q2,  q1                 @ transpose rows -> columns
        vtrn.8          d4,  d5
        vtrn.8          d2,  d3
        vrev64.32       d5,  d5
        vtrn.32         q2,  q1
        vdup.32         d0,  d3[0]
        vdup.32         d1,  d2[0]
        rv40_weak_loop_filter
        vtrn.32         q2,  q3                 @ transpose back for store
        vswp            d4,  d5
        vst4.8          {d4[0],d5[0],d6[0],d7[0]}, [r0], r1
        vst4.8          {d4[1],d5[1],d6[1],d7[1]}, [r0], r1
        vst4.8          {d4[2],d5[2],d6[2],d7[2]}, [r0], r1
        vst4.8          {d4[3],d5[3],d6[3],d7[3]}, [r0], r1
        bx              lr
endfunc