/*
 * Copyright (c) 2015 Matthieu Bouron <matthieu.bouron stupeflix.com>
 * Copyright (c) 2015 Clément Bœsch <clement stupeflix.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/arm/asm.S"
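
/*
 * Fixed-point YUV -> RGB conversion, as implemented by the macros below:
 *
 *     R = clip8((C_Y + v2r * V'            + round) >> shift)
 *     G = clip8((C_Y + u2g * U' + v2g * V' + round) >> shift)
 *     B = clip8((C_Y + u2b * U'            + round) >> shift)
 *
 * where C_Y = (Y - y_offset) * y_coeff, U' = U - 128, V' = V - 128 and
 * round = 1 << (shift - 1). The caller's coefficient table is loaded into
 * d1 as {v2r, u2g, v2g, u2b}, y_coeff is broadcast into d0 and y_offset
 * is kept in r9. The _16 variants keep every product in 16 bits and use
 * shift = 6; the _32 variants widen the products to 32 bits and use
 * shift = 13, trading speed for precision.
 */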

.macro compute_premult_16 half_u1, half_u2, half_v1, half_v2
    vmov                d2, \half_u1                       @ copy left q14 to left q1
    vmov                d3, \half_u1                       @ copy left q14 to right q1
    vmov                d4, \half_u2                       @ copy right q14 to left q2
    vmov                d5, \half_u2                       @ copy right q14 to right q2
    vmov                d6, \half_v1                       @ copy left q15 to left q3
    vmov                d7, \half_v1                       @ copy left q15 to right q3
    vmov                d8, \half_v2                       @ copy right q15 to left q4
    vmov                d9, \half_v2                       @ copy right q15 to right q4
    vzip.16             d2, d3                             @ U1U1U2U2U3U3U4U4
    vzip.16             d4, d5                             @ U5U5U6U6U7U7U8U8
    vzip.16             d6, d7                             @ V1V1V2V2V3V3V4V4
    vzip.16             d8, d9                             @ V5V5V6V6V7V7V8V8
    vmul.s16            q8,  q3, d1[0]                     @ V * v2r             (left,  red)
    vmul.s16            q9,  q4, d1[0]                     @ V * v2r             (right, red)
    vmul.s16            q10, q1, d1[1]                     @ U * u2g
    vmul.s16            q11, q2, d1[1]                     @ U * u2g
    vmla.s16            q10, q3, d1[2]                     @ U * u2g + V * v2g   (left,  green)
    vmla.s16            q11, q4, d1[2]                     @ U * u2g + V * v2g   (right, green)
    vmul.s16            q12, q1, d1[3]                     @ U * u2b             (left,  blue)
    vmul.s16            q13, q2, d1[3]                     @ U * u2b             (right, blue)
.endm
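
/*
 * All supported input formats subsample chroma horizontally, so each U/V
 * pair covers two adjacent pixels: the vmov + vzip.16 sequences duplicate
 * every 16-bit chroma sample before the per-channel multiplies.
 */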

.macro compute_premult_32 half_u half_v
    vmov                d2, \half_u                        @ copy left q14 to left q1
    vmov                d3, \half_u                        @ copy left q14 to right q1
    vmov                d4, \half_v                        @ copy left q15 to left q2
    vmov                d5, \half_v                        @ copy left q15 to right q2
    vzip.16             d2, d3                             @ U1U1U2U2U3U3U4U4
    vzip.16             d4, d5                             @ V1V1V2V2V3V3V4V4
    vmull.s16           q8,  d4, d1[0]                     @ V * v2r             (left,  red)
    vmull.s16           q9,  d5, d1[0]                     @ V * v2r             (right, red)
    vmull.s16           q10, d2, d1[1]                     @ U * u2g
    vmull.s16           q11, d3, d1[1]                     @ U * u2g
    vmlal.s16           q10, d4, d1[2]                     @ U * u2g + V * v2g   (left,  green)
    vmlal.s16           q11, d5, d1[2]                     @ U * u2g + V * v2g   (right, green)
    vmull.s16           q12, d2, d1[3]                     @ U * u2b             (left,  blue)
    vmull.s16           q13, d3, d1[3]                     @ U * u2b             (right, blue)
.endm

.macro compute_color_16 dst_comp1 dst_comp2 pre1 pre2
    vadd.s16            q1, q14, \pre1
    vadd.s16            q2, q15, \pre2
    vqrshrun.s16        \dst_comp1, q1, #6
    vqrshrun.s16        \dst_comp2, q2, #6
.endm

.macro compute_color_32 dst_comp pre1 pre2
    vadd.s32            q3, q1, \pre1
    vadd.s32            q4, q2, \pre2
    vqrshrun.s32        d10, q3, #13
    vqrshrun.s32        d11, q4, #13                       @ q5 = ({q3,q4} + (1 << 12)) >> 13
    vqmovn.u16          \dst_comp, q5                      @ saturate 16bit -> 8bit
.endm
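
/*
 * compute_color_16 adds the 16-bit luma terms (q14/q15, set up by
 * compute_16px_16) to a premultiplied chroma pair and narrows with a
 * rounding shift by 6; compute_color_32 does the same at 32 bits
 * (luma terms in q1/q2) with a shift by 13, then narrows twice.
 */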

.macro compute_rgba_16 r1 r2 g1 g2 b1 b2 a1 a2
    compute_color_16    \r1, \r2, q8,  q9
    compute_color_16    \g1, \g2, q10, q11
    compute_color_16    \b1, \b2, q12, q13
    vmov.u8             \a1, #255
    vmov.u8             \a2, #255
.endm

.macro compute_rgba_32 r g b a
    compute_color_32    \r, q8,  q9
    compute_color_32    \g, q10, q11
    compute_color_32    \b, q12, q13
    vmov.u8             \a, #255
.endm

.macro compute_16px_16 dst y0 y1 ofmt
    vmovl.u8            q14, \y0                           @ 8px of Y
    vmovl.u8            q15, \y1                           @ 8px of Y
    vdup.16             q5, r9                             @ q5 = y_offset
    vmov                d14, d0                            @ q7 = y_coeff
    vmov                d15, d0                            @ q7 = y_coeff
    vsub.s16            q14, q5
    vsub.s16            q15, q5
    vmul.s16            q14, q7                            @ q14 = (srcY - y_offset) * y_coeff (left)
    vmul.s16            q15, q7                            @ q15 = (srcY - y_offset) * y_coeff (right)

.ifc \ofmt,argb
    compute_rgba_16     d7, d11, d8, d12, d9, d13, d6, d10
.endif

.ifc \ofmt,rgba
    compute_rgba_16     d6, d10, d7, d11, d8, d12, d9, d13
.endif

.ifc \ofmt,abgr
    compute_rgba_16     d9, d13, d8, d12, d7, d11, d6, d10
.endif

.ifc \ofmt,bgra
    compute_rgba_16     d8, d12, d7, d11, d6, d10, d9, d13
.endif

    vst4.8              {q3, q4}, [\dst,:128]!
    vst4.8              {q5, q6}, [\dst,:128]!
.endm

.macro compute_8px_32 dst half_y ofmt
    vmovl.u8            q7, \half_y                        @ 8px of Y
    vdup.16             q5, r9                             @ q5 = y_offset
    vsub.s16            q7, q5
    vmull.s16           q1, d14, d0                        @ q1 = (srcY - y_offset) * y_coeff (left)
    vmull.s16           q2, d15, d0                        @ q2 = (srcY - y_offset) * y_coeff (right)

.ifc \ofmt,argb
    compute_rgba_32     d13, d14, d15, d12
.endif

.ifc \ofmt,rgba
    compute_rgba_32     d12, d13, d14, d15
.endif

.ifc \ofmt,abgr
    compute_rgba_32     d15, d14, d13, d12
.endif

.ifc \ofmt,bgra
    compute_rgba_32     d14, d13, d12, d15
.endif

    vst4.8              {q6, q7}, [\dst,:128]!
.endm
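
/*
 * The process_2l_* macros below consume one chroma line and two luma
 * lines per iteration (the 4:2:0 case, producing two rows of RGB at
 * once, with r2/r11 the two destination rows and r4/r12 the two luma
 * rows); the process_1l_* macros consume one chroma line and one luma
 * line (the 4:2:2 case).
 */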

.macro process_1l_16px_16 ofmt
    compute_premult_16  d28, d29, d30, d31
    vld1.8              {q7}, [r4]!
    compute_16px_16     r2, d14, d15, \ofmt
.endm

.macro process_1l_16px_32 ofmt
    compute_premult_32  d28, d30
    vld1.8              {q7}, [r4]!
    vmov                d28, d15                           @ save right half of the luma line for later use
    compute_8px_32      r2, d14, \ofmt

    compute_premult_32  d29, d31
    compute_8px_32      r2, d28, \ofmt
.endm

.macro process_2l_16px_16 ofmt
    compute_premult_16  d28, d29, d30, d31
    vld1.8              {q7}, [r4]!                        @ first line of luma
    compute_16px_16     r2, d14, d15, \ofmt

    vld1.8              {q7}, [r12]!                       @ second line of luma
    compute_16px_16     r11, d14, d15, \ofmt
.endm

.macro process_2l_16px_32 ofmt
    compute_premult_32  d28, d30
    vld1.8              {q7}, [r4]!                        @ first line of luma
    vmov                d28, d15                           @ save right half of the first luma line for later use
    compute_8px_32      r2, d14, \ofmt

    vld1.8              {q7}, [r12]!                       @ second line of luma
    vmov                d30, d15                           @ save right half of the second luma line for later use
    compute_8px_32      r11, d14, \ofmt

    compute_premult_32  d29, d31
    compute_8px_32      r2, d28, \ofmt
    compute_8px_32      r11, d30, \ofmt
.endm

.macro load_args_nvx
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]                     @ r4  = srcY
    ldr                 r5, [sp, #108]                     @ r5  = linesizeY
    ldr                 r6, [sp, #112]                     @ r6  = srcC
    ldr                 r7, [sp, #116]                     @ r7  = linesizeC
    ldr                 r8, [sp, #120]                     @ r8  = table
    ldr                 r9, [sp, #124]                     @ r9  = y_offset
    ldr                 r10,[sp, #128]                     @ r10 = y_coeff
    vdup.16             d0, r10                            @ d0  = y_coeff
    vld1.16             {d1}, [r8]                         @ d1  = *table
    add                 r11, r2, r3                        @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                        @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    lsl                 r8, r0, #2
    sub                 r3, r3, r8                         @ r3 = linesize * 2 - width * 4 (padding)
    sub                 r5, r5, r0                         @ r5 = linesizeY * 2 - width (paddingY)
    sub                 r7, r7, r0                         @ r7 = linesizeC - width (paddingC)
.endm

.macro load_args_yuv420p
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]                     @ r4  = srcY
    ldr                 r5, [sp, #108]                     @ r5  = linesizeY
    ldr                 r6, [sp, #112]                     @ r6  = srcU
    ldr                 r8, [sp, #128]                     @ r8  = table
    ldr                 r9, [sp, #132]                     @ r9  = y_offset
    ldr                 r10,[sp, #136]                     @ r10 = y_coeff
    vdup.16             d0, r10                            @ d0  = y_coeff
    vld1.16             {d1}, [r8]                         @ d1  = *table
    add                 r11, r2, r3                        @ r11 = dst + linesize (dst2)
    add                 r12, r4, r5                        @ r12 = srcY + linesizeY (srcY2)
    lsl                 r3, r3, #1
    lsl                 r5, r5, #1
    lsl                 r8, r0, #2
    sub                 r3, r3, r8                         @ r3 = linesize * 2 - width * 4 (padding)
    sub                 r5, r5, r0                         @ r5 = linesizeY * 2 - width (paddingY)
    ldr                 r10,[sp, #120]                     @ r10 = srcV
.endm

.macro load_args_yuv422p
    push                {r4-r12, lr}
    vpush               {q4-q7}
    ldr                 r4, [sp, #104]                     @ r4  = srcY
    ldr                 r5, [sp, #108]                     @ r5  = linesizeY
    ldr                 r6, [sp, #112]                     @ r6  = srcU
    ldr                 r7, [sp, #116]                     @ r7  = linesizeU
    ldr                 r12,[sp, #124]                     @ r12 = linesizeV
    ldr                 r8, [sp, #128]                     @ r8  = table
    ldr                 r9, [sp, #132]                     @ r9  = y_offset
    ldr                 r10,[sp, #136]                     @ r10 = y_coeff
    vdup.16             d0, r10                            @ d0  = y_coeff
    vld1.16             {d1}, [r8]                         @ d1  = *table
    add                 r11, r2, r3                        @ r11 = dst + linesize (dst2)
    lsl                 r8, r0, #2
    sub                 r3, r3, r8                         @ r3 = linesize - width * 4 (padding)
    sub                 r5, r5, r0                         @ r5 = linesizeY - width (paddingY)
    sub                 r7, r7, r0, lsr #1                 @ r7 = linesizeU - width / 2 (paddingU)
    sub                 r12,r12,r0, lsr #1                 @ r12 = linesizeV - width / 2 (paddingV)
    ldr                 r10,[sp, #120]                     @ r10 = srcV
.endm
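
/*
 * In the three loaders above the first stack argument sits at sp + 104:
 * push {r4-r12, lr} stores ten 4-byte GP registers (40 bytes) and
 * vpush {q4-q7} another four 16-byte Q registers (64 bytes) on top.
 */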

.macro declare_func ifmt ofmt precision
function ff_\ifmt\()_to_\ofmt\()_neon_\precision\(), export=1

.ifc \ifmt,nv12
    load_args_nvx
.endif

.ifc \ifmt,nv21
    load_args_nvx
.endif

.ifc \ifmt,yuv420p
    load_args_yuv420p
.endif

.ifc \ifmt,yuv422p
    load_args_yuv422p
.endif

1:
    mov                 r8, r0                             @ r8 = width
2:
    pld                 [r6, #64*3]
    pld                 [r4, #64*3]

    vmov.i8             d10, #128

.ifc \ifmt,nv12
    pld                 [r12, #64*3]

    vld2.8              {d2, d3}, [r6]!                    @ q1: interleaved chroma line
    vsubl.u8            q14, d2, d10                       @ q14 = U - 128
    vsubl.u8            q15, d3, d10                       @ q15 = V - 128

    process_2l_16px_\precision \ofmt
.endif

.ifc \ifmt,nv21
    pld                 [r12, #64*3]

    vld2.8              {d2, d3}, [r6]!                    @ q1: interleaved chroma line
    vsubl.u8            q14, d3, d10                       @ q14 = U - 128
    vsubl.u8            q15, d2, d10                       @ q15 = V - 128

    process_2l_16px_\precision \ofmt
.endif

.ifc \ifmt,yuv420p
    pld                 [r10, #64*3]
    pld                 [r12, #64*3]

    vld1.8              {d2}, [r6]!                        @ d2: U line
    vld1.8              {d3}, [r10]!                       @ d3: V line
    vsubl.u8            q14, d2, d10                       @ q14 = U - 128
    vsubl.u8            q15, d3, d10                       @ q15 = V - 128

    process_2l_16px_\precision \ofmt
.endif

.ifc \ifmt,yuv422p
    pld                 [r10, #64*3]

    vld1.8              {d2}, [r6]!                        @ d2: U line
    vld1.8              {d3}, [r10]!                       @ d3: V line
    vsubl.u8            q14, d2, d10                       @ q14 = U - 128
    vsubl.u8            q15, d3, d10                       @ q15 = V - 128

    process_1l_16px_\precision \ofmt
.endif

    subs                r8, r8, #16                        @ width -= 16
    bgt                 2b

    add                 r2, r2, r3                         @ dst   += padding
    add                 r4, r4, r5                         @ srcY  += paddingY

.ifc \ifmt,nv12
    add                 r11, r11, r3                       @ dst2  += padding
    add                 r12, r12, r5                       @ srcY2 += paddingY
    add                 r6, r6, r7                         @ srcC  += paddingC

    subs                r1, r1, #2                         @ height -= 2
.endif

.ifc \ifmt,nv21
    add                 r11, r11, r3                       @ dst2  += padding
    add                 r12, r12, r5                       @ srcY2 += paddingY
    add                 r6, r6, r7                         @ srcC  += paddingC

    subs                r1, r1, #2                         @ height -= 2
.endif

.ifc \ifmt,yuv420p
    add                 r11, r11, r3                       @ dst2  += padding
    add                 r12, r12, r5                       @ srcY2 += paddingY

    ldr                 r7, [sp, #116]                     @ r7 = linesizeU
    sub                 r7, r7, r0, lsr #1                 @ r7 = linesizeU - width / 2 (paddingU)
    add                 r6, r6, r7                         @ srcU += paddingU

    ldr                 r7, [sp, #124]                     @ r7 = linesizeV
    sub                 r7, r7, r0, lsr #1                 @ r7 = linesizeV - width / 2 (paddingV)
    add                 r10, r10, r7                       @ srcV += paddingV

    subs                r1, r1, #2                         @ height -= 2
.endif

.ifc \ifmt,yuv422p
    add                 r6, r6, r7                         @ srcU += paddingU
    add                 r10, r10, r12                      @ srcV += paddingV

    subs                r1, r1, #1                         @ height -= 1
.endif

    bgt                 1b

    vpop                {q4-q7}
    pop                 {r4-r12, lr}
    mov                 pc, lr
endfunc
.endm

.macro declare_rgb_funcs ifmt precision
    declare_func \ifmt, argb, \precision
    declare_func \ifmt, rgba, \precision
    declare_func \ifmt, abgr, \precision
    declare_func \ifmt, bgra, \precision
.endm
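
/*
 * Instantiate ff_<ifmt>_to_<ofmt>_neon_<precision> for every combination
 * below, e.g. ff_nv12_to_argb_neon_16 or ff_yuv422p_to_bgra_neon_32.
 */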

declare_rgb_funcs nv12, 16
declare_rgb_funcs nv21, 16
declare_rgb_funcs nv12, 32
declare_rgb_funcs nv21, 32
declare_rgb_funcs yuv420p, 16
declare_rgb_funcs yuv420p, 32
declare_rgb_funcs yuv422p, 16
declare_rgb_funcs yuv422p, 32