/*
 * AltiVec-enhanced yuv2yuvX
 *
 * Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
 * based on the equivalent C code in swscale.c
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <inttypes.h>

#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"

#if HAVE_VSX
#define vzero vec_splat_s32(0)

#if !HAVE_BIGENDIAN
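/*
 * The macros below supply little-endian VSX loads to the shared PowerPC
 * scaler template, swscale_ppc_template.c, which is included further down
 * with FUNC(name) expanding to name ## _vsx.
 */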
#define GET_LS(a,b,c,s) {\
    ls = a;\
    a = vec_vsx_ld(((b) << 1) + 16, s);\
}
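
/* Multiply one line of source samples by one filter coefficient vector and
 * accumulate the widened 32-bit products into the running sums d1/d2
 * (vec_mule/vec_mulo split even and odd lanes, vec_mergeh/vec_mergel
 * reassemble them). */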
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
        vector signed short ls;\
        vector signed int   vf1, vf2, i1, i2;\
        GET_LS(l1, x, perm, src);\
        i1  = vec_mule(filter, ls);\
        i2  = vec_mulo(filter, ls);\
        vf1 = vec_mergeh(i1, i2);\
        vf2 = vec_mergel(i1, i2);\
        d1  = vec_add(d1, vf1);\
        d2  = vec_add(d2, vf2);\
    } while (0)

#define LOAD_FILTER(vf,f) {\
    vf = vec_vsx_ld(joffset, f);\
}

#define LOAD_L1(ll1,s,p){\
    ll1 = vec_vsx_ld(xoffset, s);\
}

// The 3 above is 2 (filterSize == 4) + 1 (sizeof(short) == 2).

// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
    vf = (vector signed short)vec_vsx_ld(a << 3, f);\
    vf = vec_mergeh(vf, (vector signed short)vzero);\
}

#define FIRST_LOAD(sv, pos, s, per) {}

#define UPDATE_PTR(s0, d0, s1, d1) {}

#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
    vf = vec_vsx_ld(pos + a, s);\
}

#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)

#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
    vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
}

#define FUNC(name) name ## _vsx
#include "swscale_ppc_template.c"
#undef FUNC

#undef vzero

#endif /* !HAVE_BIGENDIAN */
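
/* The scalar *_u helpers below handle the unaligned head of each output row
 * (up to the first aligned boundary of dest) and the tail that does not fill
 * a whole vector; the *_vsx functions process the aligned middle. */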
static void yuv2plane1_8_u(const int16_t *src, uint8_t *dest, int dstW,
                           const uint8_t *dither, int offset, int start)
{
    int i;
    for (i = start; i < dstW; i++) {
        int val = (src[i] + dither[(i + offset) & 7]) >> 7;
        dest[i] = av_clip_uint8(val);
    }
}

static void yuv2plane1_8_vsx(const int16_t *src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    const int dst_u = -(uintptr_t)dest & 15;
    int i, j;
    LOCAL_ALIGNED(16, int16_t, val, [16]);
    const vector uint16_t shifts = (vector uint16_t) {7, 7, 7, 7, 7, 7, 7, 7};
    vector int16_t vi, vileft, ditherleft, ditherright;
    vector uint8_t vd;

    for (j = 0; j < 16; j++) {
        val[j] = dither[(dst_u + offset + j) & 7];
    }

    ditherleft = vec_ld(0, val);
    ditherright = vec_ld(0, &val[8]);

    yuv2plane1_8_u(src, dest, dst_u, dither, offset, 0);
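
    /* 16 output pixels per iteration: add the pre-rotated dither, shift down
     * by 7 and pack to bytes with unsigned saturation. */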
    for (i = dst_u; i < dstW - 15; i += 16) {
        vi = vec_vsx_ld(0, &src[i]);
        vi = vec_adds(ditherleft, vi);
        vileft = vec_sra(vi, shifts);

        vi = vec_vsx_ld(0, &src[i + 8]);
        vi = vec_adds(ditherright, vi);
        vi = vec_sra(vi, shifts);

        vd = vec_packsu(vileft, vi);
        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_8_u(src, dest, dstW, dither, offset, i);
}

#if !HAVE_BIGENDIAN

#define output_pixel(pos, val) \
    if (big_endian) { \
        AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    } else { \
        AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
    }

static void yuv2plane1_nbps_u(const int16_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    int i;
    int shift = 15 - output_bits;

    for (i = start; i < dstW; i++) {
        int val = src[i] + (1 << (shift - 1));
        output_pixel(&dest[i], val);
    }
}

static void yuv2plane1_nbps_vsx(const int16_t *src, uint16_t *dest, int dstW,
                                int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 15 - output_bits;
    const int add = (1 << (shift - 1));
    const int clip = (1 << output_bits) - 1;
    const vector uint16_t vadd = (vector uint16_t) {add, add, add, add, add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint16_t vshift = (vector uint16_t) vec_splat_u16(shift);
    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
    vector uint16_t v;
    int i;

    yuv2plane1_nbps_u(src, dest, dst_u, big_endian, output_bits, 0);
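
    /* Add the rounding bias, shift down to output_bits, clamp to the largest
     * code value and, for big-endian output, byte-swap each element via a
     * rotate by 8 bits. */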
    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint16_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);
        v = vec_min(v, vlargest);
        v = vec_rl(v, vswap);
        vec_st(v, 0, &dest[i]);
    }

    yuv2plane1_nbps_u(src, dest, dstW, big_endian, output_bits, i);
}

static void yuv2planeX_nbps_u(const int16_t *filter, int filterSize,
                              const int16_t **src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits, int start)
{
    int i;
    int shift = 11 + 16 - output_bits;

    for (i = start; i < dstW; i++) {
        int val = 1 << (shift - 1);
        int j;

        for (j = 0; j < filterSize; j++)
            val += src[j][i] * filter[j];

        output_pixel(&dest[i], val);
    }
}

static void yuv2planeX_nbps_vsx(const int16_t *filter, int filterSize,
                                const int16_t **src, uint16_t *dest, int dstW,
                                int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 11 + 16 - output_bits;
    const int add = (1 << (shift - 1));
    const int clip = (1 << output_bits) - 1;
    const uint16_t swap = big_endian ? 8 : 0;
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
    const vector uint16_t vlargest = (vector uint16_t) {clip, clip, clip, clip, clip, clip, clip, clip};
    const vector int16_t vzero = vec_splat_s16(0);
    const vector uint8_t vperm = (vector uint8_t) {0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
    vector int16_t vfilter[MAX_FILTER_SIZE], vin;
    vector uint16_t v;
    vector uint32_t vleft, vright, vtmp;
    int i, j;

    for (i = 0; i < filterSize; i++) {
        vfilter[i] = (vector int16_t) {filter[i], filter[i], filter[i], filter[i],
                                       filter[i], filter[i], filter[i], filter[i]};
    }

    yuv2planeX_nbps_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
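
    /* Accumulate even- and odd-lane 32-bit products separately, shift down,
     * pack with saturation, clamp, and finally re-interleave with vperm so
     * the samples land back in source order. */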
    for (i = dst_u; i < dstW - 7; i += 8) {
        vleft = vright = vadd;
        for (j = 0; j < filterSize; j++) {
            vin = vec_vsx_ld(0, &src[j][i]);
            vtmp = (vector uint32_t) vec_mule(vin, vfilter[j]);
            vleft = vec_add(vleft, vtmp);
            vtmp = (vector uint32_t) vec_mulo(vin, vfilter[j]);
            vright = vec_add(vright, vtmp);
        }

        vleft = vec_sra(vleft, vshift);
        vright = vec_sra(vright, vshift);
        v = vec_packsu(vleft, vright);
        v = (vector uint16_t) vec_max((vector int16_t) v, vzero);
        v = vec_min(v, vlargest);
        v = vec_rl(v, vswap);
        v = vec_perm(v, v, vperm);
        vec_st(v, 0, &dest[i]);
    }

    yuv2planeX_nbps_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
}

#undef output_pixel

#define output_pixel(pos, val, bias, signedness) \
    if (big_endian) { \
        AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    } else { \
        AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
    }

static void yuv2plane1_16_u(const int32_t *src, uint16_t *dest, int dstW,
                            int big_endian, int output_bits, int start)
{
    int i;
    const int shift = 3;

    for (i = start; i < dstW; i++) {
        int val = src[i] + (1 << (shift - 1));
        output_pixel(&dest[i], val, 0, uint);
    }
}

static void yuv2plane1_16_vsx(const int32_t *src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 3;
    const int add = (1 << (shift - 1));
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint16_t vswap = (vector uint16_t) vec_splat_u16(big_endian ? 8 : 0);
    const vector uint32_t vshift = (vector uint32_t) vec_splat_u32(shift);
    vector uint32_t v, v2;
    vector uint16_t vd;
    int i;

    yuv2plane1_16_u(src, dest, dst_u, big_endian, output_bits, 0);
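
    /* 32-bit input samples: two vector loads per iteration, rounded and
     * shifted down by 3, then packed to 16 bits with unsigned saturation;
     * vec_rl by 8 bits byte-swaps the result for big-endian output. */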
    for (i = dst_u; i < dstW - 7; i += 8) {
        v = vec_vsx_ld(0, (const uint32_t *) &src[i]);
        v = vec_add(v, vadd);
        v = vec_sr(v, vshift);

        v2 = vec_vsx_ld(0, (const uint32_t *) &src[i + 4]);
        v2 = vec_add(v2, vadd);
        v2 = vec_sr(v2, vshift);

        vd = vec_packsu(v, v2);
        vd = vec_rl(vd, vswap);
        vec_st(vd, 0, &dest[i]);
    }

    yuv2plane1_16_u(src, dest, dstW, big_endian, output_bits, i);
}

#if HAVE_POWER8

static void yuv2planeX_16_u(const int16_t *filter, int filterSize,
                            const int32_t **src, uint16_t *dest, int dstW,
                            int big_endian, int output_bits, int start)
{
    int i;
    int shift = 15;

    for (i = start; i < dstW; i++) {
        int val = 1 << (shift - 1);
        int j;
        /* range of val is [0, 0x7FFFFFFF], so 31 bits, but with lanczos/spline
         * filters (or anything with negative coefficients) the range can be
         * slightly wider in both directions. To account for this overflow, we
         * subtract a constant so it always fits in the signed range (assuming
         * a reasonable filterSize), and re-add that at the end. */
        val -= 0x40000000;
        for (j = 0; j < filterSize; j++)
            val += src[j][i] * (unsigned)filter[j];

        output_pixel(&dest[i], val, 0x8000, int);
    }
}

static void yuv2planeX_16_vsx(const int16_t *filter, int filterSize,
                              const int32_t **src, uint16_t *dest, int dstW,
                              int big_endian, int output_bits)
{
    const int dst_u = -(uintptr_t)dest & 7;
    const int shift = 15;
    const int bias = 0x8000;
    const int add = (1 << (shift - 1)) - 0x40000000;
    const uint16_t swap = big_endian ? 8 : 0;
    const vector uint32_t vadd = (vector uint32_t) {add, add, add, add};
    const vector uint32_t vshift = (vector uint32_t) {shift, shift, shift, shift};
    const vector uint16_t vswap = (vector uint16_t) {swap, swap, swap, swap, swap, swap, swap, swap};
    const vector uint16_t vbias = (vector uint16_t) {bias, bias, bias, bias, bias, bias, bias, bias};
    vector int32_t vfilter[MAX_FILTER_SIZE];
    vector uint16_t v;
    vector uint32_t vleft, vright, vtmp;
    vector int32_t vin32l, vin32r;
    int i, j;

    for (i = 0; i < filterSize; i++) {
        vfilter[i] = (vector int32_t) {filter[i], filter[i], filter[i], filter[i]};
    }

    yuv2planeX_16_u(filter, filterSize, src, dest, dst_u, big_endian, output_bits, 0);
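
    /* vadd folds in both the rounding term and the -0x40000000 offset used by
     * the scalar version to keep the accumulation inside the signed 32-bit
     * range; after shifting and signed packing, adding vbias (0x8000) maps the
     * result back into the unsigned output range. */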
    for (i = dst_u; i < dstW - 7; i += 8) {
        vleft = vright = vadd;
        for (j = 0; j < filterSize; j++) {
            vin32l = vec_vsx_ld(0, &src[j][i]);
            vin32r = vec_vsx_ld(0, &src[j][i + 4]);

            vtmp = (vector uint32_t) vec_mul(vin32l, vfilter[j]);
            vleft = vec_add(vleft, vtmp);
            vtmp = (vector uint32_t) vec_mul(vin32r, vfilter[j]);
            vright = vec_add(vright, vtmp);
        }

        vleft = vec_sra(vleft, vshift);
        vright = vec_sra(vright, vshift);
        v = (vector uint16_t) vec_packs((vector int32_t) vleft, (vector int32_t) vright);
        v = vec_add(v, vbias);
        v = vec_rl(v, vswap);
        vec_st(v, 0, &dest[i]);
    }

    yuv2planeX_16_u(filter, filterSize, src, dest, dstW, big_endian, output_bits, i);
}

#endif /* HAVE_POWER8 */
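
/* Instantiate the per-bit-depth, per-endianness wrappers with the function
 * signatures expected by swscale's yuv2plane1/yuv2planeX pointers, on top of
 * the nbps/16-bit templates above. */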
#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
    yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
    yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t)

#define yuv2NBPS1(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2plane1_ ## bits ## BE_LE ## _vsx(const int16_t *src, \
                                                 uint8_t *dest, int dstW, \
                                                 const uint8_t *dither, int offset) \
{ \
    yuv2plane1_ ## template_size ## _vsx((const typeX_t *) src, \
                                         (uint16_t *) dest, dstW, is_be, bits); \
}

#define yuv2NBPSX(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2planeX_ ## bits ## BE_LE ## _vsx(const int16_t *filter, int filterSize, \
                                                 const int16_t **src, uint8_t *dest, int dstW, \
                                                 const uint8_t *dither, int offset)\
{ \
    yuv2planeX_## template_size ## _vsx(filter, \
                                        filterSize, (const typeX_t **) src, \
                                        (uint16_t *) dest, dstW, is_be, bits); \
}

yuv2NBPS( 9, BE, 1, nbps, int16_t)
yuv2NBPS( 9, LE, 0, nbps, int16_t)
yuv2NBPS(10, BE, 1, nbps, int16_t)
yuv2NBPS(10, LE, 0, nbps, int16_t)
yuv2NBPS(12, BE, 1, nbps, int16_t)
yuv2NBPS(12, LE, 0, nbps, int16_t)
yuv2NBPS(14, BE, 1, nbps, int16_t)
yuv2NBPS(14, LE, 0, nbps, int16_t)

yuv2NBPS1(16, BE, 1, 16, int32_t)
yuv2NBPS1(16, LE, 0, 16, int32_t)
#if HAVE_POWER8
yuv2NBPSX(16, BE, 1, 16, int32_t)
yuv2NBPSX(16, LE, 0, 16, int32_t)
#endif

#endif /* !HAVE_BIGENDIAN */

#endif /* HAVE_VSX */
av_cold void ff_sws_init_swscale_vsx(SwsContext *c)
{
#if HAVE_VSX
    enum AVPixelFormat dstFormat = c->dstFormat;
    const int cpu_flags = av_get_cpu_flags();

    if (!(cpu_flags & AV_CPU_FLAG_VSX))
        return;

#if !HAVE_BIGENDIAN
    if (c->srcBpc == 8 && c->dstBpc <= 14) {
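        /* hScale_real_vsx is generated from swscale_ppc_template.c via FUNC(). */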
        c->hyScale = c->hcScale = hScale_real_vsx;
    }

    if (!is16BPS(dstFormat) && !isNBPS(dstFormat) &&
        dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
        dstFormat != AV_PIX_FMT_GRAYF32BE && dstFormat != AV_PIX_FMT_GRAYF32LE &&
        !c->needAlpha) {
        c->yuv2planeX = yuv2planeX_vsx;
    }
#endif
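
    /* dstBpc-specific output functions; skipped for bit-exact or full
     * horizontal chroma interpolation, and whenever alpha is needed. */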
    if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->needAlpha) {
        switch (c->dstBpc) {
        case 8:
            c->yuv2plane1 = yuv2plane1_8_vsx;
            break;
#if !HAVE_BIGENDIAN
        case 9:
            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_vsx  : yuv2plane1_9LE_vsx;
            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_vsx  : yuv2planeX_9LE_vsx;
            break;
        case 10:
            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_vsx : yuv2plane1_10LE_vsx;
            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_vsx : yuv2planeX_10LE_vsx;
            break;
        case 12:
            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_vsx : yuv2plane1_12LE_vsx;
            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_vsx : yuv2planeX_12LE_vsx;
            break;
        case 14:
            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_vsx : yuv2plane1_14LE_vsx;
            c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_vsx : yuv2planeX_14LE_vsx;
            break;
        case 16:
            c->yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_vsx : yuv2plane1_16LE_vsx;
#if HAVE_POWER8
            if (cpu_flags & AV_CPU_FLAG_POWER8) {
                c->yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_vsx : yuv2planeX_16LE_vsx;
            }
#endif /* HAVE_POWER8 */
            break;
#endif /* !HAVE_BIGENDIAN */
        }
    }
#endif /* HAVE_VSX */
}