/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"
#include "types_altivec.h"
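/* h264_template_altivec.c is included twice below: first with the PUT
 * operation and put_* names, then with the AVG operation and avg_* names.
 * Each pass instantiates the chroma mc8 and the qpel16 h/v/hv lowpass
 * kernels for that operation. */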
#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)

#define OP_U8_ALTIVEC                          PUT_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec         put_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num             altivec_put_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec   put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num       altivec_put_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec   put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num       altivec_put_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec  put_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_put_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num

#define OP_U8_ALTIVEC                          AVG_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec         avg_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num             altivec_avg_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec   avg_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num       altivec_avg_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec   avg_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num       altivec_avg_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec  avg_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num      altivec_avg_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num
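/* H264_MC(OPNAME, SIZE, CODETYPE) generates the 16 quarter-pel motion
 * compensation entry points mcXY, where X and Y are the horizontal and
 * vertical quarter-pel offsets.  Half-pel planes are produced by the
 * h/v/hv lowpass kernels and blended with the OPNAME##pixels##SIZE##_l2
 * helpers; mc00 is the plain copy/average case. */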
#define H264_MC(OPNAME, SIZE, CODETYPE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){\
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]);\
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]);\
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE);\
}
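/* 8x8 chroma motion compensation: bilinear interpolation with weights
 * A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy over the 2x2 source
 * neighbourhood, shifted down by 6.  This "no_rnd" variant uses the
 * rounding constant 28 (v28ss) instead of 32.  Unaligned sources are
 * handled with vec_lvsl/vec_perm; loadSecond and reallyBadAlign cover
 * the cases where the 9 source bytes per row straddle a 16-byte block. */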
/* this code assumes that stride % 16 == 0 */
void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    signed int ABCD[4] __attribute__((aligned(16))) =
                        {((8 - x) * (8 - y)),
                         ((x) * (8 - y)),
                         ((8 - x) * (y)),
                         ((x) * (y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);
    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);

    if (!loadSecond) {// -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);

            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
}
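/* put/avg_pixels16_l2: average one 16-pixel row from src1 (read with
 * src_stride1) with one from src2 (read as packed 16-byte rows) and
 * write to a possibly unaligned dst, merging with the existing edge
 * bytes via lvsr.  The avg_ variant additionally averages the result
 * with the current dst contents. */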
static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                            const uint8_t * src2, int dst_stride,
                                            int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(a, b);

        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);

        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}
static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
                                            const uint8_t * src2, int dst_stride,
                                            int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);

        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);

        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);

        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0 , dst);

        dst += dst_stride;
    }
}
/* Implemented but could be faster
#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
*/

H264_MC(put_, 16, altivec)
H264_MC(avg_, 16, altivec)
/****************************************************************************
 * IDCT transform:
 ****************************************************************************/
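/* One pass of the 8-point H.264 8x8 inverse transform.  Each of s0..s7 is
 * a vector of eight 16-bit coefficients, and the butterflies combine
 * corresponding lanes, so one invocation transforms eight columns in
 * parallel; the TRANSPOSE8 between the two invocations swaps rows and
 * columns for the second dimension.  The scalar equivalent of each step
 * is given in the comments. */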
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) {\
    /* a0 = SRC(0) + SRC(4); */ \
    vec_s16_t a0v = vec_add(s0, s4); \
    /* a2 = SRC(0) - SRC(4); */ \
    vec_s16_t a2v = vec_sub(s0, s4); \
    /* a4 = (SRC(2)>>1) - SRC(6); */ \
    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
    /* a6 = (SRC(6)>>1) + SRC(2); */ \
    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
    /* b0 = a0 + a6; */ \
    vec_s16_t b0v = vec_add(a0v, a6v); \
    /* b2 = a2 + a4; */ \
    vec_s16_t b2v = vec_add(a2v, a4v); \
    /* b4 = a2 - a4; */ \
    vec_s16_t b4v = vec_sub(a2v, a4v); \
    /* b6 = a0 - a6; */ \
    vec_s16_t b6v = vec_sub(a0v, a6v); \
    /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
    /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
    /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
    /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) );\
    /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
    /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) );\
    /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) );\
    /* b1 = (a7>>2) + a1; */ \
    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
    /* b3 = a3 + (a5>>2); */ \
    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
    /* b5 = (a3>>2) - a5; */ \
    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
    /* b7 = a7 - (a1>>2); */ \
    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
    /* DST(0, b0 + b7); */ \
    d0 = vec_add(b0v, b7v); \
    /* DST(1, b2 + b5); */ \
    d1 = vec_add(b2v, b5v); \
    /* DST(2, b4 + b3); */ \
    d2 = vec_add(b4v, b3v); \
    /* DST(3, b6 + b1); */ \
    d3 = vec_add(b6v, b1v); \
    /* DST(4, b6 - b1); */ \
    d4 = vec_sub(b6v, b1v); \
    /* DST(5, b4 - b3); */ \
    d5 = vec_sub(b4v, b3v); \
    /* DST(6, b2 - b5); */ \
    d6 = vec_sub(b2v, b5v); \
    /* DST(7, b0 - b7); */ \
    d7 = vec_sub(b0v, b7v); \
}
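/* ALTIVEC_STORE_SUM_CLIP: load 8 destination pixels (possibly unaligned),
 * add the IDCT output shifted right by 6 with saturation, pack back to
 * unsigned bytes (clipping to 0..255), and store with the sel mask and
 * lvsr permute so the neighbouring bytes of dest are preserved. */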
#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
    /* unaligned load */                                        \
    vec_u8_t hv = vec_ld( 0, dest );                            \
    vec_u8_t lv = vec_ld( 7, dest );                            \
    vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv );     \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv);                  \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv);    \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16);   \
    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum);         \
    vec_u8_t edgehv;                                            \
    /* unaligned store */                                       \
    vec_u8_t bodyv  = vec_perm( idstsum8, idstsum8, perm_stv ); \
    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv );      \
    lv = vec_sel( lv, bodyv, edgelv );                          \
    vec_st( lv, 7, dest );                                      \
    hv = vec_ld( 0, dest );                                     \
    edgehv = vec_perm( zero_u8v, sel, perm_stv );               \
    hv = vec_sel( hv, bodyv, edgehv );                          \
    vec_st( hv, 0, dest );                                      \
}
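/* 8x8 IDCT and add to the destination: bias dct[0] so the final >>6
 * rounds, load the eight coefficient rows, run the 1D transform,
 * transpose, run it again, then add each result row to dst with
 * clipping via ALTIVEC_STORE_SUM_CLIP. */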
void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);

    const vec_u16_t onev = vec_splat_u16(1);
    const vec_u16_t twov = vec_splat_u16(2);
    const vec_u16_t sixv = vec_splat_u16(6);

    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
                                        -1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t*)dct);
    s1 = vec_ld(0x10, (int16_t*)dct);
    s2 = vec_ld(0x20, (int16_t*)dct);
    s3 = vec_ld(0x30, (int16_t*)dct);
    s4 = vec_ld(0x40, (int16_t*)dct);
    s5 = vec_ld(0x50, (int16_t*)dct);
    s6 = vec_ld(0x60, (int16_t*)dct);
    s7 = vec_ld(0x70, (int16_t*)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
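/* Install the AltiVec H.264 routines into the DSPContext when the build
 * has AltiVec support and the CPU reports it at runtime; otherwise fall
 * through to the (currently empty) non-AltiVec PPC path. */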
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
        c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
        c->h264_idct8_add = ff_h264_idct8_add_altivec;

#define dspfunc(PFX, IDX, NUM) \
        c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
        c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
        c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
        c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
        c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
        c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
        c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
        c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
        c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
        c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
        c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
        c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
        c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
        c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
        c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
        c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec

        dspfunc(put_h264_qpel, 0, 16);
        dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc

    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations
        // ... pending ...
    }
}