/*
 * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#include "types_altivec.h"

#define PUT_OP_U8_ALTIVEC(d, s, dst) d = s
#define AVG_OP_U8_ALTIVEC(d, s, dst) d = vec_avg(dst, s)

#define OP_U8_ALTIVEC PUT_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec put_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_put_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec put_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_put_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec put_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_put_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec put_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_put_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num

#define OP_U8_ALTIVEC AVG_OP_U8_ALTIVEC
#define PREFIX_h264_chroma_mc8_altivec avg_h264_chroma_mc8_altivec
#define PREFIX_h264_chroma_mc8_num altivec_avg_h264_chroma_mc8_num
#define PREFIX_h264_qpel16_h_lowpass_altivec avg_h264_qpel16_h_lowpass_altivec
#define PREFIX_h264_qpel16_h_lowpass_num altivec_avg_h264_qpel16_h_lowpass_num
#define PREFIX_h264_qpel16_v_lowpass_altivec avg_h264_qpel16_v_lowpass_altivec
#define PREFIX_h264_qpel16_v_lowpass_num altivec_avg_h264_qpel16_v_lowpass_num
#define PREFIX_h264_qpel16_hv_lowpass_altivec avg_h264_qpel16_hv_lowpass_altivec
#define PREFIX_h264_qpel16_hv_lowpass_num altivec_avg_h264_qpel16_hv_lowpass_num
#include "h264_template_altivec.c"
#undef OP_U8_ALTIVEC
#undef PREFIX_h264_chroma_mc8_altivec
#undef PREFIX_h264_chroma_mc8_num
#undef PREFIX_h264_qpel16_h_lowpass_altivec
#undef PREFIX_h264_qpel16_h_lowpass_num
#undef PREFIX_h264_qpel16_v_lowpass_altivec
#undef PREFIX_h264_qpel16_v_lowpass_num
#undef PREFIX_h264_qpel16_hv_lowpass_altivec
#undef PREFIX_h264_qpel16_hv_lowpass_num
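
/* H264_MC() expands to the 16 quarter-pel motion compensation functions
 * (_mc00 .. _mc33) for one block size, built from the h/v/hv lowpass
 * filters instantiated above and the pixel copy/average helpers. */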
#define H264_MC(OPNAME, SIZE, CODETYPE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## CODETYPE (uint8_t *dst, uint8_t *src, int stride){ \
    OPNAME ## pixels ## SIZE ## _ ## CODETYPE(dst, src, stride, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(dst, src, stride, stride); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(half, src, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+1, half, stride, stride, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src, half, stride, stride, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(dst, src, stride, stride); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, half[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(half, src, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, src+stride, half, stride, stride, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]); \
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(dst, tmp, src, stride, SIZE, stride); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src, SIZE, stride); \
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfH[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]); \
    put_h264_qpel ## SIZE ## _h_lowpass_ ## CODETYPE(halfH, src + stride, SIZE, stride); \
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfH, halfHV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src, SIZE, stride); \
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE); \
} \
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## CODETYPE(uint8_t *dst, uint8_t *src, int stride){ \
    DECLARE_ALIGNED_16(uint8_t, halfV[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(uint8_t, halfHV[SIZE*SIZE]); \
    DECLARE_ALIGNED_16(int16_t, tmp[SIZE*(SIZE+8)]); \
    put_h264_qpel ## SIZE ## _v_lowpass_ ## CODETYPE(halfV, src+1, SIZE, stride); \
    put_h264_qpel ## SIZE ## _hv_lowpass_ ## CODETYPE(halfHV, tmp, src, SIZE, SIZE, stride); \
    OPNAME ## pixels ## SIZE ## _l2_ ## CODETYPE(dst, halfV, halfHV, stride, SIZE, SIZE); \
}
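
/* put_no_rnd_h264_chroma_mc8_altivec: 8-pixel-wide chroma motion compensation.
 * ABCD holds the bilinear weights (8-x)*(8-y), x*(8-y), (8-x)*y and x*y derived
 * from the 1/8-pel fractional offsets x and y. */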
/* this code assumes that stride % 16 == 0 */
void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
    DECLARE_ALIGNED_16(signed int, ABCD[4]) =
                        {((8 - x) * (8 - y)),
                         ((x) * (8 - y)),
                         ((8 - x) * (y)),
                         ((x) * (y))};
    register int i;
    vector unsigned char fperm;
    const vector signed int vABCD = vec_ld(0, ABCD);
    const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
    const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
    const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
    const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
    const vector signed int vzero = vec_splat_s32(0);
    const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
    const vector unsigned short v6us = vec_splat_u16(6);
    register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
    register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;

    vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
    vector unsigned char vsrc0uc, vsrc1uc;
    vector signed short vsrc0ssH, vsrc1ssH;
    vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
    vector signed short vsrc2ssH, vsrc3ssH, psum;
    vector unsigned char vdst, ppsum, fsum;

    if (((unsigned long)dst) % 16 == 0) {
        fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
                                          0x14, 0x15, 0x16, 0x17,
                                          0x08, 0x09, 0x0A, 0x0B,
                                          0x0C, 0x0D, 0x0E, 0x0F);
    } else {
        fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
                                          0x04, 0x05, 0x06, 0x07,
                                          0x18, 0x19, 0x1A, 0x1B,
                                          0x1C, 0x1D, 0x1E, 0x1F);
    }

    vsrcAuc = vec_ld(0, src);
    if (loadSecond)
        vsrcBuc = vec_ld(16, src);
    vsrcperm0 = vec_lvsl(0, src);
    vsrcperm1 = vec_lvsl(1, src);

    vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
    if (reallyBadAlign)
        vsrc1uc = vsrcBuc;
    else
        vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);

    vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc0uc);
    vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                               (vector unsigned char)vsrc1uc);

    if (!loadSecond) { // -> !reallyBadAlign
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
            vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sra(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_packsu(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    } else {
        vector unsigned char vsrcDuc;
        for (i = 0 ; i < h ; i++) {
            vsrcCuc = vec_ld(stride + 0, src);
            vsrcDuc = vec_ld(stride + 16, src);
            vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
            if (reallyBadAlign)
                vsrc3uc = vsrcDuc;
            else
                vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
            vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc2uc);
            vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
                                                       (vector unsigned char)vsrc3uc);

            psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
            psum = vec_mladd(vB, vsrc1ssH, psum);
            psum = vec_mladd(vC, vsrc2ssH, psum);
            psum = vec_mladd(vD, vsrc3ssH, psum);
            psum = vec_add(v28ss, psum);
            psum = vec_sr(psum, v6us);

            vdst = vec_ld(0, dst);
            ppsum = (vector unsigned char)vec_pack(psum, psum);
            fsum = vec_perm(vdst, ppsum, fperm);

            vec_st(fsum, 0, dst);

            vsrc0ssH = vsrc2ssH;
            vsrc1ssH = vsrc3ssH;

            dst += stride;
            src += stride;
        }
    }
}
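
/* put_pixels16_l2_altivec: writes the rounded average of two 16-byte-wide
 * sources to dst; src1 uses src_stride1, src2 is read with a fixed stride
 * of 16, and dst may be unaligned. */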
static inline void put_pixels16_l2_altivec(uint8_t * dst, const uint8_t * src1,
                                           const uint8_t * src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);
        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);
        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(a, b);

        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0, dst);

        dst += dst_stride;
    }
}
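
/* avg_pixels16_l2_altivec: same as above, but the src1/src2 average is
 * additionally averaged with the existing dst pixels. */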
static inline void avg_pixels16_l2_altivec(uint8_t * dst, const uint8_t * src1,
                                           const uint8_t * src2, int dst_stride,
                                           int src_stride1, int h)
{
    int i;
    vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;

    mask_ = vec_lvsl(0, src2);

    for (i = 0; i < h; i++) {
        tmp1 = vec_ld(i * src_stride1, src1);
        mask = vec_lvsl(i * src_stride1, src1);
        tmp2 = vec_ld(i * src_stride1 + 15, src1);
        a = vec_perm(tmp1, tmp2, mask);

        tmp1 = vec_ld(i * 16, src2);
        tmp2 = vec_ld(i * 16 + 15, src2);
        b = vec_perm(tmp1, tmp2, mask_);

        tmp1 = vec_ld(0, dst);
        mask = vec_lvsl(0, dst);
        tmp2 = vec_ld(15, dst);

        d = vec_avg(vec_perm(tmp1, tmp2, mask), vec_avg(a, b));

        edges = vec_perm(tmp2, tmp1, mask);
        align = vec_lvsr(0, dst);
        tmp2 = vec_perm(d, edges, align);
        tmp1 = vec_perm(edges, d, align);

        vec_st(tmp2, 15, dst);
        vec_st(tmp1, 0, dst);

        dst += dst_stride;
    }
}
/* Implemented but could be faster
#define put_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) put_pixels16_l2(d,s1,s2,ds,s1s,16,h)
#define avg_pixels16_l2_altivec(d,s1,s2,ds,s1s,h) avg_pixels16_l2(d,s1,s2,ds,s1s,16,h)
*/

H264_MC(put_, 16, altivec)
H264_MC(avg_, 16, altivec)

/****************************************************************************
 * IDCT transform:
 ****************************************************************************/
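
/* VEC_1D_DCT: one butterfly pass of the 4x4 H.264 inverse transform;
 * the per-line comments give the equivalent scalar computation. */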
#define VEC_1D_DCT(vb0,vb1,vb2,vb3,va0,va1,va2,va3) \
    /* 1st stage */ \
    vz0 = vec_add(vb0,vb2);              /* temp[0] = Y[0] + Y[2] */ \
    vz1 = vec_sub(vb0,vb2);              /* temp[1] = Y[0] - Y[2] */ \
    vz2 = vec_sra(vb1,vec_splat_u16(1)); \
    vz2 = vec_sub(vz2,vb3);              /* temp[2] = Y[1].1/2 - Y[3] */ \
    vz3 = vec_sra(vb3,vec_splat_u16(1)); \
    vz3 = vec_add(vb1,vz3);              /* temp[3] = Y[1] + Y[3].1/2 */ \
    /* 2nd stage: output */ \
    va0 = vec_add(vz0,vz3);              /* x[0] = temp[0] + temp[3] */ \
    va1 = vec_add(vz1,vz2);              /* x[1] = temp[1] + temp[2] */ \
    va2 = vec_sub(vz1,vz2);              /* x[2] = temp[1] - temp[2] */ \
    va3 = vec_sub(vz0,vz3)               /* x[3] = temp[0] - temp[3] */

#define VEC_TRANSPOSE_4(a0,a1,a2,a3,b0,b1,b2,b3) \
    b0 = vec_mergeh( a0, a0 ); \
    b1 = vec_mergeh( a1, a0 ); \
    b2 = vec_mergeh( a2, a0 ); \
    b3 = vec_mergeh( a3, a0 ); \
    a0 = vec_mergeh( b0, b2 ); \
    a1 = vec_mergel( b0, b2 ); \
    a2 = vec_mergeh( b1, b3 ); \
    a3 = vec_mergel( b1, b3 ); \
    b0 = vec_mergeh( a0, a2 ); \
    b1 = vec_mergel( a0, a2 ); \
    b2 = vec_mergeh( a1, a3 ); \
    b3 = vec_mergel( a1, a3 )

#define VEC_LOAD_U8_ADD_S16_STORE_U8(va) \
    vdst_orig = vec_ld(0, dst); \
    vdst = vec_perm(vdst_orig, zero_u8v, vdst_mask); \
    vdst_ss = (vec_s16_t) vec_mergeh(zero_u8v, vdst); \
    va = vec_add(va, vdst_ss); \
    va_u8 = vec_packsu(va, zero_s16v); \
    va_u32 = vec_splat((vec_u32_t)va_u8, 0); \
    vec_ste(va_u32, element, (uint32_t*)dst);
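
/* ff_h264_idct_add_altivec: 4x4 inverse transform of block, with the result
 * scaled down by 6 bits, added to dst and saturated to 8 bits. */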
static void ff_h264_idct_add_altivec(uint8_t *dst, DCTELEM *block, int stride)
{
    vec_s16_t va0, va1, va2, va3;
    vec_s16_t vz0, vz1, vz2, vz3;
    vec_s16_t vtmp0, vtmp1, vtmp2, vtmp3;
    vec_u8_t va_u8;
    vec_u32_t va_u32;
    vec_s16_t vdst_ss;
    const vec_u16_t v6us = vec_splat_u16(6);
    vec_u8_t vdst, vdst_orig;
    vec_u8_t vdst_mask = vec_lvsl(0, dst);
    int element = ((unsigned long)dst & 0xf) >> 2;
    LOAD_ZERO;

    block[0] += 32; /* add 32 as a DC-level for rounding */

    vtmp0 = vec_ld(0,block);
    vtmp1 = vec_sld(vtmp0, vtmp0, 8);
    vtmp2 = vec_ld(16,block);
    vtmp3 = vec_sld(vtmp2, vtmp2, 8);

    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);
    VEC_TRANSPOSE_4(va0,va1,va2,va3,vtmp0,vtmp1,vtmp2,vtmp3);
    VEC_1D_DCT(vtmp0,vtmp1,vtmp2,vtmp3,va0,va1,va2,va3);

    va0 = vec_sra(va0,v6us);
    va1 = vec_sra(va1,v6us);
    va2 = vec_sra(va2,v6us);
    va3 = vec_sra(va3,v6us);

    VEC_LOAD_U8_ADD_S16_STORE_U8(va0);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va1);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va2);
    dst += stride;
    VEC_LOAD_U8_ADD_S16_STORE_U8(va3);
}
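
/* IDCT8_1D_ALTIVEC: one pass of the 8x8 inverse transform on eight vectors;
 * the comments show the scalar reference computation for each step. */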
#define IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7, d0, d1, d2, d3, d4, d5, d6, d7) { \
    /* a0 = SRC(0) + SRC(4); */ \
    vec_s16_t a0v = vec_add(s0, s4); \
    /* a2 = SRC(0) - SRC(4); */ \
    vec_s16_t a2v = vec_sub(s0, s4); \
    /* a4 = (SRC(2)>>1) - SRC(6); */ \
    vec_s16_t a4v = vec_sub(vec_sra(s2, onev), s6); \
    /* a6 = (SRC(6)>>1) + SRC(2); */ \
    vec_s16_t a6v = vec_add(vec_sra(s6, onev), s2); \
    /* b0 = a0 + a6; */ \
    vec_s16_t b0v = vec_add(a0v, a6v); \
    /* b2 = a2 + a4; */ \
    vec_s16_t b2v = vec_add(a2v, a4v); \
    /* b4 = a2 - a4; */ \
    vec_s16_t b4v = vec_sub(a2v, a4v); \
    /* b6 = a0 - a6; */ \
    vec_s16_t b6v = vec_sub(a0v, a6v); \
    /* a1 = SRC(5) - SRC(3) - SRC(7) - (SRC(7)>>1); */ \
    /* a1 = (SRC(5)-SRC(3)) - (SRC(7) + (SRC(7)>>1)); */ \
    vec_s16_t a1v = vec_sub( vec_sub(s5, s3), vec_add(s7, vec_sra(s7, onev)) ); \
    /* a3 = SRC(7) + SRC(1) - SRC(3) - (SRC(3)>>1); */ \
    /* a3 = (SRC(7)+SRC(1)) - (SRC(3) + (SRC(3)>>1)); */ \
    vec_s16_t a3v = vec_sub( vec_add(s7, s1), vec_add(s3, vec_sra(s3, onev)) ); \
    /* a5 = SRC(7) - SRC(1) + SRC(5) + (SRC(5)>>1); */ \
    /* a5 = (SRC(7)-SRC(1)) + SRC(5) + (SRC(5)>>1); */ \
    vec_s16_t a5v = vec_add( vec_sub(s7, s1), vec_add(s5, vec_sra(s5, onev)) ); \
    /* a7 = SRC(5)+SRC(3) + SRC(1) + (SRC(1)>>1); */ \
    vec_s16_t a7v = vec_add( vec_add(s5, s3), vec_add(s1, vec_sra(s1, onev)) ); \
    /* b1 = (a7>>2) + a1; */ \
    vec_s16_t b1v = vec_add( vec_sra(a7v, twov), a1v); \
    /* b3 = a3 + (a5>>2); */ \
    vec_s16_t b3v = vec_add(a3v, vec_sra(a5v, twov)); \
    /* b5 = (a3>>2) - a5; */ \
    vec_s16_t b5v = vec_sub( vec_sra(a3v, twov), a5v); \
    /* b7 = a7 - (a1>>2); */ \
    vec_s16_t b7v = vec_sub( a7v, vec_sra(a1v, twov)); \
    /* DST(0, b0 + b7); */ \
    d0 = vec_add(b0v, b7v); \
    /* DST(1, b2 + b5); */ \
    d1 = vec_add(b2v, b5v); \
    /* DST(2, b4 + b3); */ \
    d2 = vec_add(b4v, b3v); \
    /* DST(3, b6 + b1); */ \
    d3 = vec_add(b6v, b1v); \
    /* DST(4, b6 - b1); */ \
    d4 = vec_sub(b6v, b1v); \
    /* DST(5, b4 - b3); */ \
    d5 = vec_sub(b4v, b3v); \
    /* DST(6, b2 - b5); */ \
    d6 = vec_sub(b2v, b5v); \
    /* DST(7, b0 - b7); */ \
    d7 = vec_sub(b0v, b7v); \
}
#define ALTIVEC_STORE_SUM_CLIP(dest, idctv, perm_ldv, perm_stv, sel) { \
    /* unaligned load */ \
    vec_u8_t hv = vec_ld( 0, dest ); \
    vec_u8_t lv = vec_ld( 7, dest ); \
    vec_u8_t dstv = vec_perm( hv, lv, (vec_u8_t)perm_ldv ); \
    vec_s16_t idct_sh6 = vec_sra(idctv, sixv); \
    vec_u16_t dst16 = (vec_u16_t)vec_mergeh(zero_u8v, dstv); \
    vec_s16_t idstsum = vec_adds(idct_sh6, (vec_s16_t)dst16); \
    vec_u8_t idstsum8 = vec_packsu(zero_s16v, idstsum); \
    vec_u8_t edgehv; \
    /* unaligned store */ \
    vec_u8_t bodyv = vec_perm( idstsum8, idstsum8, perm_stv ); \
    vec_u8_t edgelv = vec_perm( sel, zero_u8v, perm_stv ); \
    lv = vec_sel( lv, bodyv, edgelv ); \
    vec_st( lv, 7, dest ); \
    hv = vec_ld( 0, dest ); \
    edgehv = vec_perm( zero_u8v, sel, perm_stv ); \
    hv = vec_sel( hv, bodyv, edgehv ); \
    vec_st( hv, 0, dest ); \
}
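
/* ff_h264_idct8_add_altivec: full 8x8 inverse transform (two 1D passes with a
 * transpose in between); the >>6-rounded result is added to dst and clipped
 * via ALTIVEC_STORE_SUM_CLIP. */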
void ff_h264_idct8_add_altivec(uint8_t *dst, DCTELEM *dct, int stride) {
    vec_s16_t s0, s1, s2, s3, s4, s5, s6, s7;
    vec_s16_t d0, d1, d2, d3, d4, d5, d6, d7;
    vec_s16_t idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7;

    vec_u8_t perm_ldv = vec_lvsl(0, dst);
    vec_u8_t perm_stv = vec_lvsr(8, dst);

    const vec_u16_t onev = vec_splat_u16(1);
    const vec_u16_t twov = vec_splat_u16(2);
    const vec_u16_t sixv = vec_splat_u16(6);

    const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
                                        -1,-1,-1,-1,-1,-1,-1,-1);
    LOAD_ZERO;

    dct[0] += 32; // rounding for the >>6 at the end

    s0 = vec_ld(0x00, (int16_t*)dct);
    s1 = vec_ld(0x10, (int16_t*)dct);
    s2 = vec_ld(0x20, (int16_t*)dct);
    s3 = vec_ld(0x30, (int16_t*)dct);
    s4 = vec_ld(0x40, (int16_t*)dct);
    s5 = vec_ld(0x50, (int16_t*)dct);
    s6 = vec_ld(0x60, (int16_t*)dct);
    s7 = vec_ld(0x70, (int16_t*)dct);

    IDCT8_1D_ALTIVEC(s0, s1, s2, s3, s4, s5, s6, s7,
                     d0, d1, d2, d3, d4, d5, d6, d7);

    TRANSPOSE8( d0, d1, d2, d3, d4, d5, d6, d7 );

    IDCT8_1D_ALTIVEC(d0, d1, d2, d3, d4, d5, d6, d7,
                     idct0, idct1, idct2, idct3, idct4, idct5, idct6, idct7);

    ALTIVEC_STORE_SUM_CLIP(&dst[0*stride], idct0, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[1*stride], idct1, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[2*stride], idct2, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[3*stride], idct3, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[4*stride], idct4, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[5*stride], idct5, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[6*stride], idct6, perm_ldv, perm_stv, sel);
    ALTIVEC_STORE_SUM_CLIP(&dst[7*stride], idct7, perm_ldv, perm_stv, sel);
}
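
/* transpose4x16: interleaves four 16-byte row vectors into sixteen 4-byte
 * column groups, in the layout write16x4() expects. */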
#define transpose4x16(r0, r1, r2, r3) { \
    register vector unsigned char r4; \
    register vector unsigned char r5; \
    register vector unsigned char r6; \
    register vector unsigned char r7; \
    \
    r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
    r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
    r6 = vec_mergeh(r1, r3); /*1, 3 set 0*/ \
    r7 = vec_mergel(r1, r3); /*1, 3 set 1*/ \
    \
    r0 = vec_mergeh(r4, r6); /*all set 0*/ \
    r1 = vec_mergel(r4, r6); /*all set 1*/ \
    r2 = vec_mergeh(r5, r7); /*all set 2*/ \
    r3 = vec_mergel(r5, r7); /*all set 3*/ \
}
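
/* write16x4: stores the four vectors to an aligned scratch buffer and then
 * copies them out as sixteen rows of 4 bytes at dst_stride intervals. */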
static inline void write16x4(uint8_t *dst, int dst_stride,
                             register vector unsigned char r0, register vector unsigned char r1,
                             register vector unsigned char r2, register vector unsigned char r3) {
    DECLARE_ALIGNED_16(unsigned char, result[64]);
    uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
    int int_dst_stride = dst_stride/4;

    vec_st(r0,  0, result);
    vec_st(r1, 16, result);
    vec_st(r2, 32, result);
    vec_st(r3, 48, result);
    /* FIXME: there has to be a better way!!!! */
    *dst_int = *src_int;
    *(dst_int +    int_dst_stride) = *(src_int + 1);
    *(dst_int +  2*int_dst_stride) = *(src_int + 2);
    *(dst_int +  3*int_dst_stride) = *(src_int + 3);
    *(dst_int +  4*int_dst_stride) = *(src_int + 4);
    *(dst_int +  5*int_dst_stride) = *(src_int + 5);
    *(dst_int +  6*int_dst_stride) = *(src_int + 6);
    *(dst_int +  7*int_dst_stride) = *(src_int + 7);
    *(dst_int +  8*int_dst_stride) = *(src_int + 8);
    *(dst_int +  9*int_dst_stride) = *(src_int + 9);
    *(dst_int + 10*int_dst_stride) = *(src_int + 10);
    *(dst_int + 11*int_dst_stride) = *(src_int + 11);
    *(dst_int + 12*int_dst_stride) = *(src_int + 12);
    *(dst_int + 13*int_dst_stride) = *(src_int + 13);
    *(dst_int + 14*int_dst_stride) = *(src_int + 14);
    *(dst_int + 15*int_dst_stride) = *(src_int + 15);
}
/** \brief reads 16 rows from src and transposes them, returning the six
    leftmost columns as 16-byte vectors in r8..r13
    \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
    out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
    register vector unsigned char r0  = unaligned_load(0,             src);\
    register vector unsigned char r1  = unaligned_load(   src_stride, src);\
    register vector unsigned char r2  = unaligned_load(2* src_stride, src);\
    register vector unsigned char r3  = unaligned_load(3* src_stride, src);\
    register vector unsigned char r4  = unaligned_load(4* src_stride, src);\
    register vector unsigned char r5  = unaligned_load(5* src_stride, src);\
    register vector unsigned char r6  = unaligned_load(6* src_stride, src);\
    register vector unsigned char r7  = unaligned_load(7* src_stride, src);\
    register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
    register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
    \
    r8  = unaligned_load( 8*src_stride, src); \
    r9  = unaligned_load( 9*src_stride, src); \
    r10 = unaligned_load(10*src_stride, src); \
    r11 = unaligned_load(11*src_stride, src); \
    r12 = unaligned_load(12*src_stride, src); \
    r13 = unaligned_load(13*src_stride, src); \
    \
    /*Merge first pairs*/ \
    r0 = vec_mergeh(r0, r8);  /*0, 8*/ \
    r1 = vec_mergeh(r1, r9);  /*1, 9*/ \
    r2 = vec_mergeh(r2, r10); /*2,10*/ \
    r3 = vec_mergeh(r3, r11); /*3,11*/ \
    r4 = vec_mergeh(r4, r12); /*4,12*/ \
    r5 = vec_mergeh(r5, r13); /*5,13*/ \
    r6 = vec_mergeh(r6, r14); /*6,14*/ \
    r7 = vec_mergeh(r7, r15); /*7,15*/ \
    \
    /*Merge second pairs*/ \
    r8  = vec_mergeh(r0, r4); /*0,4, 8,12 set 0*/ \
    r9  = vec_mergel(r0, r4); /*0,4, 8,12 set 1*/ \
    r10 = vec_mergeh(r1, r5); /*1,5, 9,13 set 0*/ \
    r11 = vec_mergel(r1, r5); /*1,5, 9,13 set 1*/ \
    r12 = vec_mergeh(r2, r6); /*2,6,10,14 set 0*/ \
    r13 = vec_mergel(r2, r6); /*2,6,10,14 set 1*/ \
    r14 = vec_mergeh(r3, r7); /*3,7,11,15 set 0*/ \
    r15 = vec_mergel(r3, r7); /*3,7,11,15 set 1*/ \
    \
    /*Third merge*/ \
    r0 = vec_mergeh(r8,  r12); /*0,2,4,6,8,10,12,14 set 0*/ \
    r1 = vec_mergel(r8,  r12); /*0,2,4,6,8,10,12,14 set 1*/ \
    r2 = vec_mergeh(r9,  r13); /*0,2,4,6,8,10,12,14 set 2*/ \
    r4 = vec_mergeh(r10, r14); /*1,3,5,7,9,11,13,15 set 0*/ \
    r5 = vec_mergel(r10, r14); /*1,3,5,7,9,11,13,15 set 1*/ \
    r6 = vec_mergeh(r11, r15); /*1,3,5,7,9,11,13,15 set 2*/ \
    /* Don't need to compute 3 and 7*/ \
    \
    /*Final merge*/ \
    r8  = vec_mergeh(r0, r4); /*all set 0*/ \
    r9  = vec_mergel(r0, r4); /*all set 1*/ \
    r10 = vec_mergeh(r1, r5); /*all set 2*/ \
    r11 = vec_mergel(r1, r5); /*all set 3*/ \
    r12 = vec_mergeh(r2, r6); /*all set 4*/ \
    r13 = vec_mergel(r2, r6); /*all set 5*/ \
    /* Don't need to compute 14 and 15*/ \
}
// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec(register vector unsigned char x,
                                                   register vector unsigned char y,
                                                   register vector unsigned char a) {
    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}
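
// out: mask = 0xff where |p0-q0| < alpha && |p1-p0| < beta && |q1-q0| < beta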
static inline vector unsigned char h264_deblock_mask(register vector unsigned char p0,
                                                     register vector unsigned char p1,
                                                     register vector unsigned char q0,
                                                     register vector unsigned char q1,
                                                     register vector unsigned char alpha,
                                                     register vector unsigned char beta) {
    register vector unsigned char mask;
    register vector unsigned char tempmask;

    mask = diff_lt_altivec(p0, q0, alpha);
    tempmask = diff_lt_altivec(p1, p0, beta);
    mask = vec_and(mask, tempmask);
    tempmask = diff_lt_altivec(q1, q0, beta);
    mask = vec_and(mask, tempmask);

    return mask;
}
// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
                                                   register vector unsigned char p1,
                                                   register vector unsigned char p2,
                                                   register vector unsigned char q0,
                                                   register vector unsigned char tc0) {
    register vector unsigned char average = vec_avg(p0, q0);
    register vector unsigned char temp;
    register vector unsigned char unclipped;
    register vector unsigned char ones;
    register vector unsigned char max;
    register vector unsigned char min;
    register vector unsigned char newp1;

    temp = vec_xor(average, p2);
    average = vec_avg(average, p2);        /* avg(p2, avg(p0, q0)) */
    ones = vec_splat_u8(1);
    temp = vec_and(temp, ones);            /* (p2^avg(p0, q0)) & 1 */
    unclipped = vec_subs(average, temp);   /* (p2+((p0+q0+1)>>1))>>1 */
    max = vec_adds(p1, tc0);
    min = vec_subs(p1, tc0);
    newp1 = vec_max(min, unclipped);
    newp1 = vec_min(max, newp1);
    return newp1;
}
#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
    \
    const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
    \
    register vector unsigned char pq0bit = vec_xor(p0,q0); \
    register vector unsigned char q1minus; \
    register vector unsigned char p0minus; \
    register vector unsigned char stage1; \
    register vector unsigned char stage2; \
    register vector unsigned char vec160; \
    register vector unsigned char delta; \
    register vector unsigned char deltaneg; \
    \
    q1minus = vec_nor(q1, q1);                /* 255 - q1 */ \
    stage1 = vec_avg(p1, q1minus);            /* (p1 - q1 + 256)>>1 */ \
    stage2 = vec_sr(stage1, vec_splat_u8(1)); /* (p1 - q1 + 256)>>2 = 64 + (p1 - q1) >> 2 */ \
    p0minus = vec_nor(p0, p0);                /* 255 - p0 */ \
    stage1 = vec_avg(q0, p0minus);            /* (q0 - p0 + 256)>>1 */ \
    pq0bit = vec_and(pq0bit, vec_splat_u8(1)); \
    stage2 = vec_avg(stage2, pq0bit);         /* 32 + ((q0 - p0)&1 + (p1 - q1) >> 2 + 1) >> 1 */ \
    stage2 = vec_adds(stage2, stage1);        /* 160 + ((p0 - q0) + (p1 - q1) >> 2 + 1) >> 1 */ \
    vec160 = vec_ld(0, &A0v); \
    deltaneg = vec_subs(vec160, stage2);      /* -d */ \
    delta = vec_subs(stage2, vec160);         /* d */ \
    deltaneg = vec_min(tc0masked, deltaneg); \
    delta = vec_min(tc0masked, delta); \
    p0 = vec_subs(p0, deltaneg); \
    q0 = vec_subs(q0, delta); \
    p0 = vec_adds(p0, delta); \
    q0 = vec_adds(q0, deltaneg); \
}
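
/* h264_loop_filter_luma_altivec: normal (bS < 4) luma deblocking of 16 pixels
 * in parallel; tc0 supplies the per-4-pixel clipping values. */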
#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
    DECLARE_ALIGNED_16(unsigned char, temp[16]); \
    register vector unsigned char alphavec; \
    register vector unsigned char betavec; \
    register vector unsigned char mask; \
    register vector unsigned char p1mask; \
    register vector unsigned char q1mask; \
    register vector signed char tc0vec; \
    register vector unsigned char finaltc0; \
    register vector unsigned char tc0masked; \
    register vector unsigned char newp1; \
    register vector unsigned char newq1; \
    \
    temp[0] = alpha; \
    temp[1] = beta; \
    alphavec = vec_ld(0, temp); \
    betavec = vec_splat(alphavec, 0x1); \
    alphavec = vec_splat(alphavec, 0x0); \
    mask = h264_deblock_mask(p0, p1, q0, q1, alphavec, betavec); /* if in block */ \
    \
    *((int *)temp) = *((int *)tc0); \
    tc0vec = vec_ld(0, (signed char*)temp); \
    tc0vec = vec_mergeh(tc0vec, tc0vec); \
    tc0vec = vec_mergeh(tc0vec, tc0vec); \
    mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
    finaltc0 = vec_and((vector unsigned char)tc0vec, mask);    /* tc = tc0 */ \
    \
    p1mask = diff_lt_altivec(p2, p0, betavec); \
    p1mask = vec_and(p1mask, mask);                            /* if ( |p2 - p0| < beta) */ \
    tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec); \
    finaltc0 = vec_sub(finaltc0, p1mask);                      /* tc++ */ \
    newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
    /*end if*/ \
    \
    q1mask = diff_lt_altivec(q2, q0, betavec); \
    q1mask = vec_and(q1mask, mask);                            /* if ( |q2 - q0| < beta ) */ \
    tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec); \
    finaltc0 = vec_sub(finaltc0, q1mask);                      /* tc++ */ \
    newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
    /*end if*/ \
    \
    h264_deblock_p0_q0(p0, p1, q0, q1, finaltc0); \
    p1 = newp1; \
    q1 = newq1; \
}
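
/* Filters the luma edge at pix: rows p2..q2 are loaded from -3*stride to
 * +2*stride, filtered, and the two modified rows on each side written back.
 * The whole edge is skipped when every tc0 value is negative. */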
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
        register vector unsigned char p2 = vec_ld(-3*stride, pix);
        register vector unsigned char p1 = vec_ld(-2*stride, pix);
        register vector unsigned char p0 = vec_ld(-1*stride, pix);
        register vector unsigned char q0 = vec_ld(0, pix);
        register vector unsigned char q1 = vec_ld(stride, pix);
        register vector unsigned char q2 = vec_ld(2*stride, pix);
        h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
        vec_st(p1, -2*stride, pix);
        vec_st(p0, -1*stride, pix);
        vec_st(q0, 0, pix);
        vec_st(q1, stride, pix);
    }
}
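
/* Same filter applied across the other edge direction: 16x6 pixels around pix
 * are transposed into row vectors, filtered, and the four changed middle rows
 * transposed back and written out with write16x4(). */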
static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {

    register vector unsigned char line0, line1, line2, line3, line4, line5;
    if ((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
        return;
    readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
    h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
    transpose4x16(line1, line2, line3, line4);
    write16x4(pix-2, stride, line1, line2, line3, line4);
}
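
/* Registers the AltiVec implementations in the DSPContext when AltiVec is
 * available at runtime; the dspfunc() helper fills the 16 qpel entries. */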
void dsputil_h264_init_ppc(DSPContext* c, AVCodecContext *avctx) {

#ifdef HAVE_ALTIVEC
    if (has_altivec()) {
        c->put_h264_chroma_pixels_tab[0] = put_h264_chroma_mc8_altivec;
        c->put_no_rnd_h264_chroma_pixels_tab[0] = put_no_rnd_h264_chroma_mc8_altivec;
        c->avg_h264_chroma_pixels_tab[0] = avg_h264_chroma_mc8_altivec;
        c->h264_idct_add = ff_h264_idct_add_altivec;
        c->h264_idct8_add = ff_h264_idct8_add_altivec;

        c->h264_v_loop_filter_luma = h264_v_loop_filter_luma_altivec;
        c->h264_h_loop_filter_luma = h264_h_loop_filter_luma_altivec;

#define dspfunc(PFX, IDX, NUM) \
        c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_altivec; \
        c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_altivec; \
        c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_altivec; \
        c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_altivec; \
        c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_altivec; \
        c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_altivec; \
        c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_altivec; \
        c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_altivec; \
        c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_altivec; \
        c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_altivec; \
        c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_altivec; \
        c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_altivec; \
        c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_altivec; \
        c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_altivec; \
        c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_altivec; \
        c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_altivec

        dspfunc(put_h264_qpel, 0, 16);
        dspfunc(avg_h264_qpel, 0, 16);
#undef dspfunc

    } else
#endif /* HAVE_ALTIVEC */
    {
        // Non-AltiVec PPC optimisations
        // ... pending ...
    }
}