You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

640 lines
28KB

  /*
   * Copyright (c) 2004 Romain Dolbeau <romain@dolbeau.org>
   *
   * This library is free software; you can redistribute it and/or
   * modify it under the terms of the GNU Lesser General Public
   * License as published by the Free Software Foundation; either
   * version 2 of the License, or (at your option) any later version.
   *
   * This library is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
   * Lesser General Public License for more details.
   *
   * You should have received a copy of the GNU Lesser General Public
   * License along with this library; if not, write to the Free Software
   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
   */
  18. /* this code assume that stride % 16 == 0 */
  19. void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, int h, int x, int y) {
  20. POWERPC_PERF_DECLARE(PREFIX_h264_chroma_mc8_num, 1);
  21. POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);
  22. signed int ABCD[4] __attribute__((aligned(16)));
  23. register int i;
  24. ABCD[0] = ((8 - x) * (8 - y));
  25. ABCD[1] = ((x) * (8 - y));
  26. ABCD[2] = ((8 - x) * (y));
  27. ABCD[3] = ((x) * (y));
  28. const vector signed int vABCD = vec_ld(0, ABCD);
  29. const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
  30. const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
  31. const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
  32. const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
  33. const vector signed int vzero = vec_splat_s32(0);
  34. const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
  35. const vector unsigned short v6us = vec_splat_u16(6);
  36. vector unsigned char fperm;
  37. if (((unsigned long)dst) % 16 == 0) {
  38. fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
  39. 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F);
  40. } else {
  41. fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
  42. 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
  43. }
  44. register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
  45. register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;
  46. vector unsigned char vsrcAuc;
  47. vector unsigned char vsrcBuc;
  48. vector unsigned char vsrcperm0;
  49. vector unsigned char vsrcperm1;
  50. vsrcAuc = vec_ld(0, src);
  51. if (loadSecond)
  52. vsrcBuc = vec_ld(16, src);
  53. vsrcperm0 = vec_lvsl(0, src);
  54. vsrcperm1 = vec_lvsl(1, src);
  55. vector unsigned char vsrc0uc;
  56. vector unsigned char vsrc1uc;
  57. vsrc0uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm0);
  58. if (reallyBadAlign)
  59. vsrc1uc = vsrcBuc;
  60. else
  61. vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);
  62. vector signed short vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc0uc);
  63. vector signed short vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc1uc);
  64. if (!loadSecond) {// -> !reallyBadAlign
  65. for (i = 0 ; i < h ; i++) {
  66. vector unsigned char vsrcCuc;
  67. vsrcCuc = vec_ld(stride + 0, src);
  68. vector unsigned char vsrc2uc;
  69. vector unsigned char vsrc3uc;
  70. vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
  71. vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);
  72. vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
  73. vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
  74. vector signed short psum;
  75. psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
  76. psum = vec_mladd(vB, vsrc1ssH, psum);
  77. psum = vec_mladd(vC, vsrc2ssH, psum);
  78. psum = vec_mladd(vD, vsrc3ssH, psum);
  79. psum = vec_add(v32ss, psum);
  80. psum = vec_sra(psum, v6us);
  81. vector unsigned char vdst = vec_ld(0, dst);
  82. vector unsigned char ppsum = (vector unsigned char)vec_packsu(psum, psum);
  83. vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
  84. vector unsigned char fsum;
  85. OP_U8_ALTIVEC(fsum, vfdst, vdst);
  86. vec_st(fsum, 0, dst);
  87. vsrc0ssH = vsrc2ssH;
  88. vsrc1ssH = vsrc3ssH;
  89. dst += stride;
  90. src += stride;
  91. }
  92. } else {
  93. for (i = 0 ; i < h ; i++) {
  94. vector unsigned char vsrcCuc;
  95. vector unsigned char vsrcDuc;
  96. vsrcCuc = vec_ld(stride + 0, src);
  97. vsrcDuc = vec_ld(stride + 16, src);
  98. vector unsigned char vsrc2uc;
  99. vector unsigned char vsrc3uc;
  100. vsrc2uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm0);
  101. if (reallyBadAlign)
  102. vsrc3uc = vsrcDuc;
  103. else
  104. vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);
  105. vector signed short vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc2uc);
  106. vector signed short vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero, (vector unsigned char)vsrc3uc);
  107. vector signed short psum;
  108. psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
  109. psum = vec_mladd(vB, vsrc1ssH, psum);
  110. psum = vec_mladd(vC, vsrc2ssH, psum);
  111. psum = vec_mladd(vD, vsrc3ssH, psum);
  112. psum = vec_add(v32ss, psum);
  113. psum = vec_sr(psum, v6us);
  114. vector unsigned char vdst = vec_ld(0, dst);
  115. vector unsigned char ppsum = (vector unsigned char)vec_pack(psum, psum);
  116. vector unsigned char vfdst = vec_perm(vdst, ppsum, fperm);
  117. vector unsigned char fsum;
  118. OP_U8_ALTIVEC(fsum, vfdst, vdst);
  119. vec_st(fsum, 0, dst);
  120. vsrc0ssH = vsrc2ssH;
  121. vsrc1ssH = vsrc3ssH;
  122. dst += stride;
  123. src += stride;
  124. }
  125. }
  126. POWERPC_PERF_STOP_COUNT(PREFIX_h264_chroma_mc8_num, 1);
  127. }
  128. /* this code assume stride % 16 == 0 */
  129. static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  130. POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
  131. POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
  132. register int i;
  133. const vector signed int vzero = vec_splat_s32(0);
  134. const vector unsigned char permM2 = vec_lvsl(-2, src);
  135. const vector unsigned char permM1 = vec_lvsl(-1, src);
  136. const vector unsigned char permP0 = vec_lvsl(+0, src);
  137. const vector unsigned char permP1 = vec_lvsl(+1, src);
  138. const vector unsigned char permP2 = vec_lvsl(+2, src);
  139. const vector unsigned char permP3 = vec_lvsl(+3, src);
  140. const vector signed short v5ss = vec_splat_s16(5);
  141. const vector unsigned short v5us = vec_splat_u16(5);
  142. const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  143. const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
  144. const vector unsigned char dstperm = vec_lvsr(0, dst);
  145. const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  146. const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  147. register int align = ((((unsigned long)src) - 2) % 16);
  148. for (i = 0 ; i < 16 ; i ++) {
  149. vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  150. vector unsigned char srcR1 = vec_ld(-2, src);
  151. vector unsigned char srcR2 = vec_ld(14, src);
  152. switch (align) {
  153. default: {
  154. srcM2 = vec_perm(srcR1, srcR2, permM2);
  155. srcM1 = vec_perm(srcR1, srcR2, permM1);
  156. srcP0 = vec_perm(srcR1, srcR2, permP0);
  157. srcP1 = vec_perm(srcR1, srcR2, permP1);
  158. srcP2 = vec_perm(srcR1, srcR2, permP2);
  159. srcP3 = vec_perm(srcR1, srcR2, permP3);
  160. } break;
  161. case 11: {
  162. srcM2 = vec_perm(srcR1, srcR2, permM2);
  163. srcM1 = vec_perm(srcR1, srcR2, permM1);
  164. srcP0 = vec_perm(srcR1, srcR2, permP0);
  165. srcP1 = vec_perm(srcR1, srcR2, permP1);
  166. srcP2 = vec_perm(srcR1, srcR2, permP2);
  167. srcP3 = srcR2;
  168. } break;
  169. case 12: {
  170. vector unsigned char srcR3 = vec_ld(30, src);
  171. srcM2 = vec_perm(srcR1, srcR2, permM2);
  172. srcM1 = vec_perm(srcR1, srcR2, permM1);
  173. srcP0 = vec_perm(srcR1, srcR2, permP0);
  174. srcP1 = vec_perm(srcR1, srcR2, permP1);
  175. srcP2 = srcR2;
  176. srcP3 = vec_perm(srcR2, srcR3, permP3);
  177. } break;
  178. case 13: {
  179. vector unsigned char srcR3 = vec_ld(30, src);
  180. srcM2 = vec_perm(srcR1, srcR2, permM2);
  181. srcM1 = vec_perm(srcR1, srcR2, permM1);
  182. srcP0 = vec_perm(srcR1, srcR2, permP0);
  183. srcP1 = srcR2;
  184. srcP2 = vec_perm(srcR2, srcR3, permP2);
  185. srcP3 = vec_perm(srcR2, srcR3, permP3);
  186. } break;
  187. case 14: {
  188. vector unsigned char srcR3 = vec_ld(30, src);
  189. srcM2 = vec_perm(srcR1, srcR2, permM2);
  190. srcM1 = vec_perm(srcR1, srcR2, permM1);
  191. srcP0 = srcR2;
  192. srcP1 = vec_perm(srcR2, srcR3, permP1);
  193. srcP2 = vec_perm(srcR2, srcR3, permP2);
  194. srcP3 = vec_perm(srcR2, srcR3, permP3);
  195. } break;
  196. case 15: {
  197. vector unsigned char srcR3 = vec_ld(30, src);
  198. srcM2 = vec_perm(srcR1, srcR2, permM2);
  199. srcM1 = srcR2;
  200. srcP0 = vec_perm(srcR2, srcR3, permP0);
  201. srcP1 = vec_perm(srcR2, srcR3, permP1);
  202. srcP2 = vec_perm(srcR2, srcR3, permP2);
  203. srcP3 = vec_perm(srcR2, srcR3, permP3);
  204. } break;
  205. }
  206. const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
  207. const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
  208. const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
  209. const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
  210. const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
  211. const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
  212. const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
  213. const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
  214. const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
  215. const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
  216. const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
  217. const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
  218. const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
  219. const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
  220. const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
  221. const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
  222. const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
  223. const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
  224. const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
  225. const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
  226. const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
  227. const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
  228. const vector signed short pp3A = vec_add(sum3A, pp1A);
  229. const vector signed short pp3B = vec_add(sum3B, pp1B);
  230. const vector signed short psumA = vec_sub(pp3A, pp2A);
  231. const vector signed short psumB = vec_sub(pp3B, pp2B);
  232. const vector signed short sumA = vec_sra(psumA, v5us);
  233. const vector signed short sumB = vec_sra(psumB, v5us);
  234. const vector unsigned char sum = vec_packsu(sumA, sumB);
  235. const vector unsigned char dst1 = vec_ld(0, dst);
  236. const vector unsigned char dst2 = vec_ld(16, dst);
  237. const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
  238. vector unsigned char fsum;
  239. OP_U8_ALTIVEC(fsum, sum, vdst);
  240. const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
  241. const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
  242. const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
  243. vec_st(fdst1, 0, dst);
  244. vec_st(fdst2, 16, dst);
  245. src += srcStride;
  246. dst += dstStride;
  247. }
  248. POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);
  249. }
  250. /* this code assume stride % 16 == 0 */
  251. static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, int dstStride, int srcStride) {
  252. POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_v_lowpass_num, 1);
  253. POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
  254. register int i;
  255. const vector signed int vzero = vec_splat_s32(0);
  256. const vector unsigned char perm = vec_lvsl(0, src);
  257. const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  258. const vector unsigned short v5us = vec_splat_u16(5);
  259. const vector signed short v5ss = vec_splat_s16(5);
  260. const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
  261. const vector unsigned char dstperm = vec_lvsr(0, dst);
  262. const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  263. const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  264. uint8_t *srcbis = src - (srcStride * 2);
  265. const vector unsigned char srcM2a = vec_ld(0, srcbis);
  266. const vector unsigned char srcM2b = vec_ld(16, srcbis);
  267. const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
  268. srcbis += srcStride;
  269. const vector unsigned char srcM1a = vec_ld(0, srcbis);
  270. const vector unsigned char srcM1b = vec_ld(16, srcbis);
  271. const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
  272. srcbis += srcStride;
  273. const vector unsigned char srcP0a = vec_ld(0, srcbis);
  274. const vector unsigned char srcP0b = vec_ld(16, srcbis);
  275. const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
  276. srcbis += srcStride;
  277. const vector unsigned char srcP1a = vec_ld(0, srcbis);
  278. const vector unsigned char srcP1b = vec_ld(16, srcbis);
  279. const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
  280. srcbis += srcStride;
  281. const vector unsigned char srcP2a = vec_ld(0, srcbis);
  282. const vector unsigned char srcP2b = vec_ld(16, srcbis);
  283. const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
  284. srcbis += srcStride;
  285. vector signed short srcM2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
  286. vector signed short srcM2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
  287. vector signed short srcM1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
  288. vector signed short srcM1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
  289. vector signed short srcP0ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
  290. vector signed short srcP0ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
  291. vector signed short srcP1ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
  292. vector signed short srcP1ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
  293. vector signed short srcP2ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
  294. vector signed short srcP2ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
  295. for (i = 0 ; i < 16 ; i++) {
  296. const vector unsigned char srcP3a = vec_ld(0, srcbis);
  297. const vector unsigned char srcP3b = vec_ld(16, srcbis);
  298. const vector unsigned char srcP3 = vec_perm(srcP3a, srcP3b, perm);
  299. const vector signed short srcP3ssA = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
  300. const vector signed short srcP3ssB = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
  301. srcbis += srcStride;
  302. const vector signed short sum1A = vec_adds(srcP0ssA, srcP1ssA);
  303. const vector signed short sum1B = vec_adds(srcP0ssB, srcP1ssB);
  304. const vector signed short sum2A = vec_adds(srcM1ssA, srcP2ssA);
  305. const vector signed short sum2B = vec_adds(srcM1ssB, srcP2ssB);
  306. const vector signed short sum3A = vec_adds(srcM2ssA, srcP3ssA);
  307. const vector signed short sum3B = vec_adds(srcM2ssB, srcP3ssB);
  308. srcM2ssA = srcM1ssA;
  309. srcM2ssB = srcM1ssB;
  310. srcM1ssA = srcP0ssA;
  311. srcM1ssB = srcP0ssB;
  312. srcP0ssA = srcP1ssA;
  313. srcP0ssB = srcP1ssB;
  314. srcP1ssA = srcP2ssA;
  315. srcP1ssB = srcP2ssB;
  316. srcP2ssA = srcP3ssA;
  317. srcP2ssB = srcP3ssB;
  318. const vector signed short pp1A = vec_mladd(sum1A, v20ss, v16ss);
  319. const vector signed short pp1B = vec_mladd(sum1B, v20ss, v16ss);
  320. const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
  321. const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
  322. const vector signed short pp3A = vec_add(sum3A, pp1A);
  323. const vector signed short pp3B = vec_add(sum3B, pp1B);
  324. const vector signed short psumA = vec_sub(pp3A, pp2A);
  325. const vector signed short psumB = vec_sub(pp3B, pp2B);
  326. const vector signed short sumA = vec_sra(psumA, v5us);
  327. const vector signed short sumB = vec_sra(psumB, v5us);
  328. const vector unsigned char sum = vec_packsu(sumA, sumB);
  329. const vector unsigned char dst1 = vec_ld(0, dst);
  330. const vector unsigned char dst2 = vec_ld(16, dst);
  331. const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
  332. vector unsigned char fsum;
  333. OP_U8_ALTIVEC(fsum, sum, vdst);
  334. const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
  335. const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
  336. const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
  337. vec_st(fdst1, 0, dst);
  338. vec_st(fdst2, 16, dst);
  339. dst += dstStride;
  340. }
  341. POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);
  342. }
  343. /* this code assume stride % 16 == 0 *and* tmp is properly aligned */
  344. static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
  345. POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  346. POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  347. register int i;
  348. const vector signed int vzero = vec_splat_s32(0);
  349. const vector unsigned char permM2 = vec_lvsl(-2, src);
  350. const vector unsigned char permM1 = vec_lvsl(-1, src);
  351. const vector unsigned char permP0 = vec_lvsl(+0, src);
  352. const vector unsigned char permP1 = vec_lvsl(+1, src);
  353. const vector unsigned char permP2 = vec_lvsl(+2, src);
  354. const vector unsigned char permP3 = vec_lvsl(+3, src);
  355. const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
  356. const vector unsigned int v10ui = vec_splat_u32(10);
  357. const vector signed short v5ss = vec_splat_s16(5);
  358. const vector signed short v1ss = vec_splat_s16(1);
  359. const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
  360. const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
  361. register int align = ((((unsigned long)src) - 2) % 16);
  362. src -= (2 * srcStride);
  363. for (i = 0 ; i < 21 ; i ++) {
  364. vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
  365. vector unsigned char srcR1 = vec_ld(-2, src);
  366. vector unsigned char srcR2 = vec_ld(14, src);
  367. switch (align) {
  368. default: {
  369. srcM2 = vec_perm(srcR1, srcR2, permM2);
  370. srcM1 = vec_perm(srcR1, srcR2, permM1);
  371. srcP0 = vec_perm(srcR1, srcR2, permP0);
  372. srcP1 = vec_perm(srcR1, srcR2, permP1);
  373. srcP2 = vec_perm(srcR1, srcR2, permP2);
  374. srcP3 = vec_perm(srcR1, srcR2, permP3);
  375. } break;
  376. case 11: {
  377. srcM2 = vec_perm(srcR1, srcR2, permM2);
  378. srcM1 = vec_perm(srcR1, srcR2, permM1);
  379. srcP0 = vec_perm(srcR1, srcR2, permP0);
  380. srcP1 = vec_perm(srcR1, srcR2, permP1);
  381. srcP2 = vec_perm(srcR1, srcR2, permP2);
  382. srcP3 = srcR2;
  383. } break;
  384. case 12: {
  385. vector unsigned char srcR3 = vec_ld(30, src);
  386. srcM2 = vec_perm(srcR1, srcR2, permM2);
  387. srcM1 = vec_perm(srcR1, srcR2, permM1);
  388. srcP0 = vec_perm(srcR1, srcR2, permP0);
  389. srcP1 = vec_perm(srcR1, srcR2, permP1);
  390. srcP2 = srcR2;
  391. srcP3 = vec_perm(srcR2, srcR3, permP3);
  392. } break;
  393. case 13: {
  394. vector unsigned char srcR3 = vec_ld(30, src);
  395. srcM2 = vec_perm(srcR1, srcR2, permM2);
  396. srcM1 = vec_perm(srcR1, srcR2, permM1);
  397. srcP0 = vec_perm(srcR1, srcR2, permP0);
  398. srcP1 = srcR2;
  399. srcP2 = vec_perm(srcR2, srcR3, permP2);
  400. srcP3 = vec_perm(srcR2, srcR3, permP3);
  401. } break;
  402. case 14: {
  403. vector unsigned char srcR3 = vec_ld(30, src);
  404. srcM2 = vec_perm(srcR1, srcR2, permM2);
  405. srcM1 = vec_perm(srcR1, srcR2, permM1);
  406. srcP0 = srcR2;
  407. srcP1 = vec_perm(srcR2, srcR3, permP1);
  408. srcP2 = vec_perm(srcR2, srcR3, permP2);
  409. srcP3 = vec_perm(srcR2, srcR3, permP3);
  410. } break;
  411. case 15: {
  412. vector unsigned char srcR3 = vec_ld(30, src);
  413. srcM2 = vec_perm(srcR1, srcR2, permM2);
  414. srcM1 = srcR2;
  415. srcP0 = vec_perm(srcR2, srcR3, permP0);
  416. srcP1 = vec_perm(srcR2, srcR3, permP1);
  417. srcP2 = vec_perm(srcR2, srcR3, permP2);
  418. srcP3 = vec_perm(srcR2, srcR3, permP3);
  419. } break;
  420. }
  421. const vector signed short srcP0A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP0);
  422. const vector signed short srcP0B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP0);
  423. const vector signed short srcP1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP1);
  424. const vector signed short srcP1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP1);
  425. const vector signed short srcP2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP2);
  426. const vector signed short srcP2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP2);
  427. const vector signed short srcP3A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcP3);
  428. const vector signed short srcP3B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcP3);
  429. const vector signed short srcM1A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM1);
  430. const vector signed short srcM1B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM1);
  431. const vector signed short srcM2A = (vector signed short)vec_mergeh((vector unsigned char)vzero, srcM2);
  432. const vector signed short srcM2B = (vector signed short)vec_mergel((vector unsigned char)vzero, srcM2);
  433. const vector signed short sum1A = vec_adds(srcP0A, srcP1A);
  434. const vector signed short sum1B = vec_adds(srcP0B, srcP1B);
  435. const vector signed short sum2A = vec_adds(srcM1A, srcP2A);
  436. const vector signed short sum2B = vec_adds(srcM1B, srcP2B);
  437. const vector signed short sum3A = vec_adds(srcM2A, srcP3A);
  438. const vector signed short sum3B = vec_adds(srcM2B, srcP3B);
  439. const vector signed short pp1A = vec_mladd(sum1A, v20ss, sum3A);
  440. const vector signed short pp1B = vec_mladd(sum1B, v20ss, sum3B);
  441. const vector signed short pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
  442. const vector signed short pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
  443. const vector signed short psumA = vec_sub(pp1A, pp2A);
  444. const vector signed short psumB = vec_sub(pp1B, pp2B);
  445. vec_st(psumA, 0, tmp);
  446. vec_st(psumB, 16, tmp);
  447. src += srcStride;
  448. tmp += tmpStride; /* int16_t*, and stride is 16, so it's OK here */
  449. }
  450. const vector unsigned char dstperm = vec_lvsr(0, dst);
  451. const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
  452. const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
  453. const vector unsigned char mperm = (const vector unsigned char)
  454. AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
  455. 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
  456. int16_t *tmpbis = tmp - (tmpStride * 21);
  457. vector signed short tmpM2ssA = vec_ld(0, tmpbis);
  458. vector signed short tmpM2ssB = vec_ld(16, tmpbis);
  459. tmpbis += tmpStride;
  460. vector signed short tmpM1ssA = vec_ld(0, tmpbis);
  461. vector signed short tmpM1ssB = vec_ld(16, tmpbis);
  462. tmpbis += tmpStride;
  463. vector signed short tmpP0ssA = vec_ld(0, tmpbis);
  464. vector signed short tmpP0ssB = vec_ld(16, tmpbis);
  465. tmpbis += tmpStride;
  466. vector signed short tmpP1ssA = vec_ld(0, tmpbis);
  467. vector signed short tmpP1ssB = vec_ld(16, tmpbis);
  468. tmpbis += tmpStride;
  469. vector signed short tmpP2ssA = vec_ld(0, tmpbis);
  470. vector signed short tmpP2ssB = vec_ld(16, tmpbis);
  471. tmpbis += tmpStride;
  472. for (i = 0 ; i < 16 ; i++) {
  473. const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
  474. const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
  475. tmpbis += tmpStride;
  476. const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
  477. const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
  478. const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
  479. const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
  480. const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
  481. const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
  482. tmpM2ssA = tmpM1ssA;
  483. tmpM2ssB = tmpM1ssB;
  484. tmpM1ssA = tmpP0ssA;
  485. tmpM1ssB = tmpP0ssB;
  486. tmpP0ssA = tmpP1ssA;
  487. tmpP0ssB = tmpP1ssB;
  488. tmpP1ssA = tmpP2ssA;
  489. tmpP1ssB = tmpP2ssB;
  490. tmpP2ssA = tmpP3ssA;
  491. tmpP2ssB = tmpP3ssB;
  492. const vector signed int pp1Ae = vec_mule(sum1A, v20ss);
  493. const vector signed int pp1Ao = vec_mulo(sum1A, v20ss);
  494. const vector signed int pp1Be = vec_mule(sum1B, v20ss);
  495. const vector signed int pp1Bo = vec_mulo(sum1B, v20ss);
  496. const vector signed int pp2Ae = vec_mule(sum2A, v5ss);
  497. const vector signed int pp2Ao = vec_mulo(sum2A, v5ss);
  498. const vector signed int pp2Be = vec_mule(sum2B, v5ss);
  499. const vector signed int pp2Bo = vec_mulo(sum2B, v5ss);
  500. const vector signed int pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
  501. const vector signed int pp3Ao = vec_mulo(sum3A, v1ss);
  502. const vector signed int pp3Be = vec_sra((vector signed int)sum3B, v16ui);
  503. const vector signed int pp3Bo = vec_mulo(sum3B, v1ss);
  504. const vector signed int pp1cAe = vec_add(pp1Ae, v512si);
  505. const vector signed int pp1cAo = vec_add(pp1Ao, v512si);
  506. const vector signed int pp1cBe = vec_add(pp1Be, v512si);
  507. const vector signed int pp1cBo = vec_add(pp1Bo, v512si);
  508. const vector signed int pp32Ae = vec_sub(pp3Ae, pp2Ae);
  509. const vector signed int pp32Ao = vec_sub(pp3Ao, pp2Ao);
  510. const vector signed int pp32Be = vec_sub(pp3Be, pp2Be);
  511. const vector signed int pp32Bo = vec_sub(pp3Bo, pp2Bo);
  512. const vector signed int sumAe = vec_add(pp1cAe, pp32Ae);
  513. const vector signed int sumAo = vec_add(pp1cAo, pp32Ao);
  514. const vector signed int sumBe = vec_add(pp1cBe, pp32Be);
  515. const vector signed int sumBo = vec_add(pp1cBo, pp32Bo);
  516. const vector signed int ssumAe = vec_sra(sumAe, v10ui);
  517. const vector signed int ssumAo = vec_sra(sumAo, v10ui);
  518. const vector signed int ssumBe = vec_sra(sumBe, v10ui);
  519. const vector signed int ssumBo = vec_sra(sumBo, v10ui);
  520. const vector signed short ssume = vec_packs(ssumAe, ssumBe);
  521. const vector signed short ssumo = vec_packs(ssumAo, ssumBo);
  522. const vector unsigned char sumv = vec_packsu(ssume, ssumo);
  523. const vector unsigned char sum = vec_perm(sumv, sumv, mperm);
  524. const vector unsigned char dst1 = vec_ld(0, dst);
  525. const vector unsigned char dst2 = vec_ld(16, dst);
  526. const vector unsigned char vdst = vec_perm(dst1, dst2, vec_lvsl(0, dst));
  527. vector unsigned char fsum;
  528. OP_U8_ALTIVEC(fsum, sum, vdst);
  529. const vector unsigned char rsum = vec_perm(fsum, fsum, dstperm);
  530. const vector unsigned char fdst1 = vec_sel(dst1, rsum, dstmask);
  531. const vector unsigned char fdst2 = vec_sel(rsum, dst2, dstmask);
  532. vec_st(fdst1, 0, dst);
  533. vec_st(fdst2, 16, dst);
  534. dst += dstStride;
  535. }
  536. POWERPC_PERF_STOP_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
  537. }