/*
    AltiVec optimizations (C) 2004 Romain Dolbeau <romain@dolbeau.org>

    based on code by Copyright (C) 2001-2003 Michael Niedermayer (michaelni@gmx.at)

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
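
/* Vector-literal syntax differs between compilers: Apple's Darwin toolchain
   expects parenthesized literals, while GCC elsewhere uses braces. */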
#ifdef CONFIG_DARWIN
#define AVV(x...) (x)
#else
#define AVV(x...) {x}
#endif
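
/* Transpose an 8x8 block of 16-bit elements held in eight vector registers,
   using three rounds of vec_mergeh/vec_mergel; the result overwrites
   src_a..src_h in place. */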
#define ALTIVEC_TRANSPOSE_8x8_SHORT(src_a,src_b,src_c,src_d,src_e,src_f,src_g,src_h) \
    do { \
        __typeof__(src_a) tempA1, tempB1, tempC1, tempD1; \
        __typeof__(src_a) tempE1, tempF1, tempG1, tempH1; \
        __typeof__(src_a) tempA2, tempB2, tempC2, tempD2; \
        __typeof__(src_a) tempE2, tempF2, tempG2, tempH2; \
        tempA1 = vec_mergeh(src_a, src_e); \
        tempB1 = vec_mergel(src_a, src_e); \
        tempC1 = vec_mergeh(src_b, src_f); \
        tempD1 = vec_mergel(src_b, src_f); \
        tempE1 = vec_mergeh(src_c, src_g); \
        tempF1 = vec_mergel(src_c, src_g); \
        tempG1 = vec_mergeh(src_d, src_h); \
        tempH1 = vec_mergel(src_d, src_h); \
        tempA2 = vec_mergeh(tempA1, tempE1); \
        tempB2 = vec_mergel(tempA1, tempE1); \
        tempC2 = vec_mergeh(tempB1, tempF1); \
        tempD2 = vec_mergel(tempB1, tempF1); \
        tempE2 = vec_mergeh(tempC1, tempG1); \
        tempF2 = vec_mergel(tempC1, tempG1); \
        tempG2 = vec_mergeh(tempD1, tempH1); \
        tempH2 = vec_mergel(tempD1, tempH1); \
        src_a = vec_mergeh(tempA2, tempE2); \
        src_b = vec_mergel(tempA2, tempE2); \
        src_c = vec_mergeh(tempB2, tempF2); \
        src_d = vec_mergel(tempB2, tempF2); \
        src_e = vec_mergeh(tempC2, tempG2); \
        src_f = vec_mergel(tempC2, tempG2); \
        src_g = vec_mergeh(tempD2, tempH2); \
        src_h = vec_mergel(tempD2, tempH2); \
    } while (0)
static inline int vertClassify_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
      This code makes no assumptions about src or stride.
      One could avoid recomputing the permutation vector by
      assuming (stride % 16) == 0, but unfortunately that is
      not always true.
    */
    register int y;
    short __attribute__ ((aligned(16))) data[8];
    int numEq;
    uint8_t *src2 = src;
    vector signed short v_dcOffset;
    vector signed short v2QP;
    vector unsigned short v4QP;
    vector unsigned short v_dcThreshold;
    int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short mask = vec_splat_s16(1);
    vector signed int v_numEq = vec_splat_s32(0);

    data[0] = ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1;
    data[1] = data[0] * 2 + 1;
    data[2] = c->QP * 2;
    data[3] = c->QP * 4;
    vector signed short v_data = vec_ld(0, data);
    v_dcOffset = vec_splat(v_data, 0);
    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
    v2QP = vec_splat(v_data, 2);
    v4QP = (vector unsigned short)vec_splat(v_data, 3);

    src2 += stride * 4;
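
/* Load one (possibly unaligned) row of 8 pixels and zero-extend it to
   signed 16-bit. A second aligned vector is fetched only when the row
   can straddle a 16-byte boundary (two_vectors). */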
#define LOAD_LINE(i) \
    register int j##i = i * stride; \
    vector unsigned char perm##i = vec_lvsl(j##i, src2); \
    const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
    vector unsigned char v_srcA2##i; \
    if (two_vectors) \
        v_srcA2##i = vec_ld(j##i + 16, src2); \
    const vector unsigned char v_srcA##i = \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
    vector signed short v_srcAss##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_srcA##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
#undef LOAD_LINE
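
/* For each pair of adjacent rows, flag the columns whose difference is below
   the DC threshold (i.e. "flat") and accumulate the flags into v_numEq. */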
#define ITER(i, j) \
    const vector signed short v_diff##i = \
        vec_sub(v_srcAss##i, v_srcAss##j); \
    const vector signed short v_sum##i = \
        vec_add(v_diff##i, v_dcOffset); \
    const vector signed short v_comp##i = \
        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                       v_dcThreshold); \
    const vector signed short v_part##i = vec_and(mask, v_comp##i); \
    v_numEq = vec_sum4s(v_part##i, v_numEq);

    ITER(0, 1);
    ITER(1, 2);
    ITER(2, 3);
    ITER(3, 4);
    ITER(4, 5);
    ITER(5, 6);
    ITER(6, 7);
#undef ITER

    v_numEq = vec_sums(v_numEq, zero);
    v_numEq = vec_splat(v_numEq, 3);
    vec_ste(v_numEq, 0, &numEq);

    if (numEq > c->ppMode.flatnessThreshold) {
        const vector unsigned char mmoP1 = (const vector unsigned char)
            AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
                0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
        const vector unsigned char mmoP2 = (const vector unsigned char)
            AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
                0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
        const vector unsigned char mmoP = (const vector unsigned char)
            vec_lvsl(8, (unsigned char*)0);
        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
        if (vec_any_gt(mmoSum, v4QP))
            return 0;
        else
            return 1;
    }
    else return 2;
}
/* This is the same as vertClassify_altivec, with an added 8x8 transpose
   after the loading and without the stride*4 offset. */
static inline int horizClassify_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
      This code makes no assumptions about src or stride.
      One could avoid recomputing the permutation vector by
      assuming (stride % 16) == 0, but unfortunately that is
      not always true.
    */
    register int y;
    short __attribute__ ((aligned(16))) data[8];
    int numEq;
    uint8_t *src2 = src;
    vector signed short v_dcOffset;
    vector signed short v2QP;
    vector unsigned short v4QP;
    vector unsigned short v_dcThreshold;
    int two_vectors = ((((unsigned long)src2 % 16) > 8) || (stride % 16)) ? 1 : 0;
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short mask = vec_splat_s16(1);
    vector signed int v_numEq = vec_splat_s32(0);

    data[0] = ((c->nonBQP * c->ppMode.baseDcDiff) >> 8) + 1;
    data[1] = data[0] * 2 + 1;
    data[2] = c->QP * 2;
    data[3] = c->QP * 4;
    vector signed short v_data = vec_ld(0, data);
    v_dcOffset = vec_splat(v_data, 0);
    v_dcThreshold = (vector unsigned short)vec_splat(v_data, 1);
    v2QP = vec_splat(v_data, 2);
    v4QP = (vector unsigned short)vec_splat(v_data, 3);

    // src2 += stride * 4;
#define LOAD_LINE(i) \
    register int j##i = i * stride; \
    vector unsigned char perm##i = vec_lvsl(j##i, src2); \
    const vector unsigned char v_srcA1##i = vec_ld(j##i, src2); \
    vector unsigned char v_srcA2##i; \
    if (two_vectors) \
        v_srcA2##i = vec_ld(j##i + 16, src2); \
    const vector unsigned char v_srcA##i = \
        vec_perm(v_srcA1##i, v_srcA2##i, perm##i); \
    vector signed short v_srcAss##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_srcA##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
#undef LOAD_LINE

    ALTIVEC_TRANSPOSE_8x8_SHORT(v_srcAss0,
                                v_srcAss1,
                                v_srcAss2,
                                v_srcAss3,
                                v_srcAss4,
                                v_srcAss5,
                                v_srcAss6,
                                v_srcAss7);

#define ITER(i, j) \
    const vector signed short v_diff##i = \
        vec_sub(v_srcAss##i, v_srcAss##j); \
    const vector signed short v_sum##i = \
        vec_add(v_diff##i, v_dcOffset); \
    const vector signed short v_comp##i = \
        (vector signed short)vec_cmplt((vector unsigned short)v_sum##i, \
                                       v_dcThreshold); \
    const vector signed short v_part##i = vec_and(mask, v_comp##i); \
    v_numEq = vec_sum4s(v_part##i, v_numEq);

    ITER(0, 1);
    ITER(1, 2);
    ITER(2, 3);
    ITER(3, 4);
    ITER(4, 5);
    ITER(5, 6);
    ITER(6, 7);
#undef ITER

    v_numEq = vec_sums(v_numEq, zero);
    v_numEq = vec_splat(v_numEq, 3);
    vec_ste(v_numEq, 0, &numEq);

    if (numEq > c->ppMode.flatnessThreshold) {
        const vector unsigned char mmoP1 = (const vector unsigned char)
            AVV(0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f,
                0x00, 0x01, 0x12, 0x13, 0x08, 0x09, 0x1A, 0x1B);
        const vector unsigned char mmoP2 = (const vector unsigned char)
            AVV(0x04, 0x05, 0x16, 0x17, 0x0C, 0x0D, 0x1E, 0x1F,
                0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f, 0x1f);
        const vector unsigned char mmoP = (const vector unsigned char)
            vec_lvsl(8, (unsigned char*)0);
        vector signed short mmoL1 = vec_perm(v_srcAss0, v_srcAss2, mmoP1);
        vector signed short mmoL2 = vec_perm(v_srcAss4, v_srcAss6, mmoP2);
        vector signed short mmoL = vec_perm(mmoL1, mmoL2, mmoP);
        vector signed short mmoR1 = vec_perm(v_srcAss5, v_srcAss7, mmoP1);
        vector signed short mmoR2 = vec_perm(v_srcAss1, v_srcAss3, mmoP2);
        vector signed short mmoR = vec_perm(mmoR1, mmoR2, mmoP);
        vector signed short mmoDiff = vec_sub(mmoL, mmoR);
        vector unsigned short mmoSum = (vector unsigned short)vec_add(mmoDiff, v2QP);
        if (vec_any_gt(mmoSum, v4QP))
            return 0;
        else
            return 1;
    }
    else return 2;
}
static inline void doVertLowPass_altivec(uint8_t *src, int stride, PPContext *c) {
    /*
      This code makes no assumptions about src or stride.
      One could avoid recomputing the permutation vector by
      assuming (stride % 16) == 0, but unfortunately that is
      not always true. Quite a lot of loads/stores could also
      be removed by assuming proper alignment of src & stride :-(
    */
    uint8_t *src2 = src;
    const vector signed int zero = vec_splat_s32(0);
    short __attribute__ ((aligned(16))) qp[8];
    qp[0] = c->QP;
    vector signed short vqp = vec_ld(0, qp);
    vqp = vec_splat(vqp, 0);
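
/* Load the 8 rows to be filtered plus one row of context above and below
   (10 rows total), each widened to signed 16-bit. */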
#define LOAD_LINE(i) \
    const vector unsigned char perml##i = \
        vec_lvsl(i * stride, src2); \
    const vector unsigned char vbA##i = \
        vec_ld(i * stride, src2); \
    const vector unsigned char vbB##i = \
        vec_ld(i * stride + 16, src2); \
    const vector unsigned char vbT##i = \
        vec_perm(vbA##i, vbB##i, perml##i); \
    const vector signed short vb##i = \
        (vector signed short)vec_mergeh((vector unsigned char)zero, \
                                        (vector unsigned char)vbT##i)

    src2 += stride*3;

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
    LOAD_LINE(9);
#undef LOAD_LINE
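
/* Sliding sums: each v_sumsB is derived from the previous one by dropping
   the oldest sample and adding a new one; COMPUTE_VR then forms the filtered
   row j as (v_sumsB[i] + v_sumsB[k] + 2*vb[j]) >> 4. */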
    const vector unsigned short v_1 = vec_splat_u16(1);
    const vector unsigned short v_2 = vec_splat_u16(2);
    const vector unsigned short v_4 = vec_splat_u16(4);

    const vector signed short v_diff01 = vec_sub(vb0, vb1);
    const vector unsigned short v_cmp01 =
        (const vector unsigned short) vec_cmplt(vec_abs(v_diff01), vqp);
    const vector signed short v_first = vec_sel(vb1, vb0, v_cmp01);
    const vector signed short v_diff89 = vec_sub(vb8, vb9);
    const vector unsigned short v_cmp89 =
        (const vector unsigned short) vec_cmplt(vec_abs(v_diff89), vqp);
    const vector signed short v_last = vec_sel(vb8, vb9, v_cmp89);

    const vector signed short temp01 = vec_mladd(v_first, (vector signed short)v_4, vb1);
    const vector signed short temp02 = vec_add(vb2, vb3);
    const vector signed short temp03 = vec_add(temp01, (vector signed short)v_4);
    const vector signed short v_sumsB0 = vec_add(temp02, temp03);

    const vector signed short temp11 = vec_sub(v_sumsB0, v_first);
    const vector signed short v_sumsB1 = vec_add(temp11, vb4);

    const vector signed short temp21 = vec_sub(v_sumsB1, v_first);
    const vector signed short v_sumsB2 = vec_add(temp21, vb5);

    const vector signed short temp31 = vec_sub(v_sumsB2, v_first);
    const vector signed short v_sumsB3 = vec_add(temp31, vb6);

    const vector signed short temp41 = vec_sub(v_sumsB3, v_first);
    const vector signed short v_sumsB4 = vec_add(temp41, vb7);

    const vector signed short temp51 = vec_sub(v_sumsB4, vb1);
    const vector signed short v_sumsB5 = vec_add(temp51, vb8);

    const vector signed short temp61 = vec_sub(v_sumsB5, vb2);
    const vector signed short v_sumsB6 = vec_add(temp61, v_last);

    const vector signed short temp71 = vec_sub(v_sumsB6, vb3);
    const vector signed short v_sumsB7 = vec_add(temp71, v_last);

    const vector signed short temp81 = vec_sub(v_sumsB7, vb4);
    const vector signed short v_sumsB8 = vec_add(temp81, v_last);

    const vector signed short temp91 = vec_sub(v_sumsB8, vb5);
    const vector signed short v_sumsB9 = vec_add(temp91, v_last);

#define COMPUTE_VR(i, j, k) \
    const vector signed short temps1##i = \
        vec_add(v_sumsB##i, v_sumsB##k); \
    const vector signed short temps2##i = \
        vec_mladd(vb##j, (vector signed short)v_2, temps1##i); \
    const vector signed short vr##j = vec_sra(temps2##i, v_4)

    COMPUTE_VR(0, 1, 2);
    COMPUTE_VR(1, 2, 3);
    COMPUTE_VR(2, 3, 4);
    COMPUTE_VR(3, 4, 5);
    COMPUTE_VR(4, 5, 6);
    COMPUTE_VR(5, 6, 7);
    COMPUTE_VR(6, 7, 8);
    COMPUTE_VR(7, 8, 9);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);
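
/* Repack the filtered rows to unsigned bytes, splice them with the untouched
   bytes 8..15 of the original row (permHH), then use vec_lvsr-derived masks
   to merge the result back into the two aligned vectors covering each
   unaligned row. */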
#define PACK_AND_STORE(i) \
    const vector unsigned char perms##i = \
        vec_lvsr(i * stride, src2); \
    const vector unsigned char vf##i = \
        vec_packsu(vr##i, (vector signed short)zero); \
    const vector unsigned char vg##i = \
        vec_perm(vf##i, vbT##i, permHH); \
    const vector unsigned char mask##i = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i = \
        vec_perm(vg##i, vg##i, perms##i); \
    const vector unsigned char svA##i = \
        vec_sel(vbA##i, vg2##i, mask##i); \
    const vector unsigned char svB##i = \
        vec_sel(vg2##i, vbB##i, mask##i); \
    vec_st(svA##i, i * stride, src2); \
    vec_st(svB##i, i * stride + 16, src2)

    PACK_AND_STORE(1);
    PACK_AND_STORE(2);
    PACK_AND_STORE(3);
    PACK_AND_STORE(4);
    PACK_AND_STORE(5);
    PACK_AND_STORE(6);
    PACK_AND_STORE(7);
    PACK_AND_STORE(8);
#undef PACK_AND_STORE
}
static inline void doVertDefFilter_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
      This code makes no assumptions about src or stride.
      One could avoid recomputing the permutation vector by
      assuming (stride % 16) == 0, but unfortunately that is
      not always true. Quite a lot of loads/stores could also
      be removed by assuming proper alignment of src & stride :-(
    */
    uint8_t *src2 = src;
    const vector signed int zero = vec_splat_s32(0);
    short __attribute__ ((aligned(16))) qp[8];
    qp[0] = 8*c->QP;
    vector signed short vqp = vec_ld(0, qp);
    vqp = vec_splat(vqp, 0);
#define LOAD_LINE(i) \
    const vector unsigned char perm##i = \
        vec_lvsl(i * stride, src2); \
    const vector unsigned char vbA##i = \
        vec_ld(i * stride, src2); \
    const vector unsigned char vbB##i = \
        vec_ld(i * stride + 16, src2); \
    const vector unsigned char vbT##i = \
        vec_perm(vbA##i, vbB##i, perm##i); \
    const vector signed short vb##i = \
        (vector signed short)vec_mergeh((vector unsigned char)zero, \
                                        (vector unsigned char)vbT##i)

    src2 += stride*3;

    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
#undef LOAD_LINE
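
/* Default deblocking filter: estimate the "energy" across the block edge
   (rows 4/5) and on either side of it, derive a correction d, and apply
   +/- d to rows 4 and 5 only when the middle energy is below 8*QP. */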
    const vector signed short v_1 = vec_splat_s16(1);
    const vector signed short v_2 = vec_splat_s16(2);
    const vector signed short v_5 = vec_splat_s16(5);
    const vector signed short v_32 = vec_sl(v_1,
                                            (vector unsigned short)v_5);

    /* middle energy */
    const vector signed short l3minusl6 = vec_sub(vb3, vb6);
    const vector signed short l5minusl4 = vec_sub(vb5, vb4);
    const vector signed short twotimes_l3minusl6 = vec_mladd(v_2, l3minusl6, (vector signed short)zero);
    const vector signed short mE = vec_mladd(v_5, l5minusl4, twotimes_l3minusl6);
    const vector signed short absmE = vec_abs(mE);

    /* left & right energy */
    const vector signed short l1minusl4 = vec_sub(vb1, vb4);
    const vector signed short l3minusl2 = vec_sub(vb3, vb2);
    const vector signed short l5minusl8 = vec_sub(vb5, vb8);
    const vector signed short l7minusl6 = vec_sub(vb7, vb6);
    const vector signed short twotimes_l1minusl4 = vec_mladd(v_2, l1minusl4, (vector signed short)zero);
    const vector signed short twotimes_l5minusl8 = vec_mladd(v_2, l5minusl8, (vector signed short)zero);
    const vector signed short lE = vec_mladd(v_5, l3minusl2, twotimes_l1minusl4);
    const vector signed short rE = vec_mladd(v_5, l7minusl6, twotimes_l5minusl8);

    /* d */
    const vector signed short ddiff = vec_sub(absmE,
                                              vec_min(vec_abs(lE),
                                                      vec_abs(rE)));
    const vector signed short ddiffclamp = vec_max(ddiff, (vector signed short)zero);
    const vector signed short dtimes64 = vec_mladd(v_5, ddiffclamp, v_32);
    const vector signed short d = vec_sra(dtimes64, vec_splat_u16(6));
    const vector signed short minusd = vec_sub((vector signed short)zero, d);
    const vector signed short finald = vec_sel(minusd,
                                               d,
                                               vec_cmpgt(vec_sub((vector signed short)zero, mE),
                                                         (vector signed short)zero));
    /* q */
    const vector signed short qtimes2 = vec_sub(vb4, vb5);
    /* for a shift right to behave like /2, we need to add one
       to all negative integers */
    const vector signed short rounddown = vec_sel((vector signed short)zero,
                                                  v_1,
                                                  vec_cmplt(qtimes2, (vector signed short)zero));
    const vector signed short q = vec_sra(vec_add(qtimes2, rounddown), vec_splat_u16(1));

    /* clamp */
    const vector signed short dclamp_P1 = vec_max((vector signed short)zero, finald);
    const vector signed short dclamp_P = vec_min(dclamp_P1, q);
    const vector signed short dclamp_N1 = vec_min((vector signed short)zero, finald);
    const vector signed short dclamp_N = vec_max(dclamp_N1, q);

    const vector signed short dclampedfinal = vec_sel(dclamp_N,
                                                      dclamp_P,
                                                      vec_cmpgt(q, (vector signed short)zero));
    const vector signed short dornotd = vec_sel((vector signed short)zero,
                                                dclampedfinal,
                                                vec_cmplt(absmE, vqp));

    /* add/subtract to l4 and l5 */
    const vector signed short vb4minusd = vec_sub(vb4, dornotd);
    const vector signed short vb5plusd = vec_add(vb5, dornotd);

    /* finally, the stores */
    const vector unsigned char st4 = vec_packsu(vb4minusd, (vector signed short)zero);
    const vector unsigned char st5 = vec_packsu(vb5plusd, (vector signed short)zero);

    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define STORE(i) \
    const vector unsigned char perms##i = \
        vec_lvsr(i * stride, src2); \
    const vector unsigned char vg##i = \
        vec_perm(st##i, vbT##i, permHH); \
    const vector unsigned char mask##i = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##i); \
    const vector unsigned char vg2##i = \
        vec_perm(vg##i, vg##i, perms##i); \
    const vector unsigned char svA##i = \
        vec_sel(vbA##i, vg2##i, mask##i); \
    const vector unsigned char svB##i = \
        vec_sel(vg2##i, vbB##i, mask##i); \
    vec_st(svA##i, i * stride, src2); \
    vec_st(svB##i, i * stride + 16, src2)

    STORE(4);
    STORE(5);
}
static inline void dering_altivec(uint8_t src[], int stride, PPContext *c) {
    /*
      This code makes no assumptions about src or stride.
      One could avoid recomputing the permutation vector by
      assuming (stride % 16) == 0, but unfortunately that is
      not always true. Quite a lot of loads/stores could also
      be removed by assuming proper alignment of src & stride :-(
    */
    uint8_t *srcCopy = src;
    uint8_t __attribute__((aligned(16))) dt[16];
    const vector unsigned char vuint8_1 = vec_splat_u8(1);
    const vector signed int zero = vec_splat_s32(0);
    vector unsigned char v_dt;
    dt[0] = deringThreshold;
    v_dt = vec_splat(vec_ld(0, dt), 0);
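
/* Load 10 unaligned rows as raw bytes; unlike the other filters, dering
   works directly on 8-bit data, so no widening is done here. */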
#define LOAD_LINE(i) \
    const vector unsigned char perm##i = \
        vec_lvsl(i * stride, srcCopy); \
    vector unsigned char sA##i = vec_ld(i * stride, srcCopy); \
    vector unsigned char sB##i = vec_ld(i * stride + 16, srcCopy); \
    vector unsigned char src##i = vec_perm(sA##i, sB##i, perm##i)

    LOAD_LINE(0);
    LOAD_LINE(1);
    LOAD_LINE(2);
    LOAD_LINE(3);
    LOAD_LINE(4);
    LOAD_LINE(5);
    LOAD_LINE(6);
    LOAD_LINE(7);
    LOAD_LINE(8);
    LOAD_LINE(9);
#undef LOAD_LINE
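
/* Find the minimum and maximum pixel value of the inner 8x8 block via a
   merge/compare reduction (EXTRACT); bail out early if the range is below
   the dering threshold, otherwise keep their average in v_avg. */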
    vector unsigned char v_avg;
    {
        const vector unsigned char trunc_perm = (vector unsigned char)
            AVV(0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
                0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18);
        const vector unsigned char trunc_src12 = vec_perm(src1, src2, trunc_perm);
        const vector unsigned char trunc_src34 = vec_perm(src3, src4, trunc_perm);
        const vector unsigned char trunc_src56 = vec_perm(src5, src6, trunc_perm);
        const vector unsigned char trunc_src78 = vec_perm(src7, src8, trunc_perm);

#define EXTRACT(op) do { \
    const vector unsigned char s##op##_1 = vec_##op(trunc_src12, trunc_src34); \
    const vector unsigned char s##op##_2 = vec_##op(trunc_src56, trunc_src78); \
    const vector unsigned char s##op##_6 = vec_##op(s##op##_1, s##op##_2); \
    const vector unsigned char s##op##_8h = vec_mergeh(s##op##_6, s##op##_6); \
    const vector unsigned char s##op##_8l = vec_mergel(s##op##_6, s##op##_6); \
    const vector unsigned char s##op##_9 = vec_##op(s##op##_8h, s##op##_8l); \
    const vector unsigned char s##op##_9h = vec_mergeh(s##op##_9, s##op##_9); \
    const vector unsigned char s##op##_9l = vec_mergel(s##op##_9, s##op##_9); \
    const vector unsigned char s##op##_10 = vec_##op(s##op##_9h, s##op##_9l); \
    const vector unsigned char s##op##_10h = vec_mergeh(s##op##_10, s##op##_10); \
    const vector unsigned char s##op##_10l = vec_mergel(s##op##_10, s##op##_10); \
    const vector unsigned char s##op##_11 = vec_##op(s##op##_10h, s##op##_10l); \
    const vector unsigned char s##op##_11h = vec_mergeh(s##op##_11, s##op##_11); \
    const vector unsigned char s##op##_11l = vec_mergel(s##op##_11, s##op##_11); \
    v_##op = vec_##op(s##op##_11h, s##op##_11l); } while (0)

        vector unsigned char v_min;
        vector unsigned char v_max;
        EXTRACT(min);
        EXTRACT(max);
#undef EXTRACT

        if (vec_all_lt(vec_sub(v_max, v_min), v_dt))
            return;
        v_avg = vec_avg(v_min, v_max);
    }
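
/* For every row, build a bitmask of pixels that lie above v_avg (COMPARE),
   then AND each bit with those of its horizontal and vertical neighbors;
   the result stored in S[] marks pixels whose whole 3x3 neighborhood sits
   on the same side of the average, i.e. candidates for the dering filter. */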
    signed int __attribute__((aligned(16))) S[8];
    {
        const vector unsigned short mask1 = (vector unsigned short)
            AVV(0x0001, 0x0002, 0x0004, 0x0008,
                0x0010, 0x0020, 0x0040, 0x0080);
        const vector unsigned short mask2 = (vector unsigned short)
            AVV(0x0100, 0x0200, 0x0000, 0x0000,
                0x0000, 0x0000, 0x0000, 0x0000);

        const vector unsigned int vuint32_16 = vec_sl(vec_splat_u32(1), vec_splat_u32(4));
        const vector unsigned int vuint32_1 = vec_splat_u32(1);

#define COMPARE(i) \
    vector signed int sum##i; \
    do { \
        const vector unsigned char cmp##i = \
            (vector unsigned char)vec_cmpgt(src##i, v_avg); \
        const vector unsigned short cmpHi##i = \
            (vector unsigned short)vec_mergeh(cmp##i, cmp##i); \
        const vector unsigned short cmpLi##i = \
            (vector unsigned short)vec_mergel(cmp##i, cmp##i); \
        const vector signed short cmpHf##i = \
            (vector signed short)vec_and(cmpHi##i, mask1); \
        const vector signed short cmpLf##i = \
            (vector signed short)vec_and(cmpLi##i, mask2); \
        const vector signed int sump##i = vec_sum4s(cmpHf##i, zero); \
        const vector signed int sumq##i = vec_sum4s(cmpLf##i, sump##i); \
        sum##i = vec_sums(sumq##i, zero); } while (0)

        COMPARE(0);
        COMPARE(1);
        COMPARE(2);
        COMPARE(3);
        COMPARE(4);
        COMPARE(5);
        COMPARE(6);
        COMPARE(7);
        COMPARE(8);
        COMPARE(9);
#undef COMPARE

        vector signed int sumA2;
        vector signed int sumB2;
        {
            const vector signed int sump02 = vec_mergel(sum0, sum2);
            const vector signed int sump13 = vec_mergel(sum1, sum3);
            const vector signed int sumA = vec_mergel(sump02, sump13);

            const vector signed int sump46 = vec_mergel(sum4, sum6);
            const vector signed int sump57 = vec_mergel(sum5, sum7);
            const vector signed int sumB = vec_mergel(sump46, sump57);

            const vector signed int sump8A = vec_mergel(sum8, zero);
            const vector signed int sump9B = vec_mergel(sum9, zero);
            const vector signed int sumC = vec_mergel(sump8A, sump9B);

            const vector signed int tA = vec_sl(vec_nor(zero, sumA), vuint32_16);
            const vector signed int tB = vec_sl(vec_nor(zero, sumB), vuint32_16);
            const vector signed int tC = vec_sl(vec_nor(zero, sumC), vuint32_16);
            const vector signed int t2A = vec_or(sumA, tA);
            const vector signed int t2B = vec_or(sumB, tB);
            const vector signed int t2C = vec_or(sumC, tC);
            const vector signed int t3A = vec_and(vec_sra(t2A, vuint32_1),
                                                  vec_sl(t2A, vuint32_1));
            const vector signed int t3B = vec_and(vec_sra(t2B, vuint32_1),
                                                  vec_sl(t2B, vuint32_1));
            const vector signed int t3C = vec_and(vec_sra(t2C, vuint32_1),
                                                  vec_sl(t2C, vuint32_1));
            const vector signed int yA = vec_and(t2A, t3A);
            const vector signed int yB = vec_and(t2B, t3B);
            const vector signed int yC = vec_and(t2C, t3C);

            const vector unsigned char strangeperm1 = vec_lvsl(4, (unsigned char*)0);
            const vector unsigned char strangeperm2 = vec_lvsl(8, (unsigned char*)0);
            const vector signed int sumAd4 = vec_perm(yA, yB, strangeperm1);
            const vector signed int sumAd8 = vec_perm(yA, yB, strangeperm2);
            const vector signed int sumBd4 = vec_perm(yB, yC, strangeperm1);
            const vector signed int sumBd8 = vec_perm(yB, yC, strangeperm2);
            const vector signed int sumAp = vec_and(yA,
                                                    vec_and(sumAd4, sumAd8));
            const vector signed int sumBp = vec_and(yB,
                                                    vec_and(sumBd4, sumBd8));
            sumA2 = vec_or(sumAp,
                           vec_sra(sumAp,
                                   vuint32_16));
            sumB2 = vec_or(sumBp,
                           vec_sra(sumBp,
                                   vuint32_16));
        }
        vec_st(sumA2, 0, S);
        vec_st(sumB2, 16, S);
    }
    /* I'm not sure the following is actually faster
       than straight, unvectorized C code :-( */

    int __attribute__((aligned(16))) tQP2[4];
    tQP2[0] = c->QP/2 + 1;
    vector signed int vQP2 = vec_ld(0, tQP2);
    vQP2 = vec_splat(vQP2, 0);
    const vector unsigned char vuint8_2 = vec_splat_u8(2);
    const vector signed int vsint32_8 = vec_splat_s32(8);
    const vector unsigned int vuint32_4 = vec_splat_u32(4);

    const vector unsigned char permA1 = (vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x10, 0x11, 0x12, 0x1F, 0x1F,
            0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
    const vector unsigned char permA2 = (vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x10, 0x11,
            0x12, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F, 0x1F);
    const vector unsigned char permA1inc = (vector unsigned char)
        AVV(0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char permA2inc = (vector unsigned char)
        AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01,
            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char magic = (vector unsigned char)
        AVV(0x01, 0x02, 0x01, 0x02, 0x04, 0x02, 0x01, 0x02,
            0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char extractPerm = (vector unsigned char)
        AVV(0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01,
            0x10, 0x10, 0x10, 0x01, 0x10, 0x10, 0x10, 0x01);
    const vector unsigned char extractPermInc = (vector unsigned char)
        AVV(0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01,
            0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01);
    const vector unsigned char identity = vec_lvsl(0, (unsigned char *)0);
    const vector unsigned char tenRight = (vector unsigned char)
        AVV(0x00, 0x10, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00);
    const vector unsigned char eightLeft = (vector unsigned char)
        AVV(0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08);
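
/* For every pixel flagged in S[], gather its 3x3 neighborhood from three
   consecutive rows, compute a (1 2 1 / 2 4 2 / 1 2 1)-weighted average via
   vec_msum with the "magic" weights, clamp it to pixel +/- (QP/2 + 1), and
   splice the new value back into the row. */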
#define F_INIT(i) \
    vector unsigned char tenRightM##i = tenRight; \
    vector unsigned char permA1M##i = permA1; \
    vector unsigned char permA2M##i = permA2; \
    vector unsigned char extractPermM##i = extractPerm

#define F2(i, j, k, l) \
    if (S[i] & (1 << (l+1))) { \
        const vector unsigned char a_##j##_A##l = \
            vec_perm(src##i, src##j, permA1M##i); \
        const vector unsigned char a_##j##_B##l = \
            vec_perm(a_##j##_A##l, src##k, permA2M##i); \
        const vector signed int a_##j##_sump##l = \
            (vector signed int)vec_msum(a_##j##_B##l, magic, \
                                        (vector unsigned int)zero); \
        vector signed int F_##j##_##l = \
            vec_sr(vec_sums(a_##j##_sump##l, vsint32_8), vuint32_4); \
        F_##j##_##l = vec_splat(F_##j##_##l, 3); \
        const vector signed int p_##j##_##l = \
            (vector signed int)vec_perm(src##j, \
                                        (vector unsigned char)zero, \
                                        extractPermM##i); \
        const vector signed int sum_##j##_##l = vec_add(p_##j##_##l, vQP2); \
        const vector signed int diff_##j##_##l = vec_sub(p_##j##_##l, vQP2); \
        vector signed int newpm_##j##_##l; \
        if (vec_all_lt(sum_##j##_##l, F_##j##_##l)) \
            newpm_##j##_##l = sum_##j##_##l; \
        else if (vec_all_gt(diff_##j##_##l, F_##j##_##l)) \
            newpm_##j##_##l = diff_##j##_##l; \
        else newpm_##j##_##l = F_##j##_##l; \
        const vector unsigned char newpm2_##j##_##l = \
            vec_splat((vector unsigned char)newpm_##j##_##l, 15); \
        const vector unsigned char mask##j##l = vec_add(identity, \
                                                        tenRightM##i); \
        src##j = vec_perm(src##j, newpm2_##j##_##l, mask##j##l); \
    } \
    permA1M##i = vec_add(permA1M##i, permA1inc); \
    permA2M##i = vec_add(permA2M##i, permA2inc); \
    tenRightM##i = vec_sro(tenRightM##i, eightLeft); \
    extractPermM##i = vec_add(extractPermM##i, extractPermInc)

#define ITER(i, j, k) \
    F_INIT(i); \
    F2(i, j, k, 0); \
    F2(i, j, k, 1); \
    F2(i, j, k, 2); \
    F2(i, j, k, 3); \
    F2(i, j, k, 4); \
    F2(i, j, k, 5); \
    F2(i, j, k, 6); \
    F2(i, j, k, 7)

    ITER(0, 1, 2);
    ITER(1, 2, 3);
    ITER(2, 3, 4);
    ITER(3, 4, 5);
    ITER(4, 5, 6);
    ITER(5, 6, 7);
    ITER(6, 7, 8);
    ITER(7, 8, 9);

    const vector signed char neg1 = vec_splat_s8(-1);

#define STORE_LINE(i) \
    const vector unsigned char permST##i = \
        vec_lvsr(i * stride, srcCopy); \
    const vector unsigned char maskST##i = \
        vec_perm((vector unsigned char)zero, \
                 (vector unsigned char)neg1, permST##i); \
    src##i = vec_perm(src##i, src##i, permST##i); \
    sA##i = vec_sel(sA##i, src##i, maskST##i); \
    sB##i = vec_sel(src##i, sB##i, maskST##i); \
    vec_st(sA##i, i * stride, srcCopy); \
    vec_st(sB##i, i * stride + 16, srcCopy)

    STORE_LINE(1);
    STORE_LINE(2);
    STORE_LINE(3);
    STORE_LINE(4);
    STORE_LINE(5);
    STORE_LINE(6);
    STORE_LINE(7);
    STORE_LINE(8);
#undef STORE_LINE
#undef ITER
#undef F2
}
#define doHorizLowPass_altivec(a...) doHorizLowPass_C(a)
#define doHorizDefFilter_altivec(a...) doHorizDefFilter_C(a)
#define do_a_deblock_altivec(a...) do_a_deblock_C(a)
static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride,
                                            uint8_t *tempBlured, uint32_t *tempBluredPast, int *maxNoise)
{
    const vector signed int zero = vec_splat_s32(0);
    const vector signed short vsint16_1 = vec_splat_s16(1);
    vector signed int v_dp = zero;
    vector signed int v_sysdp = zero;
    int d, sysd, i;

    tempBluredPast[127] = maxNoise[0];
    tempBluredPast[128] = maxNoise[1];
    tempBluredPast[129] = maxNoise[2];
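
/* Load 8 rows from both the current block (src) and the temporally blurred
   block (tempBlured), each widened to signed 16-bit. */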
#define LOAD_LINE(src, i) \
    register int j##src##i = i * stride; \
    vector unsigned char perm##src##i = vec_lvsl(j##src##i, src); \
    const vector unsigned char v_##src##A1##i = vec_ld(j##src##i, src); \
    const vector unsigned char v_##src##A2##i = vec_ld(j##src##i + 16, src); \
    const vector unsigned char v_##src##A##i = \
        vec_perm(v_##src##A1##i, v_##src##A2##i, perm##src##i); \
    vector signed short v_##src##Ass##i = \
        (vector signed short)vec_mergeh((vector signed char)zero, \
                                        (vector signed char)v_##src##A##i)

    LOAD_LINE(src, 0);
    LOAD_LINE(src, 1);
    LOAD_LINE(src, 2);
    LOAD_LINE(src, 3);
    LOAD_LINE(src, 4);
    LOAD_LINE(src, 5);
    LOAD_LINE(src, 6);
    LOAD_LINE(src, 7);

    LOAD_LINE(tempBlured, 0);
    LOAD_LINE(tempBlured, 1);
    LOAD_LINE(tempBlured, 2);
    LOAD_LINE(tempBlured, 3);
    LOAD_LINE(tempBlured, 4);
    LOAD_LINE(tempBlured, 5);
    LOAD_LINE(tempBlured, 6);
    LOAD_LINE(tempBlured, 7);
#undef LOAD_LINE
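
/* Accumulate the sum of squared differences (v_dp) and the plain sum of
   differences (v_sysdp) between the current and the blurred block. */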
#define ACCUMULATE_DIFFS(i) \
    vector signed short v_d##i = vec_sub(v_tempBluredAss##i, \
                                         v_srcAss##i); \
    v_dp = vec_msums(v_d##i, v_d##i, v_dp); \
    v_sysdp = vec_msums(v_d##i, vsint16_1, v_sysdp)

    ACCUMULATE_DIFFS(0);
    ACCUMULATE_DIFFS(1);
    ACCUMULATE_DIFFS(2);
    ACCUMULATE_DIFFS(3);
    ACCUMULATE_DIFFS(4);
    ACCUMULATE_DIFFS(5);
    ACCUMULATE_DIFFS(6);
    ACCUMULATE_DIFFS(7);
#undef ACCUMULATE_DIFFS

    v_dp = vec_sums(v_dp, zero);
    v_sysdp = vec_sums(v_sysdp, zero);
    v_dp = vec_splat(v_dp, 3);
    v_sysdp = vec_splat(v_sysdp, 3);
    vec_ste(v_dp, 0, &d);
    vec_ste(v_sysdp, 0, &sysd);

    i = d;
    d = (4*d
         + (*(tempBluredPast-256))
         + (*(tempBluredPast-1)) + (*(tempBluredPast+1))
         + (*(tempBluredPast+256))
         + 4) >> 3;
    *tempBluredPast = i;
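
/* Pick the update strategy from the smoothed noise estimate d: copy the new
   block when the difference is very large, average the two blocks when it is
   merely large, and otherwise blend the old (blurred) block with the new one,
   keeping 7/8 or 3/4 of the old block depending on whether d is below
   maxNoise[0]. */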
    if (d > maxNoise[1]) {
        if (d < maxNoise[2]) {
#define OP(i) v_tempBluredAss##i = vec_avg(v_tempBluredAss##i, v_srcAss##i);
            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        } else {
#define OP(i) v_tempBluredAss##i = v_srcAss##i;
            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        }
    } else {
        if (d < maxNoise[0]) {
            const vector signed short vsint16_7 = vec_splat_s16(7);
            const vector signed short vsint16_4 = vec_splat_s16(4);
            const vector unsigned short vuint16_3 = vec_splat_u16(3);
#define OP(i) \
            const vector signed short v_temp##i = \
                vec_mladd(v_tempBluredAss##i, \
                          vsint16_7, v_srcAss##i); \
            const vector signed short v_temp2##i = \
                vec_add(v_temp##i, vsint16_4); \
            v_tempBluredAss##i = vec_sr(v_temp2##i, vuint16_3)

            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        } else {
            const vector signed short vsint16_3 = vec_splat_s16(3);
            const vector signed short vsint16_2 = vec_splat_s16(2);
#define OP(i) \
            const vector signed short v_temp##i = \
                vec_mladd(v_tempBluredAss##i, \
                          vsint16_3, v_srcAss##i); \
            const vector signed short v_temp2##i = \
                vec_add(v_temp##i, vsint16_2); \
            v_tempBluredAss##i = vec_sr(v_temp2##i, (vector unsigned short)vsint16_2)

            OP(0);
            OP(1);
            OP(2);
            OP(3);
            OP(4);
            OP(5);
            OP(6);
            OP(7);
#undef OP
        }
    }
    const vector signed char neg1 = vec_splat_s8(-1);
    const vector unsigned char permHH = (const vector unsigned char)
        AVV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
            0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F);

#define PACK_AND_STORE(src, i) \
    const vector unsigned char perms##src##i = \
        vec_lvsr(i * stride, src); \
    const vector unsigned char vf##src##i = \
        vec_packsu(v_tempBluredAss##i, (vector signed short)zero); \
    const vector unsigned char vg##src##i = \
        vec_perm(vf##src##i, v_##src##A##i, permHH); \
    const vector unsigned char mask##src##i = \
        vec_perm((vector unsigned char)zero, (vector unsigned char)neg1, perms##src##i); \
    const vector unsigned char vg2##src##i = \
        vec_perm(vg##src##i, vg##src##i, perms##src##i); \
    const vector unsigned char svA##src##i = \
        vec_sel(v_##src##A1##i, vg2##src##i, mask##src##i); \
    const vector unsigned char svB##src##i = \
        vec_sel(vg2##src##i, v_##src##A2##i, mask##src##i); \
    vec_st(svA##src##i, i * stride, src); \
    vec_st(svB##src##i, i * stride + 16, src)

    PACK_AND_STORE(src, 0);
    PACK_AND_STORE(src, 1);
    PACK_AND_STORE(src, 2);
    PACK_AND_STORE(src, 3);
    PACK_AND_STORE(src, 4);
    PACK_AND_STORE(src, 5);
    PACK_AND_STORE(src, 6);
    PACK_AND_STORE(src, 7);

    PACK_AND_STORE(tempBlured, 0);
    PACK_AND_STORE(tempBlured, 1);
    PACK_AND_STORE(tempBlured, 2);
    PACK_AND_STORE(tempBlured, 3);
    PACK_AND_STORE(tempBlured, 4);
    PACK_AND_STORE(tempBlured, 5);
    PACK_AND_STORE(tempBlured, 6);
    PACK_AND_STORE(tempBlured, 7);
#undef PACK_AND_STORE
}