/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
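
/**
 * SAD of a 16xh block, with the reference shifted by half a pixel
 * horizontally (pix1 is compared against the rounded average of
 * pix2 and pix2+1).
 */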
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
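
/**
 * SAD of a 16xh block, with the reference shifted by half a pixel
 * vertically (pix1 is compared against the rounded average of
 * pix2 and pix2+line_size).
 */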
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Because pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read each
       time around the loop.
       Read unaligned pixels into our vector. The vector is as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
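
/**
 * SAD of a 16xh block, with the reference shifted by half a pixel in both
 * directions (pix1 is compared against the rounded average of four
 * neighbouring pix2 pixels).
 */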
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);
    s = 0;

    /*
       Because pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts.
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
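
/**
 * Plain SAD of a 16xh block (no half-pel interpolation).
 */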
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
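
/**
 * Plain SAD of an 8xh block: 16 pixels are loaded per row and the
 * unwanted right half is masked to zero before summing.
 */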
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
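
/**
 * Sum of the squared pixel values of a 16x16 block.
 */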
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
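
/**
 * Sum of the pixel values of a 16x16 block.
 */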
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    DECLARE_ALIGNED_16(int, s);

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
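
/**
 * Read an 8x8 block of bytes and store it as an 8x8 block of 16-bit
 * DCTELEMs. The destination block is assumed to be 16-byte aligned.
 */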
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
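
/**
 * Compute the difference of two 8x8 blocks of bytes and store it as an
 * 8x8 block of 16-bit DCTELEMs. The destination block is assumed to be
 * 16-byte aligned; the loop body handles two rows per iteration.
 */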
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
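
/**
 * Add the bytes of src to the bytes of dst (dst[i] += src[i]).
 * dst and src are both assumed to be 16-byte aligned.
 */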
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i+=16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, add the remaining bytes one by one */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
}
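
/**
 * Copy a 16xh block from pixels to block.
 */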
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // hand-unrolling the loop by 4 gains about 15%;
    // minimum execution time goes from 74 to 60 cycles.
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(15, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
}
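
/**
 * Average a 16xh block of pixels into block
 * (block becomes the rounded average of block and pixels).
 */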
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
}
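
/**
 * Average an 8xh block of pixels into block. block is only 8-byte
 * aligned, so the 8 bytes being updated may sit in either half of the
 * 16-byte vector loaded from it.
 */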
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8-byte aligned, so we're either in the left half
           of a 16-byte-aligned block or in the right half (not aligned)
        */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
}
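
/**
 * Half-pel interpolation in both directions for an 8xh block: each output
 * pixel is the rounded average of its four source neighbours,
 * (a+b+c+d+2)>>2.
 */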
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
}
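
/**
 * Same as put_pixels8_xy2_altivec, but with the "no rounding" variant of
 * the averaging, (a+b+c+d+1)>>2.
 */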
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
}
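
/**
 * Half-pel interpolation in both directions for a 16xh block (rounded
 * averaging, same scheme as the 8xh version but with the 16-byte vector
 * split into low and high halves of shorts).
 */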
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
}
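
/**
 * Same as put_pixels16_xy2_altivec, but with the "no rounding" variant of
 * the averaging, (a+b+c+d+1)>>2.
 */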
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
}
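
/**
 * Hadamard-transformed difference (SATD) of an 8x8 block:
 * the src/dst difference is transformed horizontally inside the
 * ONEITERBUTTERFLY macro and vertically in the block below it, and the
 * absolute values of the result are summed.
 */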
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const vector unsigned char vzero =
                            (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    register const vector signed short vprod1 =(const vector signed short)
                                       AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const vector signed short vprod2 =(const vector signed short)
                                       AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const vector signed short vprod3 =(const vector signed short)
                                       AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const vector unsigned char perm1 = (const vector unsigned char)
                                       AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
                                           0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
    register const vector unsigned char perm2 = (const vector unsigned char)
                                       AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
                                           0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
    register const vector unsigned char perm3 = (const vector unsigned char)
                                       AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
                                           0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res) \
    { \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    register vector signed short srcV, dstV; \
    register vector signed short but0, but1, but2, op1, op2, op3; \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 15, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 15, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care for the first 8 */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}

/*
  16x8 works with 16 elements; it allows us to avoid replicating
  loads, and gives the compiler more room for scheduling.
  It's only used from inside hadamard8_diff16_altivec.
  Unfortunately, it seems gcc-3.3 is a bit dumb, and
  the compiled code has a LOT of spill code; it seems
  gcc (unlike xlc) cannot keep everything in registers
  by itself. The following code includes hand-made
  register allocation. It's not clean, but on
  a 7450 the resulting code is much faster (the best case
  falls from 700+ cycles to 550).
  xlc doesn't add spill code, but it doesn't know how to
  schedule for the 7450, and its code isn't much faster than
  gcc-3.3's on the 7450 (but uses 25% fewer instructions...).
  On the 970, the hand-made RA is still a win (around 690
  vs. around 780), but xlc gets to around 660 on the
  regular C code...
*/
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const vector unsigned char vzero REG_v(v31)=
        (const vector unsigned char)vec_splat_u8(0);
    {
    register const vector signed short vprod1 REG_v(v16)=
        (const vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const vector signed short vprod2 REG_v(v17)=
        (const vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const vector signed short vprod3 REG_v(v18)=
        (const vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const vector unsigned char perm1 REG_v(v19)=
        (const vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
    register const vector unsigned char perm2 REG_v(v20)=
        (const vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
    register const vector unsigned char perm3 REG_v(v21)=
        (const vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 REG_v(v22), \
                                  src2 REG_v(v23), \
                                  dst1 REG_v(v24), \
                                  dst2 REG_v(v25), \
                                  srcO REG_v(v22), \
                                  dstO REG_v(v23); \
    \
    register vector signed short srcV REG_v(v24), \
                                 dstV REG_v(v25), \
                                 srcW REG_v(v26), \
                                 dstW REG_v(v27), \
                                 but0 REG_v(v28), \
                                 but0S REG_v(v29), \
                                 op1 REG_v(v30), \
                                 but1 REG_v(v22), \
                                 op1S REG_v(v23), \
                                 but1S REG_v(v24), \
                                 op2 REG_v(v25), \
                                 but2 REG_v(v26), \
                                 op2S REG_v(v27), \
                                 but2S REG_v(v28), \
                                 op3 REG_v(v29), \
                                 op3S REG_v(v30); \
    \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)dstO); \
    srcW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
        (vector signed char)srcO); \
    dstW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    but0S = vec_sub(srcW, dstW); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op1S = vec_perm(but0S, but0S, perm1); \
    but1S = vec_mladd(but0S, vprod1, op1S); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op2S = vec_perm(but1S, but1S, perm2); \
    but2S = vec_mladd(but1S, vprod2, op2S); \
    op3 = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    op3S = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS, line2BS,
                                 line1BS, line3BS, line4BS, line6BS, line5BS,
                                 line7BS, line0CS, line4CS, line1CS, line5CS,
                                 line2CS, line6CS, line3CS, line7CS;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);
    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);
    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}
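
/**
 * SATD for a 16x8 or 16x16 block: the 16-pixel-wide helper above is run
 * once for the top 8 rows and, when h == 16, once more for the bottom 8.
 */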
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}
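
/**
 * Vorbis inverse channel coupling, processing four floats per iteration.
 */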
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for(i=0; i<blocksize; i+=4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
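
/**
 * Half-pel interpolation in both directions for an 8xh block, with the
 * interpolated result averaged into the existing contents of block.
 */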
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)
                                        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
                                        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
}
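
/**
 * Hook the AltiVec implementations into the DSPContext function pointers.
 */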
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0]= sad16_altivec;
    c->sad[1]= sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1]= sse8_altivec;
    c->sse[0]= sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->add_bytes= add_bytes_altivec;
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (ENABLE_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}