/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavcodec/dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_ppc.h"
#include "util_altivec.h"
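
/**
 * Sum of absolute differences between a 16xh block in pix1 and a reference
 * read at a half-pixel horizontal offset: each reference pixel is the
 * rounded-up average of pix2[x] and pix2[x+1] (vec_avg rounds up).
 */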
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
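
/**
 * Sum of absolute differences between a 16xh block in pix1 and a reference
 * read at a half-pixel vertical offset: each reference pixel is the
 * rounded-up average of the same column in two consecutive lines of pix2.
 */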
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
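
/**
 * Sum of absolute differences between a 16xh block in pix1 and a reference
 * read at a half-pixel offset in both directions: each reference pixel is
 * (a + b + c + d + 2) >> 2 over the surrounding 2x2 pixels of pix2, which is
 * why the code below widens to 16-bit sums instead of using vec_avg twice.
 */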
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]  pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand. */

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
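
/**
 * Plain sum of absolute differences between two 16xh blocks.
 */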
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
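
/**
 * Sum of absolute differences between two 8xh blocks. The 16-byte loads are
 * masked with permclear so the unwanted upper 8 bytes read as zero and do
 * not contribute to the sum.
 */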
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
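
/**
 * Sum of the squared values of all pixels in a 16x16 block
 * (vec_msum accumulates pix*pix four bytes at a time).
 */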
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
}
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
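
/**
 * Sum of all 256 pixel values of a 16x16 block.
 */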
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    DECLARE_ALIGNED_16(int, s);

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
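
/**
 * Read an 8x8 block of unsigned bytes and store it zero-extended as
 * 16-bit DCTELEMs into a 16-byte-aligned block.
 */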
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
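
/**
 * Compute block[i] = s1[i] - s2[i] for an 8x8 block of unsigned bytes and
 * store the differences as 16-bit DCTELEMs. The loop body is unrolled by two.
 */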
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
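
/**
 * Add the bytes of src to the corresponding bytes of dst, 16 at a time.
 */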
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, add the remaining bytes one at a time */
    for (; i < w; i++) {
        dst[i] += src[i];
    }
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld(0, (unsigned char*)pixels);
        pixelsv2  = vec_ld(15, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
}
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
}
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        /* block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
}
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
}
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
}
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
}
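
/**
 * SATD-style comparison: applies an 8x8 Hadamard (butterfly) transform to the
 * difference between the src and dst blocks and returns the sum of the
 * absolute values of the transformed coefficients.
 */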
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const vector unsigned char vzero =
        (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    register const vector signed short vprod1 = (const vector signed short)
        { 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 = (const vector signed short)
        { 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 = (const vector signed short)
        { 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 = (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 = (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 = (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res)                                          \
    {                                                                     \
    register vector unsigned char src1, src2, srcO;                       \
    register vector unsigned char dst1, dst2, dstO;                       \
    register vector signed short srcV, dstV;                              \
    register vector signed short but0, but1, but2, op1, op2, op3;         \
    src1 = vec_ld(stride * i, src);                                       \
    src2 = vec_ld((stride * i) + 15, src);                                \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));               \
    dst1 = vec_ld(stride * i, dst);                                       \
    dst2 = vec_ld((stride * i) + 15, dst);                                \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));               \
    /* promote the unsigned chars to signed shorts */                     \
    /* we're in the 8x8 function, we only care for the first 8 */         \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero,     \
           (vector signed char)srcO);                                     \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero,     \
           (vector signed char)dstO);                                     \
    /* subtractions inside the first butterfly */                         \
    but0 = vec_sub(srcV, dstV);                                           \
    op1  = vec_perm(but0, but0, perm1);                                   \
    but1 = vec_mladd(but0, vprod1, op1);                                  \
    op2  = vec_perm(but1, but1, perm2);                                   \
    but2 = vec_mladd(but1, vprod2, op2);                                  \
    op3  = vec_perm(but2, but2, perm3);                                   \
    res  = vec_mladd(but2, vprod3, op3);                                  \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}
/*
 * 16x8 works with 16 elements; it allows us to avoid replicating loads, and
 * gives the compiler more room for scheduling. It's only used from
 * inside hadamard8_diff16_altivec.
 * Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has a LOT
 * of spill code; it seems gcc (unlike xlc) cannot keep everything in registers
 * by itself. The following code includes hand-made register allocation. It's not
 * clean, but on a 7450 the resulting code is much faster (best case falls from
 * 700+ cycles to 550).
 * xlc doesn't add spill code, but it doesn't know how to schedule for the 7450,
 * and its code isn't much faster than gcc-3.3 on the 7450 (but uses 25% fewer
 * instructions...)
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780), but
 * xlc goes to around 660 on the regular C code...
 */
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const vector unsigned char vzero REG_v(v31) =
        (const vector unsigned char)vec_splat_u8(0);
    {
    register const vector signed short vprod1 REG_v(v16) =
        (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
    register const vector signed short vprod2 REG_v(v17) =
        (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
    register const vector signed short vprod3 REG_v(v18) =
        (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
    register const vector unsigned char perm1 REG_v(v19) =
        (const vector unsigned char)
        {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
         0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
    register const vector unsigned char perm2 REG_v(v20) =
        (const vector unsigned char)
        {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
         0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
    register const vector unsigned char perm3 REG_v(v21) =
        (const vector unsigned char)
        {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
         0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res1, res2)                                   \
    {                                                                     \
    register vector unsigned char src1 REG_v(v22),                        \
                                  src2 REG_v(v23),                        \
                                  dst1 REG_v(v24),                        \
                                  dst2 REG_v(v25),                        \
                                  srcO REG_v(v22),                        \
                                  dstO REG_v(v23);                        \
                                                                          \
    register vector signed short srcV REG_v(v24),                         \
                                 dstV REG_v(v25),                         \
                                 srcW REG_v(v26),                         \
                                 dstW REG_v(v27),                         \
                                 but0 REG_v(v28),                         \
                                 but0S REG_v(v29),                        \
                                 op1 REG_v(v30),                          \
                                 but1 REG_v(v22),                         \
                                 op1S REG_v(v23),                         \
                                 but1S REG_v(v24),                        \
                                 op2 REG_v(v25),                          \
                                 but2 REG_v(v26),                         \
                                 op2S REG_v(v27),                         \
                                 but2S REG_v(v28),                        \
                                 op3 REG_v(v29),                          \
                                 op3S REG_v(v30);                         \
                                                                          \
    src1 = vec_ld(stride * i, src);                                       \
    src2 = vec_ld((stride * i) + 16, src);                                \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src));               \
    dst1 = vec_ld(stride * i, dst);                                       \
    dst2 = vec_ld((stride * i) + 16, dst);                                \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst));               \
    /* promote the unsigned chars to signed shorts */                     \
    srcV = (vector signed short)vec_mergeh((vector signed char)vzero,     \
           (vector signed char)srcO);                                     \
    dstV = (vector signed short)vec_mergeh((vector signed char)vzero,     \
           (vector signed char)dstO);                                     \
    srcW = (vector signed short)vec_mergel((vector signed char)vzero,     \
           (vector signed char)srcO);                                     \
    dstW = (vector signed short)vec_mergel((vector signed char)vzero,     \
           (vector signed char)dstO);                                     \
    /* subtractions inside the first butterfly */                         \
    but0  = vec_sub(srcV, dstV);                                          \
    but0S = vec_sub(srcW, dstW);                                          \
    op1   = vec_perm(but0, but0, perm1);                                  \
    but1  = vec_mladd(but0, vprod1, op1);                                 \
    op1S  = vec_perm(but0S, but0S, perm1);                                \
    but1S = vec_mladd(but0S, vprod1, op1S);                               \
    op2   = vec_perm(but1, but1, perm2);                                  \
    but2  = vec_mladd(but1, vprod2, op2);                                 \
    op2S  = vec_perm(but1S, but1S, perm2);                                \
    but2S = vec_mladd(but1S, vprod2, op2S);                               \
    op3   = vec_perm(but2, but2, perm3);                                  \
    res1  = vec_mladd(but2, vprod3, op3);                                 \
    op3S  = vec_perm(but2S, but2S, perm3);                                \
    res2  = vec_mladd(but2S, vprod3, op3S);                               \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS, line2BS,
                                 line1BS, line3BS, line4BS, line6BS, line5BS,
                                 line7BS, line0CS, line4CS, line1CS, line5CS,
                                 line2CS, line6CS, line3CS, line7CS;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);
    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);
    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);

    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}
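
/**
 * Vorbis inverse channel coupling: rebuilds the two channels from the
 * magnitude/angle representation, four floats per iteration. The sign and
 * mask tricks below reproduce the branchy scalar reference without branches.
 */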
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15), vec_splat_u32(15)), vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
}
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1] = sse8_altivec;
    c->sse[0] = sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->add_bytes = add_bytes_altivec;
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (ENABLE_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}