  1. /*
  2. * Copyright (c) 2002 Brian Foley
  3. * Copyright (c) 2002 Dieter Shirley
  4. * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "../dsputil.h"
  23. #include "gcc_fixes.h"
  24. #include "dsputil_altivec.h"
  25. #ifdef CONFIG_DARWIN
  26. #include <sys/sysctl.h>
  27. #else /* CONFIG_DARWIN */
  28. #ifdef __AMIGAOS4__
  29. #include <exec/exec.h>
  30. #include <interfaces/exec.h>
  31. #include <proto/exec.h>
  32. #else /* __AMIGAOS4__ */
  33. #include <signal.h>
  34. #include <setjmp.h>
  35. static sigjmp_buf jmpbuf;
  36. static volatile sig_atomic_t canjump = 0;
  37. static void sigill_handler (int sig)
  38. {
  39. if (!canjump) {
  40. signal (sig, SIG_DFL);
  41. raise (sig);
  42. }
  43. canjump = 0;
  44. siglongjmp (jmpbuf, 1);
  45. }
46. #endif /* __AMIGAOS4__ */
47. #endif /* CONFIG_DARWIN */
  48. int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  49. {
  50. int i;
  51. int s __attribute__((aligned(16)));
  52. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  53. vector unsigned char *tv;
  54. vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  55. vector unsigned int sad;
  56. vector signed int sumdiffs;
  57. s = 0;
  58. sad = (vector unsigned int)vec_splat_u32(0);
  59. for(i=0;i<h;i++) {
  60. /*
  61. Read unaligned pixels into our vectors. The vectors are as follows:
  62. pix1v: pix1[0]-pix1[15]
  63. pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
  64. */
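/*
   (The tv[0]/tv[1] dereferences below are vector loads of the two 16-byte
   aligned blocks around the pointer; vec_perm with the vec_lvsl mask then
   shifts the wanted 16 bytes out of that 32-byte window. This is the usual
   AltiVec idiom for a possibly misaligned load and is reused throughout
   this file.)
*/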
  65. tv = (vector unsigned char *) pix1;
  66. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  67. tv = (vector unsigned char *) &pix2[0];
  68. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  69. tv = (vector unsigned char *) &pix2[1];
  70. pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  71. /* Calculate the average vector */
  72. avgv = vec_avg(pix2v, pix2iv);
  73. /* Calculate a sum of abs differences vector */
  74. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  75. /* Add each 4 pixel group together and put 4 results into sad */
  76. sad = vec_sum4s(t5, sad);
  77. pix1 += line_size;
  78. pix2 += line_size;
  79. }
  80. /* Sum up the four partial sums, and put the result into s */
  81. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  82. sumdiffs = vec_splat(sumdiffs, 3);
  83. vec_ste(sumdiffs, 0, &s);
  84. return s;
  85. }
  86. int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  87. {
  88. int i;
  89. int s __attribute__((aligned(16)));
  90. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  91. vector unsigned char *tv;
  92. vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
  93. vector unsigned int sad;
  94. vector signed int sumdiffs;
  95. uint8_t *pix3 = pix2 + line_size;
  96. s = 0;
  97. sad = (vector unsigned int)vec_splat_u32(0);
98. /*
99. Because pix3 = pix2 + line_size, the pix3 of one iteration
100. becomes pix2 in the next iteration. We can use this fact to
101. avoid a potentially expensive unaligned read each time
102. around the loop.
103. Read unaligned pixels into our vector. The vector is as follows:
104. pix2v: pix2[0]-pix2[15]
105. (inside the loop, pix2v is simply reused from the previous pix3v)
106. */
  107. tv = (vector unsigned char *) &pix2[0];
  108. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  109. for(i=0;i<h;i++) {
  110. /*
  111. Read unaligned pixels into our vectors. The vectors are as follows:
  112. pix1v: pix1[0]-pix1[15]
  113. pix3v: pix3[0]-pix3[15]
  114. */
  115. tv = (vector unsigned char *) pix1;
  116. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  117. tv = (vector unsigned char *) &pix3[0];
  118. pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
  119. /* Calculate the average vector */
  120. avgv = vec_avg(pix2v, pix3v);
  121. /* Calculate a sum of abs differences vector */
  122. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  123. /* Add each 4 pixel group together and put 4 results into sad */
  124. sad = vec_sum4s(t5, sad);
  125. pix1 += line_size;
  126. pix2v = pix3v;
  127. pix3 += line_size;
  128. }
  129. /* Sum up the four partial sums, and put the result into s */
  130. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  131. sumdiffs = vec_splat(sumdiffs, 3);
  132. vec_ste(sumdiffs, 0, &s);
  133. return s;
  134. }
  135. int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  136. {
  137. int i;
  138. int s __attribute__((aligned(16)));
  139. uint8_t *pix3 = pix2 + line_size;
  140. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  141. const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
  142. vector unsigned char *tv, avgv, t5;
  143. vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
  144. vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
  145. vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
  146. vector unsigned short avghv, avglv;
  147. vector unsigned short t1, t2, t3, t4;
  148. vector unsigned int sad;
  149. vector signed int sumdiffs;
  150. sad = (vector unsigned int)vec_splat_u32(0);
  151. s = 0;
152. /*
153. Because pix3 = pix2 + line_size, the pix3 of one iteration
154. becomes pix2 in the next iteration. We can use this fact to
155. avoid a potentially expensive unaligned read, as well as some
156. splitting and vector additions, each time around the loop.
157. Read unaligned pixels into our vectors. The vectors are as follows:
158. pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
159. Split the pixel vectors into shorts
160. */
  161. tv = (vector unsigned char *) &pix2[0];
  162. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  163. tv = (vector unsigned char *) &pix2[1];
  164. pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  165. pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
  166. pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
  167. pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
  168. pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
  169. t1 = vec_add(pix2hv, pix2ihv);
  170. t2 = vec_add(pix2lv, pix2ilv);
  171. for(i=0;i<h;i++) {
  172. /*
  173. Read unaligned pixels into our vectors. The vectors are as follows:
  174. pix1v: pix1[0]-pix1[15]
  175. pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
  176. */
  177. tv = (vector unsigned char *) pix1;
  178. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  179. tv = (vector unsigned char *) &pix3[0];
  180. pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
  181. tv = (vector unsigned char *) &pix3[1];
  182. pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
  183. /*
  184. Note that Altivec does have vec_avg, but this works on vector pairs
  185. and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
  186. would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
  187. Instead, we have to split the pixel vectors into vectors of shorts,
  188. and do the averaging by hand.
  189. */
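/*
   In other words, each interpolated pixel below is computed as
   (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2, done in
   16-bit lanes so the four-term sum cannot overflow a byte.
*/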
  190. /* Split the pixel vectors into shorts */
  191. pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
  192. pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
  193. pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
  194. pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
  195. /* Do the averaging on them */
  196. t3 = vec_add(pix3hv, pix3ihv);
  197. t4 = vec_add(pix3lv, pix3ilv);
  198. avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
  199. avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
  200. /* Pack the shorts back into a result */
  201. avgv = vec_pack(avghv, avglv);
  202. /* Calculate a sum of abs differences vector */
  203. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  204. /* Add each 4 pixel group together and put 4 results into sad */
  205. sad = vec_sum4s(t5, sad);
  206. pix1 += line_size;
  207. pix3 += line_size;
  208. /* Transfer the calculated values for pix3 into pix2 */
  209. t1 = t3;
  210. t2 = t4;
  211. }
  212. /* Sum up the four partial sums, and put the result into s */
  213. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  214. sumdiffs = vec_splat(sumdiffs, 3);
  215. vec_ste(sumdiffs, 0, &s);
  216. return s;
  217. }
  218. int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  219. {
  220. int i;
  221. int s __attribute__((aligned(16)));
  222. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  223. vector unsigned char perm1, perm2, *pix1v, *pix2v;
  224. vector unsigned char t1, t2, t3,t4, t5;
  225. vector unsigned int sad;
  226. vector signed int sumdiffs;
  227. sad = (vector unsigned int)vec_splat_u32(0);
  228. for(i=0;i<h;i++) {
  229. /* Read potentially unaligned pixels into t1 and t2 */
  230. perm1 = vec_lvsl(0, pix1);
  231. pix1v = (vector unsigned char *) pix1;
  232. perm2 = vec_lvsl(0, pix2);
  233. pix2v = (vector unsigned char *) pix2;
  234. t1 = vec_perm(pix1v[0], pix1v[1], perm1);
  235. t2 = vec_perm(pix2v[0], pix2v[1], perm2);
  236. /* Calculate a sum of abs differences vector */
  237. t3 = vec_max(t1, t2);
  238. t4 = vec_min(t1, t2);
  239. t5 = vec_sub(t3, t4);
  240. /* Add each 4 pixel group together and put 4 results into sad */
  241. sad = vec_sum4s(t5, sad);
  242. pix1 += line_size;
  243. pix2 += line_size;
  244. }
  245. /* Sum up the four partial sums, and put the result into s */
  246. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  247. sumdiffs = vec_splat(sumdiffs, 3);
  248. vec_ste(sumdiffs, 0, &s);
  249. return s;
  250. }
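/*
   For reference, the loop above is the vector form of the plain scalar
   SAD (a sketch only, not the actual C fallback):

   for (i = 0; i < h; i++, pix1 += line_size, pix2 += line_size)
       for (j = 0; j < 16; j++)
           s += abs(pix1[j] - pix2[j]);
*/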
  251. int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  252. {
  253. int i;
  254. int s __attribute__((aligned(16)));
  255. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  256. vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
  257. vector unsigned char t1, t2, t3,t4, t5;
  258. vector unsigned int sad;
  259. vector signed int sumdiffs;
  260. sad = (vector unsigned int)vec_splat_u32(0);
  261. permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
  262. for(i=0;i<h;i++) {
  263. /* Read potentially unaligned pixels into t1 and t2
  264. Since we're reading 16 pixels, and actually only want 8,
  265. mask out the last 8 pixels. The 0s don't change the sum. */
  266. perm1 = vec_lvsl(0, pix1);
  267. pix1v = (vector unsigned char *) pix1;
  268. perm2 = vec_lvsl(0, pix2);
  269. pix2v = (vector unsigned char *) pix2;
  270. t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
  271. t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
  272. /* Calculate a sum of abs differences vector */
  273. t3 = vec_max(t1, t2);
  274. t4 = vec_min(t1, t2);
  275. t5 = vec_sub(t3, t4);
  276. /* Add each 4 pixel group together and put 4 results into sad */
  277. sad = vec_sum4s(t5, sad);
  278. pix1 += line_size;
  279. pix2 += line_size;
  280. }
  281. /* Sum up the four partial sums, and put the result into s */
  282. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  283. sumdiffs = vec_splat(sumdiffs, 3);
  284. vec_ste(sumdiffs, 0, &s);
  285. return s;
  286. }
  287. int pix_norm1_altivec(uint8_t *pix, int line_size)
  288. {
  289. int i;
  290. int s __attribute__((aligned(16)));
  291. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  292. vector unsigned char *tv;
  293. vector unsigned char pixv;
  294. vector unsigned int sv;
  295. vector signed int sum;
  296. sv = (vector unsigned int)vec_splat_u32(0);
  297. s = 0;
  298. for (i = 0; i < 16; i++) {
  299. /* Read in the potentially unaligned pixels */
  300. tv = (vector unsigned char *) pix;
  301. pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
  302. /* Square the values, and add them to our sum */
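/* vec_msum with unsigned char inputs multiplies the 16 byte pairs and adds
   each group of four products into the matching 32-bit element of sv */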
  303. sv = vec_msum(pixv, pixv, sv);
  304. pix += line_size;
  305. }
  306. /* Sum up the four partial sums, and put the result into s */
  307. sum = vec_sums((vector signed int) sv, (vector signed int) zero);
  308. sum = vec_splat(sum, 3);
  309. vec_ste(sum, 0, &s);
  310. return s;
  311. }
  312. /**
313. * Sum of Squared Errors for an 8x8 block.
  314. * AltiVec-enhanced.
  315. * It's the sad8_altivec code above w/ squaring added.
  316. */
  317. int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  318. {
  319. int i;
  320. int s __attribute__((aligned(16)));
  321. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  322. vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
  323. vector unsigned char t1, t2, t3,t4, t5;
  324. vector unsigned int sum;
  325. vector signed int sumsqr;
  326. sum = (vector unsigned int)vec_splat_u32(0);
  327. permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
  328. for(i=0;i<h;i++) {
  329. /* Read potentially unaligned pixels into t1 and t2
  330. Since we're reading 16 pixels, and actually only want 8,
  331. mask out the last 8 pixels. The 0s don't change the sum. */
  332. perm1 = vec_lvsl(0, pix1);
  333. pix1v = (vector unsigned char *) pix1;
  334. perm2 = vec_lvsl(0, pix2);
  335. pix2v = (vector unsigned char *) pix2;
  336. t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
  337. t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
  338. /*
  339. Since we want to use unsigned chars, we can take advantage
  340. of the fact that abs(a-b)^2 = (a-b)^2.
  341. */
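/* (a-b)^2 == (|a-b|)^2, so the unsigned max/min difference below can be
   squared by vec_msum with no sign handling at all */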
  342. /* Calculate abs differences vector */
  343. t3 = vec_max(t1, t2);
  344. t4 = vec_min(t1, t2);
  345. t5 = vec_sub(t3, t4);
  346. /* Square the values and add them to our sum */
  347. sum = vec_msum(t5, t5, sum);
  348. pix1 += line_size;
  349. pix2 += line_size;
  350. }
  351. /* Sum up the four partial sums, and put the result into s */
  352. sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
  353. sumsqr = vec_splat(sumsqr, 3);
  354. vec_ste(sumsqr, 0, &s);
  355. return s;
  356. }
  357. /**
  358. * Sum of Squared Errors for a 16x16 block.
  359. * AltiVec-enhanced.
  360. * It's the sad16_altivec code above w/ squaring added.
  361. */
  362. int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  363. {
  364. int i;
  365. int s __attribute__((aligned(16)));
  366. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  367. vector unsigned char perm1, perm2, *pix1v, *pix2v;
  368. vector unsigned char t1, t2, t3,t4, t5;
  369. vector unsigned int sum;
  370. vector signed int sumsqr;
  371. sum = (vector unsigned int)vec_splat_u32(0);
  372. for(i=0;i<h;i++) {
  373. /* Read potentially unaligned pixels into t1 and t2 */
  374. perm1 = vec_lvsl(0, pix1);
  375. pix1v = (vector unsigned char *) pix1;
  376. perm2 = vec_lvsl(0, pix2);
  377. pix2v = (vector unsigned char *) pix2;
  378. t1 = vec_perm(pix1v[0], pix1v[1], perm1);
  379. t2 = vec_perm(pix2v[0], pix2v[1], perm2);
  380. /*
  381. Since we want to use unsigned chars, we can take advantage
  382. of the fact that abs(a-b)^2 = (a-b)^2.
  383. */
  384. /* Calculate abs differences vector */
  385. t3 = vec_max(t1, t2);
  386. t4 = vec_min(t1, t2);
  387. t5 = vec_sub(t3, t4);
  388. /* Square the values and add them to our sum */
  389. sum = vec_msum(t5, t5, sum);
  390. pix1 += line_size;
  391. pix2 += line_size;
  392. }
  393. /* Sum up the four partial sums, and put the result into s */
  394. sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
  395. sumsqr = vec_splat(sumsqr, 3);
  396. vec_ste(sumsqr, 0, &s);
  397. return s;
  398. }
  399. int pix_sum_altivec(uint8_t * pix, int line_size)
  400. {
  401. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  402. vector unsigned char perm, *pixv;
  403. vector unsigned char t1;
  404. vector unsigned int sad;
  405. vector signed int sumdiffs;
  406. int i;
  407. int s __attribute__((aligned(16)));
  408. sad = (vector unsigned int)vec_splat_u32(0);
  409. for (i = 0; i < 16; i++) {
  410. /* Read the potentially unaligned 16 pixels into t1 */
  411. perm = vec_lvsl(0, pix);
  412. pixv = (vector unsigned char *) pix;
  413. t1 = vec_perm(pixv[0], pixv[1], perm);
  414. /* Add each 4 pixel group together and put 4 results into sad */
  415. sad = vec_sum4s(t1, sad);
  416. pix += line_size;
  417. }
  418. /* Sum up the four partial sums, and put the result into s */
  419. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  420. sumdiffs = vec_splat(sumdiffs, 3);
  421. vec_ste(sumdiffs, 0, &s);
  422. return s;
  423. }
  424. void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
  425. {
  426. int i;
  427. vector unsigned char perm, bytes, *pixv;
  428. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  429. vector signed short shorts;
  430. for(i=0;i<8;i++)
  431. {
  432. // Read potentially unaligned pixels.
  433. // We're reading 16 pixels, and actually only want 8,
  434. // but we simply ignore the extras.
  435. perm = vec_lvsl(0, pixels);
  436. pixv = (vector unsigned char *) pixels;
  437. bytes = vec_perm(pixv[0], pixv[1], perm);
  438. // convert the bytes into shorts
  439. shorts = (vector signed short)vec_mergeh(zero, bytes);
  440. // save the data to the block, we assume the block is 16-byte aligned
  441. vec_st(shorts, i*16, (vector signed short*)block);
  442. pixels += line_size;
  443. }
  444. }
  445. void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
  446. const uint8_t *s2, int stride)
  447. {
  448. int i;
  449. vector unsigned char perm, bytes, *pixv;
  450. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  451. vector signed short shorts1, shorts2;
  452. for(i=0;i<4;i++)
  453. {
  454. // Read potentially unaligned pixels
  455. // We're reading 16 pixels, and actually only want 8,
  456. // but we simply ignore the extras.
  457. perm = vec_lvsl(0, s1);
  458. pixv = (vector unsigned char *) s1;
  459. bytes = vec_perm(pixv[0], pixv[1], perm);
  460. // convert the bytes into shorts
  461. shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  462. // Do the same for the second block of pixels
  463. perm = vec_lvsl(0, s2);
  464. pixv = (vector unsigned char *) s2;
  465. bytes = vec_perm(pixv[0], pixv[1], perm);
  466. // convert the bytes into shorts
  467. shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  468. // Do the subtraction
  469. shorts1 = vec_sub(shorts1, shorts2);
  470. // save the data to the block, we assume the block is 16-byte aligned
  471. vec_st(shorts1, 0, (vector signed short*)block);
  472. s1 += stride;
  473. s2 += stride;
  474. block += 8;
  475. // The code below is a copy of the code above... This is a manual
  476. // unroll.
  477. // Read potentially unaligned pixels
  478. // We're reading 16 pixels, and actually only want 8,
  479. // but we simply ignore the extras.
  480. perm = vec_lvsl(0, s1);
  481. pixv = (vector unsigned char *) s1;
  482. bytes = vec_perm(pixv[0], pixv[1], perm);
  483. // convert the bytes into shorts
  484. shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  485. // Do the same for the second block of pixels
  486. perm = vec_lvsl(0, s2);
  487. pixv = (vector unsigned char *) s2;
  488. bytes = vec_perm(pixv[0], pixv[1], perm);
  489. // convert the bytes into shorts
  490. shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  491. // Do the subtraction
  492. shorts1 = vec_sub(shorts1, shorts2);
  493. // save the data to the block, we assume the block is 16-byte aligned
  494. vec_st(shorts1, 0, (vector signed short*)block);
  495. s1 += stride;
  496. s2 += stride;
  497. block += 8;
  498. }
  499. }
  500. void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
  501. register int i;
  502. register vector unsigned char vdst, vsrc;
503. /* dst and src are 16-byte aligned (guaranteed) */
  504. for(i = 0 ; (i + 15) < w ; i+=16)
  505. {
  506. vdst = vec_ld(i, (unsigned char*)dst);
  507. vsrc = vec_ld(i, (unsigned char*)src);
  508. vdst = vec_add(vsrc, vdst);
  509. vec_st(vdst, i, (unsigned char*)dst);
  510. }
  511. /* if w is not a multiple of 16 */
  512. for (; (i < w) ; i++)
  513. {
514. dst[i] = src[i] + dst[i];
  515. }
  516. }
  517. /* next one assumes that ((line_size % 16) == 0) */
  518. void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  519. {
  520. POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
  521. register vector unsigned char pixelsv1, pixelsv2;
  522. register vector unsigned char pixelsv1B, pixelsv2B;
  523. register vector unsigned char pixelsv1C, pixelsv2C;
  524. register vector unsigned char pixelsv1D, pixelsv2D;
  525. register vector unsigned char perm = vec_lvsl(0, pixels);
  526. int i;
  527. register int line_size_2 = line_size << 1;
  528. register int line_size_3 = line_size + line_size_2;
  529. register int line_size_4 = line_size << 2;
  530. POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
  531. // hand-unrolling the loop by 4 gains about 15%
532. // minimum execution time goes from 74 to 60 cycles
  533. // it's faster than -funroll-loops, but using
  534. // -funroll-loops w/ this is bad - 74 cycles again.
  535. // all this is on a 7450, tuning for the 7450
  536. #if 0
  537. for(i=0; i<h; i++) {
  538. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  539. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  540. vec_st(vec_perm(pixelsv1, pixelsv2, perm),
  541. 0, (unsigned char*)block);
  542. pixels+=line_size;
  543. block +=line_size;
  544. }
  545. #else
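// Note: the second load of each pair uses offset 15 rather than 16, so it
// only touches the following 16-byte block when the source is actually
// misaligned; for an aligned source it just reloads the same block instead
// of reading past the end of the row.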
  546. for(i=0; i<h; i+=4) {
  547. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  548. pixelsv2 = vec_ld(15, (unsigned char*)pixels);
  549. pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
  550. pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
  551. pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
  552. pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
  553. pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
  554. pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
  555. vec_st(vec_perm(pixelsv1, pixelsv2, perm),
  556. 0, (unsigned char*)block);
  557. vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
  558. line_size, (unsigned char*)block);
  559. vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
  560. line_size_2, (unsigned char*)block);
  561. vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
  562. line_size_3, (unsigned char*)block);
  563. pixels+=line_size_4;
  564. block +=line_size_4;
  565. }
  566. #endif
  567. POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
  568. }
  569. /* next one assumes that ((line_size % 16) == 0) */
  570. #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
  571. void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  572. {
  573. POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
  574. register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  575. register vector unsigned char perm = vec_lvsl(0, pixels);
  576. int i;
  577. POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
  578. for(i=0; i<h; i++) {
  579. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  580. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  581. blockv = vec_ld(0, block);
  582. pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
  583. blockv = vec_avg(blockv,pixelsv);
  584. vec_st(blockv, 0, (unsigned char*)block);
  585. pixels+=line_size;
  586. block +=line_size;
  587. }
  588. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
  589. }
  590. /* next one assumes that ((line_size % 8) == 0) */
  591. void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
  592. {
  593. POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
  594. register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  595. int i;
  596. POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
  597. for (i = 0; i < h; i++) {
598. /*
599. block is 8-byte aligned, so we're either in the left half of a
600. 16-byte aligned block (rightside == 0) or in the right half (rightside != 0)
601. */
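/*
   vcprm() (a helper macro from the AltiVec headers included above) builds a
   word-granularity permute mask: indices 0-3 pick 32-bit words from the
   first operand, s0-s3 from the second. The selects below therefore keep the
   untouched half of blockv and drop the 8 new pixels into whichever half
   block actually points at.
*/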
  602. int rightside = ((unsigned long)block & 0x0000000F);
  603. blockv = vec_ld(0, block);
  604. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  605. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  606. pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
  607. if (rightside)
  608. {
  609. pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
  610. }
  611. else
  612. {
  613. pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
  614. }
  615. blockv = vec_avg(blockv, pixelsv);
  616. vec_st(blockv, 0, block);
  617. pixels += line_size;
  618. block += line_size;
  619. }
  620. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
  621. }
  622. /* next one assumes that ((line_size % 8) == 0) */
  623. void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  624. {
  625. POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
  626. register int i;
  627. register vector unsigned char
  628. pixelsv1, pixelsv2,
  629. pixelsavg;
  630. register vector unsigned char
  631. blockv, temp1, temp2;
  632. register vector unsigned short
  633. pixelssum1, pixelssum2, temp3;
  634. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  635. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  636. temp1 = vec_ld(0, pixels);
  637. temp2 = vec_ld(16, pixels);
  638. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  639. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  640. {
  641. pixelsv2 = temp2;
  642. }
  643. else
  644. {
  645. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  646. }
  647. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  648. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  649. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  650. (vector unsigned short)pixelsv2);
  651. pixelssum1 = vec_add(pixelssum1, vctwo);
  652. POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
  653. for (i = 0; i < h ; i++) {
  654. int rightside = ((unsigned long)block & 0x0000000F);
  655. blockv = vec_ld(0, block);
  656. temp1 = vec_ld(line_size, pixels);
  657. temp2 = vec_ld(line_size + 16, pixels);
  658. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  659. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  660. {
  661. pixelsv2 = temp2;
  662. }
  663. else
  664. {
  665. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  666. }
  667. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  668. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  669. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  670. (vector unsigned short)pixelsv2);
  671. temp3 = vec_add(pixelssum1, pixelssum2);
  672. temp3 = vec_sra(temp3, vctwo);
  673. pixelssum1 = vec_add(pixelssum2, vctwo);
  674. pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  675. if (rightside)
  676. {
  677. blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  678. }
  679. else
  680. {
  681. blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  682. }
  683. vec_st(blockv, 0, block);
  684. block += line_size;
  685. pixels += line_size;
  686. }
  687. POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
  688. }
  689. /* next one assumes that ((line_size % 8) == 0) */
  690. void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  691. {
  692. POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
  693. register int i;
  694. register vector unsigned char
  695. pixelsv1, pixelsv2,
  696. pixelsavg;
  697. register vector unsigned char
  698. blockv, temp1, temp2;
  699. register vector unsigned short
  700. pixelssum1, pixelssum2, temp3;
  701. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  702. register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
  703. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  704. temp1 = vec_ld(0, pixels);
  705. temp2 = vec_ld(16, pixels);
  706. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  707. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  708. {
  709. pixelsv2 = temp2;
  710. }
  711. else
  712. {
  713. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  714. }
  715. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  716. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  717. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  718. (vector unsigned short)pixelsv2);
  719. pixelssum1 = vec_add(pixelssum1, vcone);
  720. POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
  721. for (i = 0; i < h ; i++) {
  722. int rightside = ((unsigned long)block & 0x0000000F);
  723. blockv = vec_ld(0, block);
  724. temp1 = vec_ld(line_size, pixels);
  725. temp2 = vec_ld(line_size + 16, pixels);
  726. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  727. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  728. {
  729. pixelsv2 = temp2;
  730. }
  731. else
  732. {
  733. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  734. }
  735. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  736. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  737. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  738. (vector unsigned short)pixelsv2);
  739. temp3 = vec_add(pixelssum1, pixelssum2);
  740. temp3 = vec_sra(temp3, vctwo);
  741. pixelssum1 = vec_add(pixelssum2, vcone);
  742. pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  743. if (rightside)
  744. {
  745. blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  746. }
  747. else
  748. {
  749. blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  750. }
  751. vec_st(blockv, 0, block);
  752. block += line_size;
  753. pixels += line_size;
  754. }
  755. POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
  756. }
  757. /* next one assumes that ((line_size % 16) == 0) */
  758. void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
  759. {
  760. POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
  761. register int i;
  762. register vector unsigned char
  763. pixelsv1, pixelsv2, pixelsv3, pixelsv4;
  764. register vector unsigned char
  765. blockv, temp1, temp2;
  766. register vector unsigned short
  767. pixelssum1, pixelssum2, temp3,
  768. pixelssum3, pixelssum4, temp4;
  769. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  770. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  771. POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
  772. temp1 = vec_ld(0, pixels);
  773. temp2 = vec_ld(16, pixels);
  774. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  775. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  776. {
  777. pixelsv2 = temp2;
  778. }
  779. else
  780. {
  781. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  782. }
  783. pixelsv3 = vec_mergel(vczero, pixelsv1);
  784. pixelsv4 = vec_mergel(vczero, pixelsv2);
  785. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  786. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  787. pixelssum3 = vec_add((vector unsigned short)pixelsv3,
  788. (vector unsigned short)pixelsv4);
  789. pixelssum3 = vec_add(pixelssum3, vctwo);
  790. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  791. (vector unsigned short)pixelsv2);
  792. pixelssum1 = vec_add(pixelssum1, vctwo);
  793. for (i = 0; i < h ; i++) {
  794. blockv = vec_ld(0, block);
  795. temp1 = vec_ld(line_size, pixels);
  796. temp2 = vec_ld(line_size + 16, pixels);
  797. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  798. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  799. {
  800. pixelsv2 = temp2;
  801. }
  802. else
  803. {
  804. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  805. }
  806. pixelsv3 = vec_mergel(vczero, pixelsv1);
  807. pixelsv4 = vec_mergel(vczero, pixelsv2);
  808. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  809. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  810. pixelssum4 = vec_add((vector unsigned short)pixelsv3,
  811. (vector unsigned short)pixelsv4);
  812. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  813. (vector unsigned short)pixelsv2);
  814. temp4 = vec_add(pixelssum3, pixelssum4);
  815. temp4 = vec_sra(temp4, vctwo);
  816. temp3 = vec_add(pixelssum1, pixelssum2);
  817. temp3 = vec_sra(temp3, vctwo);
  818. pixelssum3 = vec_add(pixelssum4, vctwo);
  819. pixelssum1 = vec_add(pixelssum2, vctwo);
  820. blockv = vec_packsu(temp3, temp4);
  821. vec_st(blockv, 0, block);
  822. block += line_size;
  823. pixels += line_size;
  824. }
  825. POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
  826. }
  827. /* next one assumes that ((line_size % 16) == 0) */
  828. void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
  829. {
  830. POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
  831. register int i;
  832. register vector unsigned char
  833. pixelsv1, pixelsv2, pixelsv3, pixelsv4;
  834. register vector unsigned char
  835. blockv, temp1, temp2;
  836. register vector unsigned short
  837. pixelssum1, pixelssum2, temp3,
  838. pixelssum3, pixelssum4, temp4;
  839. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  840. register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
  841. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  842. POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
  843. temp1 = vec_ld(0, pixels);
  844. temp2 = vec_ld(16, pixels);
  845. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  846. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  847. {
  848. pixelsv2 = temp2;
  849. }
  850. else
  851. {
  852. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  853. }
  854. pixelsv3 = vec_mergel(vczero, pixelsv1);
  855. pixelsv4 = vec_mergel(vczero, pixelsv2);
  856. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  857. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  858. pixelssum3 = vec_add((vector unsigned short)pixelsv3,
  859. (vector unsigned short)pixelsv4);
  860. pixelssum3 = vec_add(pixelssum3, vcone);
  861. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  862. (vector unsigned short)pixelsv2);
  863. pixelssum1 = vec_add(pixelssum1, vcone);
  864. for (i = 0; i < h ; i++) {
  865. blockv = vec_ld(0, block);
  866. temp1 = vec_ld(line_size, pixels);
  867. temp2 = vec_ld(line_size + 16, pixels);
  868. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  869. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  870. {
  871. pixelsv2 = temp2;
  872. }
  873. else
  874. {
  875. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  876. }
  877. pixelsv3 = vec_mergel(vczero, pixelsv1);
  878. pixelsv4 = vec_mergel(vczero, pixelsv2);
  879. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  880. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  881. pixelssum4 = vec_add((vector unsigned short)pixelsv3,
  882. (vector unsigned short)pixelsv4);
  883. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  884. (vector unsigned short)pixelsv2);
  885. temp4 = vec_add(pixelssum3, pixelssum4);
  886. temp4 = vec_sra(temp4, vctwo);
  887. temp3 = vec_add(pixelssum1, pixelssum2);
  888. temp3 = vec_sra(temp3, vctwo);
  889. pixelssum3 = vec_add(pixelssum4, vcone);
  890. pixelssum1 = vec_add(pixelssum2, vcone);
  891. blockv = vec_packsu(temp3, temp4);
  892. vec_st(blockv, 0, block);
  893. block += line_size;
  894. pixels += line_size;
  895. }
  896. POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
  897. }
  898. int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
  899. POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
  900. int sum;
  901. register const_vector unsigned char vzero =
  902. (const_vector unsigned char)vec_splat_u8(0);
  903. register vector signed short temp0, temp1, temp2, temp3, temp4,
  904. temp5, temp6, temp7;
  905. POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
  906. {
  907. register const_vector signed short vprod1 =(const_vector signed short)
  908. AVV( 1,-1, 1,-1, 1,-1, 1,-1);
  909. register const_vector signed short vprod2 =(const_vector signed short)
  910. AVV( 1, 1,-1,-1, 1, 1,-1,-1);
  911. register const_vector signed short vprod3 =(const_vector signed short)
  912. AVV( 1, 1, 1, 1,-1,-1,-1,-1);
  913. register const_vector unsigned char perm1 = (const_vector unsigned char)
  914. AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
  915. 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
  916. register const_vector unsigned char perm2 = (const_vector unsigned char)
  917. AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
  918. 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
  919. register const_vector unsigned char perm3 = (const_vector unsigned char)
  920. AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  921. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
  922. #define ONEITERBUTTERFLY(i, res) \
  923. { \
  924. register vector unsigned char src1, src2, srcO; \
  925. register vector unsigned char dst1, dst2, dstO; \
  926. register vector signed short srcV, dstV; \
  927. register vector signed short but0, but1, but2, op1, op2, op3; \
  928. src1 = vec_ld(stride * i, src); \
  929. src2 = vec_ld((stride * i) + 15, src); \
  930. srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
  931. dst1 = vec_ld(stride * i, dst); \
  932. dst2 = vec_ld((stride * i) + 15, dst); \
  933. dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
  934. /* promote the unsigned chars to signed shorts */ \
  935. /* we're in the 8x8 function, we only care for the first 8 */ \
  936. srcV = \
  937. (vector signed short)vec_mergeh((vector signed char)vzero, \
  938. (vector signed char)srcO); \
  939. dstV = \
  940. (vector signed short)vec_mergeh((vector signed char)vzero, \
  941. (vector signed char)dstO); \
942. /* subtractions inside the first butterfly */ \
  943. but0 = vec_sub(srcV, dstV); \
  944. op1 = vec_perm(but0, but0, perm1); \
  945. but1 = vec_mladd(but0, vprod1, op1); \
  946. op2 = vec_perm(but1, but1, perm2); \
  947. but2 = vec_mladd(but1, vprod2, op2); \
  948. op3 = vec_perm(but2, but2, perm3); \
  949. res = vec_mladd(but2, vprod3, op3); \
  950. }
  951. ONEITERBUTTERFLY(0, temp0);
  952. ONEITERBUTTERFLY(1, temp1);
  953. ONEITERBUTTERFLY(2, temp2);
  954. ONEITERBUTTERFLY(3, temp3);
  955. ONEITERBUTTERFLY(4, temp4);
  956. ONEITERBUTTERFLY(5, temp5);
  957. ONEITERBUTTERFLY(6, temp6);
  958. ONEITERBUTTERFLY(7, temp7);
  959. }
  960. #undef ONEITERBUTTERFLY
  961. {
  962. register vector signed int vsum;
  963. register vector signed short line0 = vec_add(temp0, temp1);
  964. register vector signed short line1 = vec_sub(temp0, temp1);
  965. register vector signed short line2 = vec_add(temp2, temp3);
  966. register vector signed short line3 = vec_sub(temp2, temp3);
  967. register vector signed short line4 = vec_add(temp4, temp5);
  968. register vector signed short line5 = vec_sub(temp4, temp5);
  969. register vector signed short line6 = vec_add(temp6, temp7);
  970. register vector signed short line7 = vec_sub(temp6, temp7);
  971. register vector signed short line0B = vec_add(line0, line2);
  972. register vector signed short line2B = vec_sub(line0, line2);
  973. register vector signed short line1B = vec_add(line1, line3);
  974. register vector signed short line3B = vec_sub(line1, line3);
  975. register vector signed short line4B = vec_add(line4, line6);
  976. register vector signed short line6B = vec_sub(line4, line6);
  977. register vector signed short line5B = vec_add(line5, line7);
  978. register vector signed short line7B = vec_sub(line5, line7);
  979. register vector signed short line0C = vec_add(line0B, line4B);
  980. register vector signed short line4C = vec_sub(line0B, line4B);
  981. register vector signed short line1C = vec_add(line1B, line5B);
  982. register vector signed short line5C = vec_sub(line1B, line5B);
  983. register vector signed short line2C = vec_add(line2B, line6B);
  984. register vector signed short line6C = vec_sub(line2B, line6B);
  985. register vector signed short line3C = vec_add(line3B, line7B);
  986. register vector signed short line7C = vec_sub(line3B, line7B);
  987. vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
  988. vsum = vec_sum4s(vec_abs(line1C), vsum);
  989. vsum = vec_sum4s(vec_abs(line2C), vsum);
  990. vsum = vec_sum4s(vec_abs(line3C), vsum);
  991. vsum = vec_sum4s(vec_abs(line4C), vsum);
  992. vsum = vec_sum4s(vec_abs(line5C), vsum);
  993. vsum = vec_sum4s(vec_abs(line6C), vsum);
  994. vsum = vec_sum4s(vec_abs(line7C), vsum);
  995. vsum = vec_sums(vsum, (vector signed int)vzero);
  996. vsum = vec_splat(vsum, 3);
  997. vec_ste(vsum, 0, &sum);
  998. }
  999. POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
  1000. return sum;
  1001. }
1002. /*
1003. 16x8 works on 16 elements at a time; it avoids replicating
1004. loads and gives the compiler more room for scheduling.
1005. It's only used from inside hadamard8_diff16_altivec.
1006. Unfortunately, gcc-3.3 seems to be a bit dumb: the compiled
1007. code contains a LOT of spill code, as gcc (unlike xlc)
1008. apparently cannot keep everything in registers by itself.
1009. The following code therefore includes hand-made register
1010. allocation. It's not clean, but on a 7450 the resulting
1011. code is much faster (the best case falls from 700+ cycles
1012. to 550).
1013. xlc adds no spill code, but it doesn't know how to schedule
1014. for the 7450, and its code isn't much faster than gcc-3.3's
1015. on the 7450 (though it uses 25% fewer instructions...).
1016. On the 970, the hand-made register allocation is still a win
1017. (around 690 cycles vs. around 780), but xlc gets down to
1018. around 660 on the regular C code...
1019. */
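/*
   The REG_v() annotations below presumably expand (via gcc_fixes.h) to an
   explicit register binding when the compiler supports it; a minimal sketch
   of the intended effect in plain gcc syntax would be:

       register vector signed short temp0 asm("v0");

   i.e. the variable is pinned to a specific AltiVec register instead of
   letting the register allocator choose.
*/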
  1020. static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
  1021. int sum;
  1022. register vector signed short
  1023. temp0 REG_v(v0),
  1024. temp1 REG_v(v1),
  1025. temp2 REG_v(v2),
  1026. temp3 REG_v(v3),
  1027. temp4 REG_v(v4),
  1028. temp5 REG_v(v5),
  1029. temp6 REG_v(v6),
  1030. temp7 REG_v(v7);
  1031. register vector signed short
  1032. temp0S REG_v(v8),
  1033. temp1S REG_v(v9),
  1034. temp2S REG_v(v10),
  1035. temp3S REG_v(v11),
  1036. temp4S REG_v(v12),
  1037. temp5S REG_v(v13),
  1038. temp6S REG_v(v14),
  1039. temp7S REG_v(v15);
  1040. register const_vector unsigned char vzero REG_v(v31)=
  1041. (const_vector unsigned char)vec_splat_u8(0);
  1042. {
  1043. register const_vector signed short vprod1 REG_v(v16)=
  1044. (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
  1045. register const_vector signed short vprod2 REG_v(v17)=
  1046. (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
  1047. register const_vector signed short vprod3 REG_v(v18)=
  1048. (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
  1049. register const_vector unsigned char perm1 REG_v(v19)=
  1050. (const_vector unsigned char)
  1051. AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
  1052. 0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
  1053. register const_vector unsigned char perm2 REG_v(v20)=
  1054. (const_vector unsigned char)
  1055. AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
  1056. 0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
  1057. register const_vector unsigned char perm3 REG_v(v21)=
  1058. (const_vector unsigned char)
  1059. AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
  1060. 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
  1061. #define ONEITERBUTTERFLY(i, res1, res2) \
  1062. { \
  1063. register vector unsigned char src1 REG_v(v22), \
  1064. src2 REG_v(v23), \
  1065. dst1 REG_v(v24), \
  1066. dst2 REG_v(v25), \
  1067. srcO REG_v(v22), \
  1068. dstO REG_v(v23); \
  1069. \
  1070. register vector signed short srcV REG_v(v24), \
  1071. dstV REG_v(v25), \
  1072. srcW REG_v(v26), \
  1073. dstW REG_v(v27), \
  1074. but0 REG_v(v28), \
  1075. but0S REG_v(v29), \
  1076. op1 REG_v(v30), \
  1077. but1 REG_v(v22), \
  1078. op1S REG_v(v23), \
  1079. but1S REG_v(v24), \
  1080. op2 REG_v(v25), \
  1081. but2 REG_v(v26), \
  1082. op2S REG_v(v27), \
  1083. but2S REG_v(v28), \
  1084. op3 REG_v(v29), \
  1085. op3S REG_v(v30); \
  1086. \
  1087. src1 = vec_ld(stride * i, src); \
  1088. src2 = vec_ld((stride * i) + 16, src); \
  1089. srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
  1090. dst1 = vec_ld(stride * i, dst); \
  1091. dst2 = vec_ld((stride * i) + 16, dst); \
  1092. dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
  1093. /* promote the unsigned chars to signed shorts */ \
  1094. srcV = \
  1095. (vector signed short)vec_mergeh((vector signed char)vzero, \
  1096. (vector signed char)srcO); \
  1097. dstV = \
  1098. (vector signed short)vec_mergeh((vector signed char)vzero, \
  1099. (vector signed char)dstO); \
  1100. srcW = \
  1101. (vector signed short)vec_mergel((vector signed char)vzero, \
  1102. (vector signed char)srcO); \
  1103. dstW = \
  1104. (vector signed short)vec_mergel((vector signed char)vzero, \
  1105. (vector signed char)dstO); \
1106. /* subtractions inside the first butterfly */ \
  1107. but0 = vec_sub(srcV, dstV); \
  1108. but0S = vec_sub(srcW, dstW); \
  1109. op1 = vec_perm(but0, but0, perm1); \
  1110. but1 = vec_mladd(but0, vprod1, op1); \
  1111. op1S = vec_perm(but0S, but0S, perm1); \
  1112. but1S = vec_mladd(but0S, vprod1, op1S); \
  1113. op2 = vec_perm(but1, but1, perm2); \
  1114. but2 = vec_mladd(but1, vprod2, op2); \
  1115. op2S = vec_perm(but1S, but1S, perm2); \
  1116. but2S = vec_mladd(but1S, vprod2, op2S); \
  1117. op3 = vec_perm(but2, but2, perm3); \
  1118. res1 = vec_mladd(but2, vprod3, op3); \
  1119. op3S = vec_perm(but2S, but2S, perm3); \
  1120. res2 = vec_mladd(but2S, vprod3, op3S); \
  1121. }
  1122. ONEITERBUTTERFLY(0, temp0, temp0S);
  1123. ONEITERBUTTERFLY(1, temp1, temp1S);
  1124. ONEITERBUTTERFLY(2, temp2, temp2S);
  1125. ONEITERBUTTERFLY(3, temp3, temp3S);
  1126. ONEITERBUTTERFLY(4, temp4, temp4S);
  1127. ONEITERBUTTERFLY(5, temp5, temp5S);
  1128. ONEITERBUTTERFLY(6, temp6, temp6S);
  1129. ONEITERBUTTERFLY(7, temp7, temp7S);
  1130. }
  1131. #undef ONEITERBUTTERFLY
  1132. {
  1133. register vector signed int vsum;
  1134. register vector signed short line0S, line1S, line2S, line3S, line4S,
  1135. line5S, line6S, line7S, line0BS,line2BS,
  1136. line1BS,line3BS,line4BS,line6BS,line5BS,
  1137. line7BS,line0CS,line4CS,line1CS,line5CS,
  1138. line2CS,line6CS,line3CS,line7CS;
  1139. register vector signed short line0 = vec_add(temp0, temp1);
  1140. register vector signed short line1 = vec_sub(temp0, temp1);
  1141. register vector signed short line2 = vec_add(temp2, temp3);
  1142. register vector signed short line3 = vec_sub(temp2, temp3);
  1143. register vector signed short line4 = vec_add(temp4, temp5);
  1144. register vector signed short line5 = vec_sub(temp4, temp5);
  1145. register vector signed short line6 = vec_add(temp6, temp7);
  1146. register vector signed short line7 = vec_sub(temp6, temp7);
  1147. register vector signed short line0B = vec_add(line0, line2);
  1148. register vector signed short line2B = vec_sub(line0, line2);
  1149. register vector signed short line1B = vec_add(line1, line3);
  1150. register vector signed short line3B = vec_sub(line1, line3);
  1151. register vector signed short line4B = vec_add(line4, line6);
  1152. register vector signed short line6B = vec_sub(line4, line6);
  1153. register vector signed short line5B = vec_add(line5, line7);
  1154. register vector signed short line7B = vec_sub(line5, line7);
  1155. register vector signed short line0C = vec_add(line0B, line4B);
  1156. register vector signed short line4C = vec_sub(line0B, line4B);
  1157. register vector signed short line1C = vec_add(line1B, line5B);
  1158. register vector signed short line5C = vec_sub(line1B, line5B);
  1159. register vector signed short line2C = vec_add(line2B, line6B);
  1160. register vector signed short line6C = vec_sub(line2B, line6B);
  1161. register vector signed short line3C = vec_add(line3B, line7B);
  1162. register vector signed short line7C = vec_sub(line3B, line7B);
  1163. vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
  1164. vsum = vec_sum4s(vec_abs(line1C), vsum);
  1165. vsum = vec_sum4s(vec_abs(line2C), vsum);
  1166. vsum = vec_sum4s(vec_abs(line3C), vsum);
  1167. vsum = vec_sum4s(vec_abs(line4C), vsum);
  1168. vsum = vec_sum4s(vec_abs(line5C), vsum);
  1169. vsum = vec_sum4s(vec_abs(line6C), vsum);
  1170. vsum = vec_sum4s(vec_abs(line7C), vsum);
  1171. line0S = vec_add(temp0S, temp1S);
  1172. line1S = vec_sub(temp0S, temp1S);
  1173. line2S = vec_add(temp2S, temp3S);
  1174. line3S = vec_sub(temp2S, temp3S);
  1175. line4S = vec_add(temp4S, temp5S);
  1176. line5S = vec_sub(temp4S, temp5S);
  1177. line6S = vec_add(temp6S, temp7S);
  1178. line7S = vec_sub(temp6S, temp7S);
  1179. line0BS = vec_add(line0S, line2S);
  1180. line2BS = vec_sub(line0S, line2S);
  1181. line1BS = vec_add(line1S, line3S);
  1182. line3BS = vec_sub(line1S, line3S);
  1183. line4BS = vec_add(line4S, line6S);
  1184. line6BS = vec_sub(line4S, line6S);
  1185. line5BS = vec_add(line5S, line7S);
  1186. line7BS = vec_sub(line5S, line7S);
  1187. line0CS = vec_add(line0BS, line4BS);
  1188. line4CS = vec_sub(line0BS, line4BS);
  1189. line1CS = vec_add(line1BS, line5BS);
  1190. line5CS = vec_sub(line1BS, line5BS);
  1191. line2CS = vec_add(line2BS, line6BS);
  1192. line6CS = vec_sub(line2BS, line6BS);
  1193. line3CS = vec_add(line3BS, line7BS);
  1194. line7CS = vec_sub(line3BS, line7BS);
  1195. vsum = vec_sum4s(vec_abs(line0CS), vsum);
  1196. vsum = vec_sum4s(vec_abs(line1CS), vsum);
  1197. vsum = vec_sum4s(vec_abs(line2CS), vsum);
  1198. vsum = vec_sum4s(vec_abs(line3CS), vsum);
  1199. vsum = vec_sum4s(vec_abs(line4CS), vsum);
  1200. vsum = vec_sum4s(vec_abs(line5CS), vsum);
  1201. vsum = vec_sum4s(vec_abs(line6CS), vsum);
  1202. vsum = vec_sum4s(vec_abs(line7CS), vsum);
  1203. vsum = vec_sums(vsum, (vector signed int)vzero);
  1204. vsum = vec_splat(vsum, 3);
  1205. vec_ste(vsum, 0, &sum);
  1206. }
  1207. return sum;
  1208. }
  1209. int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
  1210. POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
  1211. int score;
  1212. POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
  1213. score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
  1214. if (h==16) {
  1215. dst += 8*stride;
  1216. src += 8*stride;
  1217. score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
  1218. }
  1219. POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
  1220. return score;
  1221. }
  1222. int has_altivec(void)
  1223. {
  1224. #ifdef __AMIGAOS4__
  1225. ULONG result = 0;
  1226. extern struct ExecIFace *IExec;
  1227. IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
  1228. if (result == VECTORTYPE_ALTIVEC) return 1;
  1229. return 0;
  1230. #else /* __AMIGAOS4__ */
  1231. #ifdef CONFIG_DARWIN
  1232. int sels[2] = {CTL_HW, HW_VECTORUNIT};
  1233. int has_vu = 0;
  1234. size_t len = sizeof(has_vu);
  1235. int err;
  1236. err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
  1237. if (err == 0) return (has_vu != 0);
  1238. #else /* CONFIG_DARWIN */
1239. /* not Darwin, so do it the brute-force way */
  1240. /* this is borrowed from the libmpeg2 library */
  1241. {
  1242. signal (SIGILL, sigill_handler);
  1243. if (sigsetjmp (jmpbuf, 1)) {
  1244. signal (SIGILL, SIG_DFL);
  1245. } else {
  1246. canjump = 1;
  1247. asm volatile ("mtspr 256, %0\n\t"
  1248. "vand %%v0, %%v0, %%v0"
  1249. :
  1250. : "r" (-1));
  1251. signal (SIGILL, SIG_DFL);
  1252. return 1;
  1253. }
  1254. }
  1255. #endif /* CONFIG_DARWIN */
  1256. return 0;
  1257. #endif /* __AMIGAOS4__ */
  1258. }
  1259. static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
  1260. int blocksize)
  1261. {
  1262. int i;
  1263. vector float m, a;
  1264. vector bool int t0, t1;
  1265. const vector unsigned int v_31 = //XXX
  1266. vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
  1267. for(i=0; i<blocksize; i+=4) {
  1268. m = vec_ld(0, mag+i);
  1269. a = vec_ld(0, ang+i);
  1270. t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
  1271. t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
  1272. a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
  1273. t0 = (vector bool int)vec_and(a, t1);
  1274. t1 = (vector bool int)vec_andc(a, t1);
  1275. a = vec_sub(m, (vector float)t1);
  1276. m = vec_add(m, (vector float)t0);
  1277. vec_stl(a, 0, ang+i);
  1278. vec_stl(m, 0, mag+i);
  1279. }
  1280. }
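/*
   Roughly equivalent scalar logic, for readability (a sketch of what the
   branchless vector code above computes, not the decoder's own C version):

   for (i = 0; i < blocksize; i++) {
       float m = mag[i], a = ang[i];
       if (m <= 0.0)
           a = -a;                  // t0 flips the sign of ang where mag <= 0
       if (ang[i] > 0.0) {          // original sign of ang selects the branch
           ang[i] = m - a;
       } else {
           ang[i] = m;
           mag[i] = m + a;
       }
   }
*/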
  1281. /* next one assumes that ((line_size % 8) == 0) */
  1282. void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  1283. {
  1284. POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
  1285. register int i;
  1286. register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
  1287. register vector unsigned char blockv, temp1, temp2, blocktemp;
  1288. register vector unsigned short pixelssum1, pixelssum2, temp3;
  1289. register const_vector unsigned char vczero = (const_vector unsigned char)
  1290. vec_splat_u8(0);
  1291. register const_vector unsigned short vctwo = (const_vector unsigned short)
  1292. vec_splat_u16(2);
  1293. temp1 = vec_ld(0, pixels);
  1294. temp2 = vec_ld(16, pixels);
  1295. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  1296. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
  1297. pixelsv2 = temp2;
  1298. } else {
  1299. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  1300. }
  1301. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  1302. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  1303. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  1304. (vector unsigned short)pixelsv2);
  1305. pixelssum1 = vec_add(pixelssum1, vctwo);
  1306. POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
  1307. for (i = 0; i < h ; i++) {
  1308. int rightside = ((unsigned long)block & 0x0000000F);
  1309. blockv = vec_ld(0, block);
  1310. temp1 = vec_ld(line_size, pixels);
  1311. temp2 = vec_ld(line_size + 16, pixels);
  1312. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  1313. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  1314. {
  1315. pixelsv2 = temp2;
  1316. } else {
  1317. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  1318. }
  1319. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  1320. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  1321. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  1322. (vector unsigned short)pixelsv2);
  1323. temp3 = vec_add(pixelssum1, pixelssum2);
  1324. temp3 = vec_sra(temp3, vctwo);
  1325. pixelssum1 = vec_add(pixelssum2, vctwo);
  1326. pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  1327. if (rightside) {
  1328. blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  1329. } else {
  1330. blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  1331. }
  1332. blockv = vec_avg(blocktemp, blockv);
  1333. vec_st(blockv, 0, block);
  1334. block += line_size;
  1335. pixels += line_size;
  1336. }
  1337. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
  1338. }
  1339. void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
  1340. {
  1341. c->pix_abs[0][1] = sad16_x2_altivec;
  1342. c->pix_abs[0][2] = sad16_y2_altivec;
  1343. c->pix_abs[0][3] = sad16_xy2_altivec;
  1344. c->pix_abs[0][0] = sad16_altivec;
  1345. c->pix_abs[1][0] = sad8_altivec;
  1346. c->sad[0]= sad16_altivec;
  1347. c->sad[1]= sad8_altivec;
  1348. c->pix_norm1 = pix_norm1_altivec;
  1349. c->sse[1]= sse8_altivec;
  1350. c->sse[0]= sse16_altivec;
  1351. c->pix_sum = pix_sum_altivec;
  1352. c->diff_pixels = diff_pixels_altivec;
  1353. c->get_pixels = get_pixels_altivec;
  1354. c->add_bytes= add_bytes_altivec;
  1355. c->put_pixels_tab[0][0] = put_pixels16_altivec;
  1356. /* the two functions do the same thing, so use the same code */
  1357. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
  1358. c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
  1359. c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
  1360. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
  1361. c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
  1362. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
  1363. c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
  1364. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
  1365. c->hadamard8_diff[0] = hadamard8_diff16_altivec;
  1366. c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
  1367. #ifdef CONFIG_VORBIS_DECODER
  1368. c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
  1369. #endif
  1370. }
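/*
   How this init is reached, roughly (a sketch; the real wiring lives in the
   PowerPC dsputil init code, and the exact names there may differ):

   void dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx)
   {
       if (has_altivec()) {
           mm_flags |= MM_ALTIVEC;
           dsputil_init_altivec(c, avctx);
       }
   }
*/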