/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"

#ifdef SYS_DARWIN
#include <sys/sysctl.h>
#elif __AMIGAOS4__
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#else
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }
    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* SYS_DARWIN */
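
/*
 * All of the SAD routines below use the same pattern for reading data that
 * is not known to be 16-byte aligned: load the two quadwords that straddle
 * the address (tv[0], tv[1]) and merge them with vec_perm() using the shift
 * mask returned by vec_lvsl(). The per-line absolute differences are then
 * accumulated four lanes at a time with vec_sum4s() and finally collapsed
 * to a single scalar with vec_sums() / vec_ste().
 */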
int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);
    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
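
/*
 * pix_norm1 and the sse functions further below rely on vec_msum(v, v, acc),
 * which multiplies the 16 unsigned byte lanes of v by themselves and adds
 * the products into the four 32-bit accumulator lanes, i.e. a running sum
 * of squares in a single instruction.
 */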
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);
    s = 0;

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
}
/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    DECLARE_ALIGNED_16(int, s);
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;
    int i;
    DECLARE_ALIGNED_16(int, s);

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
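
/*
 * get_pixels/diff_pixels widen the 8 leftmost unsigned pixel bytes of each
 * row to 16-bit DCT coefficients by merging them with a zero vector:
 * vec_mergeh(zero, bytes) interleaves a zero high byte in front of every
 * pixel byte, which on big-endian PowerPC is exactly a zero extension to
 * signed shorts.
 */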
void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16, the scalar tail must add as well */
    for (; i < w; i++) {
        dst[i] += src[i];
    }
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // hand-unrolling the loop by 4 gains about 15%;
    // minimum execution time goes from 74 to 60 cycles.
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld(0, (unsigned char*)pixels);
        pixelsv2  = vec_ld(15, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
}
/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
}
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, block);
        pixels += line_size;
        block += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
}
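
/*
 * The *_xy2 routines below compute the two-dimensional half-pel
 * interpolation dst = (A + B + C + D + 2) >> 2 (with +1 instead of +2 in
 * the no-rounding variants), where A..D are the four neighbouring source
 * pixels. The sums are kept in 16-bit lanes to avoid overflow, and the row
 * sum computed for the bottom line of one iteration is reused as the top
 * line of the next, so only one new row is loaded per output line.
 */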
/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
}
/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
}
/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
}
/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
}
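
/*
 * The hadamard8_diff functions compute the SATD of the difference between
 * src and dst: each row difference is run through a horizontal 8-point
 * Hadamard butterfly (three vec_perm/vec_mladd stages with +/-1 patterns),
 * the vertical transform is done with explicit vec_add/vec_sub pairs, and
 * the absolute values of the resulting coefficients are summed.
 */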
int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const_vector unsigned char vzero =
        (const_vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    register const_vector signed short vprod1 = (const_vector signed short)
        AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 = (const_vector signed short)
        AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 = (const_vector signed short)
        AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 = (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 = (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 = (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
#define ONEITERBUTTERFLY(i, res) \
    { \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    register vector signed short srcV, dstV; \
    register vector signed short but0, but1, but2, op1, op2, op3; \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 15, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 15, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care for the first 8 */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
                                        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
                                        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);
    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}
/*
  16x8 works with 16 elements; it avoids replicating loads and gives the
  compiler more room for scheduling. It's only used from inside
  hadamard8_diff16_altivec.
  Unfortunately, it seems gcc-3.3 is a bit dumb, and the compiled code has
  a LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
  registers by itself. The following code includes hand-made register
  allocation. It's not clean, but on a 7450 the resulting code is much
  faster (best case drops from 700+ cycles to 550).
  xlc doesn't add spill code, but it doesn't know how to schedule for the
  7450, and its code isn't much faster than gcc-3.3 on the 7450 (but uses
  25% fewer instructions...).
  On the 970, the hand-made RA is still a win (around 690 vs. around 780),
  but xlc goes to around 660 on the regular C code...
*/
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const_vector unsigned char vzero REG_v(v31) =
        (const_vector unsigned char)vec_splat_u8(0);
    {
    register const_vector signed short vprod1 REG_v(v16) =
        (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 REG_v(v17) =
        (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 REG_v(v18) =
        (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 REG_v(v19) =
        (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 REG_v(v20) =
        (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 REG_v(v21) =
        (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);
#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 REG_v(v22), \
                                  src2 REG_v(v23), \
                                  dst1 REG_v(v24), \
                                  dst2 REG_v(v25), \
                                  srcO REG_v(v22), \
                                  dstO REG_v(v23); \
    \
    register vector signed short srcV REG_v(v24), \
                                 dstV REG_v(v25), \
                                 srcW REG_v(v26), \
                                 dstW REG_v(v27), \
                                 but0 REG_v(v28), \
                                 but0S REG_v(v29), \
                                 op1 REG_v(v30), \
                                 but1 REG_v(v22), \
                                 op1S REG_v(v23), \
                                 but1S REG_v(v24), \
                                 op2 REG_v(v25), \
                                 but2 REG_v(v26), \
                                 op2S REG_v(v27), \
                                 but2S REG_v(v28), \
                                 op3 REG_v(v29), \
                                 op3S REG_v(v30); \
    \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
                                        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
                                        (vector signed char)dstO); \
    srcW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
                                        (vector signed char)srcO); \
    dstW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
                                        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    but0S = vec_sub(srcW, dstW); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op1S = vec_perm(but0S, but0S, perm1); \
    but1S = vec_mladd(but0S, vprod1, op1S); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op2S = vec_perm(but1S, but1S, perm2); \
    but2S = vec_mladd(but1S, vprod2, op2S); \
    op3 = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    op3S = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS, line2BS,
                                 line1BS, line3BS, line4BS, line6BS, line5BS,
                                 line7BS, line0CS, line4CS, line1CS, line5CS,
                                 line2CS, line6CS, line3CS, line7CS;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);
    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);

    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);
    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);
    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);
    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h == 16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}
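
/*
 * Runtime AltiVec detection: AmigaOS 4 and Darwin can simply ask the OS
 * (GetCPUInfoTags() / sysctl()); elsewhere we execute an AltiVec
 * instruction and rely on the SIGILL handler installed at the top of this
 * file to tell us when the CPU has no vector unit. The mtspr 256 write
 * targets VRSAVE, which is itself an AltiVec-only register.
 */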
int has_altivec(void)
{
#ifdef __AMIGAOS4__
    ULONG result = 0;
    extern struct ExecIFace *IExec;

    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
    if (result == VECTORTYPE_ALTIVEC) return 1;
    return 0;
#else /* __AMIGAOS4__ */
#ifdef SYS_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* SYS_DARWIN */
    /* not Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;

            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* SYS_DARWIN */
    return 0;
#endif /* __AMIGAOS4__ */
}
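
/*
 * Branchless reconstruction of the Vorbis magnitude/angle coupling:
 * vec_cmple() builds per-lane masks for mag <= 0 and ang <= 0, the sign of
 * ang is conditionally flipped by XORing in the first mask shifted into the
 * sign bit, and the masked angle values are then added to / subtracted from
 * mag, so every lane follows the equivalent of the scalar per-sample
 * branches without any actual branching.
 */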
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)
        vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)
        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
}
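
/*
 * Entry point used by the PowerPC dsputil initialization: it overrides the
 * generic C function pointers in the DSPContext with the AltiVec versions
 * defined above. It is only expected to be called once has_altivec() has
 * reported that the vector unit is present.
 */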
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1] = sse8_altivec;
    c->sse[0] = sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    c->add_bytes = add_bytes_altivec;
    c->put_pixels_tab[0][0] = put_pixels16_altivec;
    /* the two functions do the same thing, so use the same code */
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
    c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
    c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
    c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
#ifdef CONFIG_VORBIS_DECODER
    c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
#endif
}