You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

659 lines
21KB

  1. /*
  2. * Copyright (c) 2002 Brian Foley
  3. * Copyright (c) 2002 Dieter Shirley
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. */
  19. #include "../dsputil.h"
  20. #include "dsputil_altivec.h"
  21. #if CONFIG_DARWIN
  22. #include <sys/sysctl.h>
  23. #endif
/**
 * Sum of absolute differences between a 16x16 block in pix1 and the
 * half-pel horizontally interpolated reference avg(pix2[x], pix2[x+1]).
 * Note vec_avg rounds up, matching the scalar (a + b + 1) >> 1 convention.
 * Returns the total SAD over all 16 rows.
 */
int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste stores a single element; the target must be 16-byte aligned */
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)(0);

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
           (two 16-byte loads + vec_perm with a vec_lvsl permute vector is
           the classic AltiVec unaligned-load idiom)
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector: |a-b| = max(a,b)-min(a,b)
           for unsigned elements */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between a 16x16 block in pix1 and the
 * half-pel vertically interpolated reference avg(pix2[x], pix3[x]),
 * where pix3 is the row below pix2. Returns the total SAD.
 */
int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector (rounds up, like (a + b + 1) >> 1) */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        /* this iteration's bottom row is next iteration's top row */
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of absolute differences between a 16x16 block in pix1 and the
 * half-pel diagonally interpolated reference
 * (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2,
 * where pix3 is the row below pix2. Returns the total SAD.
 */
int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)(0);
    const vector unsigned short two = (const vector unsigned short)(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)(0);
    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    /* zero-extend bytes to 16-bit lanes (high/low halves of each vector) */
    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them: (t1 + t3 + 2) >> 2 in 16-bit lanes */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Plain sum of absolute differences between two 16x16 blocks.
 * Returns the total SAD over all 16 rows.
 */
int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int) (0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           (standard two-load + vec_perm unaligned access idiom) */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector: |a-b| = max-min
           for unsigned elements */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Plain sum of absolute differences between two 8x8 blocks.
 * Loads 16 bytes per row and masks off the upper 8 so they contribute
 * nothing to the sum. Returns the total SAD over all 8 rows.
 */
int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)(0);
    /* keep the first 8 lanes, zero the last 8 */
    permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Sum of squared pixel values of a 16x16 block
 * (i.e. the squared L2 norm of the block).
 */
int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)(0);
    s = 0;

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum
           (vec_msum multiplies pairs and accumulates into 32-bit lanes) */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
  288. /**
  289. * Sum of Squared Errors for a 8x8 block.
  290. * AltiVec-enhanced.
  291. * It's the pix_abs8x8_altivec code above w/ squaring added.
  292. */
/**
 * Sum of Squared Errors for a 8x8 block; structurally identical to
 * pix_abs8x8_altivec with squaring added.
 * The first (context) parameter is unused.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)(0);
    /* keep the first 8 lanes, zero the last 8 */
    permclear = (vector unsigned char)(0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00);

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
  333. /**
  334. * Sum of Squared Errors for a 16x16 block.
  335. * AltiVec-enhanced.
  336. * It's the pix_abs16x16_altivec code above w/ squaring added.
  337. */
/**
 * Sum of Squared Errors for a 16x16 block; structurally identical to
 * pix_abs16x16_altivec with squaring added.
 * The first (context) parameter is unused.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
/**
 * Sum of all 256 pixel values of a 16x16 block.
 */
int pix_sum_altivec(UINT8 * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;
    int i;
    /* vec_ste needs a 16-byte aligned scalar target */
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int) (0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
/**
 * Copy an 8x8 block of unsigned pixels into a DCT block, widening each
 * byte to a signed 16-bit coefficient. The destination block is assumed
 * to be 16-byte aligned (vec_st requires it).
 */
void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char) (0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts (merge with zero = zero-extend
        // the first 8 bytes to 8 shorts)
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
/**
 * Compute the per-pixel difference s1 - s2 of two 8x8 blocks and store
 * the result as signed 16-bit coefficients in block. The loop runs 4
 * times and is manually unrolled x2, covering all 8 rows. The block
 * pointer is assumed 16-byte aligned (vec_st requires it).
 */
void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
        const UINT8 *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char) (0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts (zero-extend first 8 bytes)
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
  476. int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
  477. return pix_abs16x16_altivec(a,b,stride);
  478. }
  479. int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
  480. return pix_abs8x8_altivec(a,b,stride);
  481. }
  482. void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
  483. #if 0
  484. int i;
  485. for(i=0; i+7<w; i++){
  486. dst[i+0] += src[i+0];
  487. dst[i+1] += src[i+1];
  488. dst[i+2] += src[i+2];
  489. dst[i+3] += src[i+3];
  490. dst[i+4] += src[i+4];
  491. dst[i+5] += src[i+5];
  492. dst[i+6] += src[i+6];
  493. dst[i+7] += src[i+7];
  494. }
  495. for(; i<w; i++)
  496. dst[i+0] += src[i+0];
  497. #else
  498. register int i;
  499. register uint8_t *temp_src = src, *temp_dst = dst;
  500. register vector unsigned char vdst, vsrc, temp1, temp2;
  501. register vector unsigned char perm;
  502. register int count = 0;
  503. for (i = 0; (i < w) && ((unsigned long)temp_dst & 0x0000000F) ; i++)
  504. {
  505. dst[i] = src[i];
  506. temp_src ++;
  507. temp_dst ++;
  508. }
  509. /* temp_dst is a properly aligned pointer */
  510. /* we still need to deal with ill-aligned src */
  511. perm = vec_lvsl(0, temp_src);
  512. temp1 = vec_ld(0, temp_src);
  513. while ((i + 15) < w)
  514. {
  515. temp2 = vec_ld(count + 16, temp_src);
  516. vdst = vec_ld(count, temp_dst);
  517. vsrc = vec_perm(temp1, temp2, perm);
  518. temp1 = temp2;
  519. vdst = vec_add(vsrc, vdst);
  520. vec_st(vdst, count, temp_dst);
  521. count += 16;
  522. }
  523. for (; (i < w) ; i++)
  524. {
  525. dst[i] = src[i];
  526. }
  527. #endif
  528. }
  529. int has_altivec(void)
  530. {
  531. #if CONFIG_DARWIN
  532. int sels[2] = {CTL_HW, HW_VECTORUNIT};
  533. int has_vu = 0;
  534. size_t len = sizeof(has_vu);
  535. int err;
  536. err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
  537. if (err == 0) return (has_vu != 0);
  538. #endif
  539. return 0;
  540. }