/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003 Romain Dolbeau <romain@dolbeau.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
 */

#include "../dsputil.h"
#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#include <signal.h>
#include <setjmp.h>

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* CONFIG_DARWIN */
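
/* The SIGILL handler above is only used by has_altivec() at the bottom of
   this file: it lets the runtime AltiVec probe recover when the test
   instruction traps on a CPU without a vector unit. */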

int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}

int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);
    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<16;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */
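
        /* In 16-bit lanes the exact average is simply
           (pix2[x] + pix2[x+1] + pix3[x] + pix3[x+1] + 2) >> 2,
           which is what the adds and shifts below compute. */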

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;

        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

#ifdef CONFIG_DARWIN
    permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
#else
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
#endif

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);
    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the pix_abs8x8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

#ifdef CONFIG_DARWIN
    permclear = (vector unsigned char)(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
#else
    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};
#endif

    for(i=0;i<8;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the pix_abs16x16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<16;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
           Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
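
/* Thin wrappers adapting the pix_abs functions above to the SAD
   comparison-function signature used by the rest of the codec; the
   first (context) argument is unused here. */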
int sad16x16_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    return pix_abs16x16_altivec(a, b, stride);
}

int sad8x8_altivec(void *s, uint8_t *a, uint8_t *b, int stride) {
    return pix_abs8x8_altivec(a, b, stride);
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16 bytes-aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i += 16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = (((const struct unaligned_32 *) (pixels))->l);
        *((uint32_t*)(block+4)) = (((const struct unaligned_32 *) (pixels+4))->l);
        *((uint32_t*)(block+8)) = (((const struct unaligned_32 *) (pixels+8))->l);
        *((uint32_t*)(block+12)) = (((const struct unaligned_32 *) (pixels+12))->l);
        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char perm = vec_lvsl(0, pixels);
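    /* Since line_size is a multiple of 16, the 16-byte alignment of pixels
       never changes from row to row, so the permute vector computed once
       above stays valid for every iteration of the loop below. */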
    int i;

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
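/* op_avg computes a byte-wise rounded-up average of two packed 32-bit words:
   per byte, (a|b) - ((a^b)>>1) == (a + b + 1) >> 1. The 0xFEFEFEFE mask
   clears the bits that would otherwise shift in from the neighbouring byte. */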
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),(((const struct unaligned_32 *)(pixels))->l));
        op_avg(*((uint32_t*)(block+4)),(((const struct unaligned_32 *)(pixels+4))->l));
        op_avg(*((uint32_t*)(block+8)),(((const struct unaligned_32 *)(pixels+8))->l));
        op_avg(*((uint32_t*)(block+12)),(((const struct unaligned_32 *)(pixels+12))->l));
        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->l))) &
               0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels + 4))->l))) &
               0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_TBL_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
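    /* When pixels ends on the last byte of a 16-byte line (offset 15),
       pixels + 1 is 16-byte aligned, so vec_lvsl(1, pixels) would select
       only bytes from the first loaded vector; in that case the second
       load already holds exactly the 16 bytes starting at pixels + 1. */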
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_TBL_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_TBL_START_COUNT(altivec_put_pixels16_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_TBL_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3,
                                   pixelssum3, pixelssum4, temp4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_TBL_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_TBL_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int has_altivec(void)
{
#ifdef CONFIG_DARWIN
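    /* On Darwin, ask the kernel directly whether a vector unit is present
       via the CTL_HW / HW_VECTORUNIT sysctl. */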
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;

    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);

    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* no Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;
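
            /* Execute an AltiVec instruction: "mtspr 256" writes the VRSAVE
               register and "vand" is a vector op. On a CPU without AltiVec
               this raises SIGILL, which the handler above turns into a
               siglongjmp back to the sigsetjmp branch. */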
            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));

            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */

    return 0;
}