You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

535 lines
17KB

  1. /*
  2. * Copyright (c) 2002 Brian Foley
  3. * Copyright (c) 2002 Dieter Shirley
  4. *
  5. * This library is free software; you can redistribute it and/or
  6. * modify it under the terms of the GNU Lesser General Public
  7. * License as published by the Free Software Foundation; either
  8. * version 2 of the License, or (at your option) any later version.
  9. *
  10. * This library is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  13. * Lesser General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU Lesser General Public
  16. * License along with this library; if not, write to the Free Software
  17. * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
  18. */
  19. #include "../dsputil.h"
  20. #include "dsputil_altivec.h"
  21. #if CONFIG_DARWIN
  22. #include <sys/sysctl.h>
  23. #endif
  24. int pix_abs16x16_x2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  25. {
  26. int s, i;
  27. vector unsigned char *tv, zero;
  28. vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  29. vector unsigned int sad;
  30. vector signed int sumdiffs;
  31. s = 0;
  32. zero = vec_splat_u8(0);
  33. sad = vec_splat_u32(0);
  34. for(i=0;i<16;i++) {
  35. /*
  36. Read unaligned pixels into our vectors. The vectors are as follows:
  37. pix1v: pix1[0]-pix1[15]
  38. pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
  39. */
  40. tv = (vector unsigned char *) pix1;
  41. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  42. tv = (vector unsigned char *) &pix2[0];
  43. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  44. tv = (vector unsigned char *) &pix2[1];
  45. pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  46. /* Calculate the average vector */
  47. avgv = vec_avg(pix2v, pix2iv);
  48. /* Calculate a sum of abs differences vector */
  49. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  50. /* Add each 4 pixel group together and put 4 results into sad */
  51. sad = vec_sum4s(t5, sad);
  52. pix1 += line_size;
  53. pix2 += line_size;
  54. }
  55. /* Sum up the four partial sums, and put the result into s */
  56. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  57. sumdiffs = vec_splat(sumdiffs, 3);
  58. vec_ste(sumdiffs, 0, &s);
  59. return s;
  60. }
  61. int pix_abs16x16_y2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  62. {
  63. int s, i;
  64. vector unsigned char *tv, zero;
  65. vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
  66. vector unsigned int sad;
  67. vector signed int sumdiffs;
  68. uint8_t *pix3 = pix2 + line_size;
  69. s = 0;
  70. zero = vec_splat_u8(0);
  71. sad = vec_splat_u32(0);
  72. /*
  73. Due to the fact that pix3 = pix2 + line_size, the pix3 of one
  74. iteration becomes pix2 in the next iteration. We can use this
  75. fact to avoid a potentially expensive unaligned read, each
  76. time around the loop.
  77. Read unaligned pixels into our vectors. The vectors are as follows:
  78. pix2v: pix2[0]-pix2[15]
  79. Split the pixel vectors into shorts
  80. */
  81. tv = (vector unsigned char *) &pix2[0];
  82. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  83. for(i=0;i<16;i++) {
  84. /*
  85. Read unaligned pixels into our vectors. The vectors are as follows:
  86. pix1v: pix1[0]-pix1[15]
  87. pix3v: pix3[0]-pix3[15]
  88. */
  89. tv = (vector unsigned char *) pix1;
  90. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  91. tv = (vector unsigned char *) &pix3[0];
  92. pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
  93. /* Calculate the average vector */
  94. avgv = vec_avg(pix2v, pix3v);
  95. /* Calculate a sum of abs differences vector */
  96. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  97. /* Add each 4 pixel group together and put 4 results into sad */
  98. sad = vec_sum4s(t5, sad);
  99. pix1 += line_size;
  100. pix2v = pix3v;
  101. pix3 += line_size;
  102. }
  103. /* Sum up the four partial sums, and put the result into s */
  104. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  105. sumdiffs = vec_splat(sumdiffs, 3);
  106. vec_ste(sumdiffs, 0, &s);
  107. return s;
  108. }
  109. int pix_abs16x16_xy2_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  110. {
  111. int s, i;
  112. uint8_t *pix3 = pix2 + line_size;
  113. vector unsigned char *tv, avgv, t5, zero;
  114. vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
  115. vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
  116. vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
  117. vector unsigned short avghv, avglv, two;
  118. vector unsigned short t1, t2, t3, t4;
  119. vector unsigned int sad;
  120. vector signed int sumdiffs;
  121. zero = vec_splat_u8(0);
  122. two = vec_splat_u16(2);
  123. sad = vec_splat_u32(0);
  124. s = 0;
  125. /*
  126. Due to the fact that pix3 = pix2 + line_size, the pix3 of one
  127. iteration becomes pix2 in the next iteration. We can use this
  128. fact to avoid a potentially expensive unaligned read, as well
  129. as some splitting, and vector addition each time around the loop.
  130. Read unaligned pixels into our vectors. The vectors are as follows:
  131. pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
  132. Split the pixel vectors into shorts
  133. */
  134. tv = (vector unsigned char *) &pix2[0];
  135. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  136. tv = (vector unsigned char *) &pix2[1];
  137. pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  138. pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
  139. pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
  140. pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
  141. pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
  142. t1 = vec_add(pix2hv, pix2ihv);
  143. t2 = vec_add(pix2lv, pix2ilv);
  144. for(i=0;i<16;i++) {
  145. /*
  146. Read unaligned pixels into our vectors. The vectors are as follows:
  147. pix1v: pix1[0]-pix1[15]
  148. pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
  149. */
  150. tv = (vector unsigned char *) pix1;
  151. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  152. tv = (vector unsigned char *) &pix3[0];
  153. pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
  154. tv = (vector unsigned char *) &pix3[1];
  155. pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
  156. /*
  157. Note that Altivec does have vec_avg, but this works on vector pairs
  158. and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
  159. would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
  160. Instead, we have to split the pixel vectors into vectors of shorts,
  161. and do the averaging by hand.
  162. */
  163. /* Split the pixel vectors into shorts */
  164. pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
  165. pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
  166. pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
  167. pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
  168. /* Do the averaging on them */
  169. t3 = vec_add(pix3hv, pix3ihv);
  170. t4 = vec_add(pix3lv, pix3ilv);
  171. avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
  172. avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
  173. /* Pack the shorts back into a result */
  174. avgv = vec_pack(avghv, avglv);
  175. /* Calculate a sum of abs differences vector */
  176. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  177. /* Add each 4 pixel group together and put 4 results into sad */
  178. sad = vec_sum4s(t5, sad);
  179. pix1 += line_size;
  180. pix3 += line_size;
  181. /* Transfer the calculated values for pix3 into pix2 */
  182. t1 = t3;
  183. t2 = t4;
  184. }
  185. /* Sum up the four partial sums, and put the result into s */
  186. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  187. sumdiffs = vec_splat(sumdiffs, 3);
  188. vec_ste(sumdiffs, 0, &s);
  189. return s;
  190. }
  191. int pix_abs16x16_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  192. {
  193. int i, s;
  194. vector unsigned char perm1, perm2, *pix1v, *pix2v;
  195. vector unsigned char t1, t2, t3,t4, t5;
  196. vector unsigned int sad, zero;
  197. vector signed int sumdiffs;
  198. zero = (vector unsigned int) (0);
  199. sad = (vector unsigned int) (0);
  200. for(i=0;i<16;i++) {
  201. /* Read potentially unaligned pixels into t1 and t2 */
  202. perm1 = vec_lvsl(0, pix1);
  203. pix1v = (vector unsigned char *) pix1;
  204. perm2 = vec_lvsl(0, pix2);
  205. pix2v = (vector unsigned char *) pix2;
  206. t1 = vec_perm(pix1v[0], pix1v[1], perm1);
  207. t2 = vec_perm(pix2v[0], pix2v[1], perm2);
  208. /* Calculate a sum of abs differences vector */
  209. t3 = vec_max(t1, t2);
  210. t4 = vec_min(t1, t2);
  211. t5 = vec_sub(t3, t4);
  212. /* Add each 4 pixel group together and put 4 results into sad */
  213. sad = vec_sum4s(t5, sad);
  214. pix1 += line_size;
  215. pix2 += line_size;
  216. }
  217. /* Sum up the four partial sums, and put the result into s */
  218. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  219. sumdiffs = vec_splat(sumdiffs, 3);
  220. vec_ste(sumdiffs, 0, &s);
  221. return s;
  222. }
  223. int pix_abs8x8_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  224. {
  225. int i, s;
  226. vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
  227. vector unsigned char t1, t2, t3,t4, t5;
  228. vector unsigned int sad, zero;
  229. vector signed int sumdiffs;
  230. zero = (vector unsigned int) (0);
  231. sad = (vector unsigned int) (0);
  232. permclear = (vector unsigned char) (255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
  233. for(i=0;i<8;i++) {
  234. /* Read potentially unaligned pixels into t1 and t2
  235. Since we're reading 16 pixels, and actually only want 8,
  236. mask out the last 8 pixels. The 0s don't change the sum. */
  237. perm1 = vec_lvsl(0, pix1);
  238. pix1v = (vector unsigned char *) pix1;
  239. perm2 = vec_lvsl(0, pix2);
  240. pix2v = (vector unsigned char *) pix2;
  241. t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
  242. t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
  243. /* Calculate a sum of abs differences vector */
  244. t3 = vec_max(t1, t2);
  245. t4 = vec_min(t1, t2);
  246. t5 = vec_sub(t3, t4);
  247. /* Add each 4 pixel group together and put 4 results into sad */
  248. sad = vec_sum4s(t5, sad);
  249. pix1 += line_size;
  250. pix2 += line_size;
  251. }
  252. /* Sum up the four partial sums, and put the result into s */
  253. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  254. sumdiffs = vec_splat(sumdiffs, 3);
  255. vec_ste(sumdiffs, 0, &s);
  256. return s;
  257. }
  258. int pix_norm1_altivec(uint8_t *pix, int line_size)
  259. {
  260. int s, i;
  261. vector unsigned char *tv, zero;
  262. vector unsigned char pixv;
  263. vector unsigned int sv;
  264. vector signed int sum;
  265. zero = vec_splat_u8(0);
  266. sv = vec_splat_u32(0);
  267. s = 0;
  268. for (i = 0; i < 16; i++) {
  269. /* Read in the potentially unaligned pixels */
  270. tv = (vector unsigned char *) pix;
  271. pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
  272. /* Square the values, and add them to our sum */
  273. sv = vec_msum(pixv, pixv, sv);
  274. pix += line_size;
  275. }
  276. /* Sum up the four partial sums, and put the result into s */
  277. sum = vec_sums((vector signed int) sv, (vector signed int) zero);
  278. sum = vec_splat(sum, 3);
  279. vec_ste(sum, 0, &s);
  280. return s;
  281. }
  282. int pix_norm_altivec(uint8_t *pix1, uint8_t *pix2, int line_size)
  283. {
  284. int s, i;
  285. vector unsigned char *tv, zero;
  286. vector unsigned char pix1v, pix2v, t5;
  287. vector unsigned int sv;
  288. vector signed int sum;
  289. zero = vec_splat_u8(0);
  290. sv = vec_splat_u32(0);
  291. s = 0;
  292. for (i = 0; i < 16; i++) {
  293. /* Read in the potentially unaligned pixels */
  294. tv = (vector unsigned char *) pix1;
  295. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  296. tv = (vector unsigned char *) pix2;
  297. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix2));
  298. /*
  299. Since we want to use unsigned chars, we can take advantage
  300. of the fact that abs(a-b)^2 = (a-b)^2.
  301. */
  302. /* Calculate a sum of abs differences vector */
  303. t5 = vec_sub(vec_max(pix1v, pix2v), vec_min(pix1v, pix2v));
  304. /* Square the values and add them to our sum */
  305. sv = vec_msum(t5, t5, sv);
  306. pix1 += line_size;
  307. pix2 += line_size;
  308. }
  309. /* Sum up the four partial sums, and put the result into s */
  310. sum = vec_sums((vector signed int) sv, (vector signed int) zero);
  311. sum = vec_splat(sum, 3);
  312. vec_ste(sum, 0, &s);
  313. return s;
  314. }
  315. int pix_sum_altivec(UINT8 * pix, int line_size)
  316. {
  317. vector unsigned char perm, *pixv;
  318. vector unsigned char t1;
  319. vector unsigned int sad, zero;
  320. vector signed int sumdiffs;
  321. int s, i;
  322. zero = (vector unsigned int) (0);
  323. sad = (vector unsigned int) (0);
  324. for (i = 0; i < 16; i++) {
  325. /* Read the potentially unaligned 16 pixels into t1 */
  326. perm = vec_lvsl(0, pix);
  327. pixv = (vector unsigned char *) pix;
  328. t1 = vec_perm(pixv[0], pixv[1], perm);
  329. /* Add each 4 pixel group together and put 4 results into sad */
  330. sad = vec_sum4s(t1, sad);
  331. pix += line_size;
  332. }
  333. /* Sum up the four partial sums, and put the result into s */
  334. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  335. sumdiffs = vec_splat(sumdiffs, 3);
  336. vec_ste(sumdiffs, 0, &s);
  337. return s;
  338. }
  339. void get_pixels_altivec(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
  340. {
  341. int i;
  342. vector unsigned char perm, bytes, *pixv;
  343. vector unsigned char zero = (vector unsigned char) (0);
  344. vector signed short shorts;
  345. for(i=0;i<8;i++)
  346. {
  347. // Read potentially unaligned pixels.
  348. // We're reading 16 pixels, and actually only want 8,
  349. // but we simply ignore the extras.
  350. perm = vec_lvsl(0, pixels);
  351. pixv = (vector unsigned char *) pixels;
  352. bytes = vec_perm(pixv[0], pixv[1], perm);
  353. // convert the bytes into shorts
  354. shorts = (vector signed short)vec_mergeh(zero, bytes);
  355. // save the data to the block, we assume the block is 16-byte aligned
  356. vec_st(shorts, i*16, (vector signed short*)block);
  357. pixels += line_size;
  358. }
  359. }
  360. void diff_pixels_altivec(DCTELEM *restrict block, const UINT8 *s1,
  361. const UINT8 *s2, int stride)
  362. {
  363. int i;
  364. vector unsigned char perm, bytes, *pixv;
  365. vector unsigned char zero = (vector unsigned char) (0);
  366. vector signed short shorts1, shorts2;
  367. for(i=0;i<4;i++)
  368. {
  369. // Read potentially unaligned pixels
  370. // We're reading 16 pixels, and actually only want 8,
  371. // but we simply ignore the extras.
  372. perm = vec_lvsl(0, s1);
  373. pixv = (vector unsigned char *) s1;
  374. bytes = vec_perm(pixv[0], pixv[1], perm);
  375. // convert the bytes into shorts
  376. shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  377. // Do the same for the second block of pixels
  378. perm = vec_lvsl(0, s2);
  379. pixv = (vector unsigned char *) s2;
  380. bytes = vec_perm(pixv[0], pixv[1], perm);
  381. // convert the bytes into shorts
  382. shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  383. // Do the subtraction
  384. shorts1 = vec_sub(shorts1, shorts2);
  385. // save the data to the block, we assume the block is 16-byte aligned
  386. vec_st(shorts1, 0, (vector signed short*)block);
  387. s1 += stride;
  388. s2 += stride;
  389. block += 8;
  390. // The code below is a copy of the code above... This is a manual
  391. // unroll.
  392. // Read potentially unaligned pixels
  393. // We're reading 16 pixels, and actually only want 8,
  394. // but we simply ignore the extras.
  395. perm = vec_lvsl(0, s1);
  396. pixv = (vector unsigned char *) s1;
  397. bytes = vec_perm(pixv[0], pixv[1], perm);
  398. // convert the bytes into shorts
  399. shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  400. // Do the same for the second block of pixels
  401. perm = vec_lvsl(0, s2);
  402. pixv = (vector unsigned char *) s2;
  403. bytes = vec_perm(pixv[0], pixv[1], perm);
  404. // convert the bytes into shorts
  405. shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  406. // Do the subtraction
  407. shorts1 = vec_sub(shorts1, shorts2);
  408. // save the data to the block, we assume the block is 16-byte aligned
  409. vec_st(shorts1, 0, (vector signed short*)block);
  410. s1 += stride;
  411. s2 += stride;
  412. block += 8;
  413. }
  414. }
  415. int has_altivec(void)
  416. {
  417. #if CONFIG_DARWIN
  418. int sels[2] = {CTL_HW, HW_VECTORUNIT};
  419. int has_vu = 0;
  420. size_t len = sizeof(has_vu);
  421. int err;
  422. err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
  423. if (err == 0) return (has_vu != 0);
  424. #endif
  425. return 0;
  426. }