  1. /*
  2. * Copyright (c) 2002 Brian Foley
  3. * Copyright (c) 2002 Dieter Shirley
  4. * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  5. *
  6. * This library is free software; you can redistribute it and/or
  7. * modify it under the terms of the GNU Lesser General Public
  8. * License as published by the Free Software Foundation; either
  9. * version 2 of the License, or (at your option) any later version.
  10. *
  11. * This library is distributed in the hope that it will be useful,
  12. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  14. * Lesser General Public License for more details.
  15. *
  16. * You should have received a copy of the GNU Lesser General Public
  17. * License along with this library; if not, write to the Free Software
  18. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19. */
  20. #include "../dsputil.h"
  21. #include "gcc_fixes.h"
  22. #include "dsputil_altivec.h"
  23. #ifdef CONFIG_DARWIN
  24. #include <sys/sysctl.h>
  25. #else /* CONFIG_DARWIN */
  26. #ifdef __AMIGAOS4__
  27. #include <exec/exec.h>
  28. #include <interfaces/exec.h>
  29. #include <proto/exec.h>
  30. #else /* __AMIGAOS4__ */
  31. #include <signal.h>
  32. #include <setjmp.h>
  33. static sigjmp_buf jmpbuf;
  34. static volatile sig_atomic_t canjump = 0;
  35. static void sigill_handler (int sig)
  36. {
  37. if (!canjump) {
  38. signal (sig, SIG_DFL);
  39. raise (sig);
  40. }
  41. canjump = 0;
  42. siglongjmp (jmpbuf, 1);
  43. }
  44. #endif /* __AMIGAOS4__ */
  45. #endif /* CONFIG_DARWIN */
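/*
 * Editorial sketch (not part of the original file): how a SIGILL handler like
 * the one above is typically combined with sigsetjmp() to probe for AltiVec at
 * run time on generic POSIX systems. The function name and the use of vec_or()
 * as the probe instruction are illustrative assumptions; a real implementation
 * (such as the detection code elsewhere in this file, not shown here) usually
 * uses inline asm so the probe instruction cannot be optimized away.
 */
#if !defined(CONFIG_DARWIN) && !defined(__AMIGAOS4__)
static int probe_altivec_sigill(void)
{
    void (*old_handler)(int) = signal(SIGILL, sigill_handler);
    if (sigsetjmp(jmpbuf, 1)) {
        /* we got here via siglongjmp: the vector instruction trapped */
        signal(SIGILL, old_handler);
        return 0;
    }
    canjump = 1;
    {
        /* execute one harmless AltiVec instruction; faults if unsupported */
        register vector unsigned char v = vec_splat_u8(0);
        v = vec_or(v, v);
    }
    canjump = 0;
    signal(SIGILL, old_handler);
    return 1;
}
#endif /* !CONFIG_DARWIN && !__AMIGAOS4__ */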
  46. int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  47. {
  48. int i;
  49. int s __attribute__((aligned(16)));
  50. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  51. vector unsigned char *tv;
  52. vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
  53. vector unsigned int sad;
  54. vector signed int sumdiffs;
  55. s = 0;
  56. sad = (vector unsigned int)vec_splat_u32(0);
  57. for(i=0;i<h;i++) {
  58. /*
  59. Read unaligned pixels into our vectors. The vectors are as follows:
  60. pix1v: pix1[0]-pix1[15]
  61. pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
  62. */
  63. tv = (vector unsigned char *) pix1;
  64. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  65. tv = (vector unsigned char *) &pix2[0];
  66. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  67. tv = (vector unsigned char *) &pix2[1];
  68. pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  69. /* Calculate the average vector */
  70. avgv = vec_avg(pix2v, pix2iv);
  71. /* Calculate a sum of abs differences vector */
  72. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  73. /* Add each 4 pixel group together and put 4 results into sad */
  74. sad = vec_sum4s(t5, sad);
  75. pix1 += line_size;
  76. pix2 += line_size;
  77. }
  78. /* Sum up the four partial sums, and put the result into s */
  79. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  80. sumdiffs = vec_splat(sumdiffs, 3);
  81. vec_ste(sumdiffs, 0, &s);
  82. return s;
  83. }
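/*
 * Editorial sketch (not part of the original file): a plain-C reference for
 * what sad16_x2_altivec computes: the SAD of a 16-wide block against a
 * horizontally half-pel interpolated reference, where each reference sample
 * is the rounded-up average of pix2[x] and pix2[x+1] (matching vec_avg).
 * The function name is a hypothetical addition.
 */
static int sad16_x2_scalar_ref(uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, x, y;
    for (y = 0; y < h; y++) {
        for (x = 0; x < 16; x++) {
            int avg = (pix2[x] + pix2[x + 1] + 1) >> 1; /* rounds up, like vec_avg */
            int d   = pix1[x] - avg;
            s += d < 0 ? -d : d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}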
  84. int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  85. {
  86. int i;
  87. int s __attribute__((aligned(16)));
  88. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  89. vector unsigned char *tv;
  90. vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
  91. vector unsigned int sad;
  92. vector signed int sumdiffs;
  93. uint8_t *pix3 = pix2 + line_size;
  94. s = 0;
  95. sad = (vector unsigned int)vec_splat_u32(0);
  96. /*
  97. Because pix3 = pix2 + line_size, the pix3 of one iteration
  98. becomes pix2 in the next iteration. We can use this fact to
  99. avoid a potentially expensive unaligned read each time
  100. around the loop.
  101. Read unaligned pixels into our vectors. The vectors are as follows:
  102. pix2v: pix2[0]-pix2[15]
  103. (unlike the xy2 case below, no splitting into shorts is needed here)
  104. */
  105. tv = (vector unsigned char *) &pix2[0];
  106. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  107. for(i=0;i<h;i++) {
  108. /*
  109. Read unaligned pixels into our vectors. The vectors are as follows:
  110. pix1v: pix1[0]-pix1[15]
  111. pix3v: pix3[0]-pix3[15]
  112. */
  113. tv = (vector unsigned char *) pix1;
  114. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  115. tv = (vector unsigned char *) &pix3[0];
  116. pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
  117. /* Calculate the average vector */
  118. avgv = vec_avg(pix2v, pix3v);
  119. /* Calculate a sum of abs differences vector */
  120. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  121. /* Add each 4 pixel group together and put 4 results into sad */
  122. sad = vec_sum4s(t5, sad);
  123. pix1 += line_size;
  124. pix2v = pix3v;
  125. pix3 += line_size;
  126. }
  127. /* Sum up the four partial sums, and put the result into s */
  128. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  129. sumdiffs = vec_splat(sumdiffs, 3);
  130. vec_ste(sumdiffs, 0, &s);
  131. return s;
  132. }
  133. int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  134. {
  135. int i;
  136. int s __attribute__((aligned(16)));
  137. uint8_t *pix3 = pix2 + line_size;
  138. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  139. const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
  140. vector unsigned char *tv, avgv, t5;
  141. vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
  142. vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
  143. vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
  144. vector unsigned short avghv, avglv;
  145. vector unsigned short t1, t2, t3, t4;
  146. vector unsigned int sad;
  147. vector signed int sumdiffs;
  148. sad = (vector unsigned int)vec_splat_u32(0);
  149. s = 0;
  150. /*
  151. Due to the fact that pix3 = pix2 + line_size, the pix3 of one
  152. iteration becomes pix2 in the next iteration. We can use this
  153. fact to avoid a potentially expensive unaligned read, as well
  154. as some splitting, and vector addition each time around the loop.
  155. Read unaligned pixels into our vectors. The vectors are as follows:
  156. pix2v: pix2[0]-pix2[15] pix2iv: pix2[1]-pix2[16]
  157. Split the pixel vectors into shorts
  158. */
  159. tv = (vector unsigned char *) &pix2[0];
  160. pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));
  161. tv = (vector unsigned char *) &pix2[1];
  162. pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));
  163. pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
  164. pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
  165. pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
  166. pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
  167. t1 = vec_add(pix2hv, pix2ihv);
  168. t2 = vec_add(pix2lv, pix2ilv);
  169. for(i=0;i<h;i++) {
  170. /*
  171. Read unaligned pixels into our vectors. The vectors are as follows:
  172. pix1v: pix1[0]-pix1[15]
  173. pix3v: pix3[0]-pix3[15] pix3iv: pix3[1]-pix3[16]
  174. */
  175. tv = (vector unsigned char *) pix1;
  176. pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));
  177. tv = (vector unsigned char *) &pix3[0];
  178. pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));
  179. tv = (vector unsigned char *) &pix3[1];
  180. pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));
  181. /*
  182. Note that AltiVec does have vec_avg, but it works element-wise on
  183. pairs of vectors and always rounds up. We could do avg(avg(a,b),
  184. avg(c,d)), but the repeated rounding would mean that, for example,
  185. avg(3,0,0,1) = 2 when it should be 1. Instead, we have to split the
  186. pixel vectors into vectors of shorts and do the averaging by hand.
  187. */
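/*
 * Editorial worked example of the rounding issue: for neighbours 3, 0, 0, 1
 * the correctly rounded average is (3 + 0 + 0 + 1 + 2) >> 2 = 1, but chained
 * vec_avg rounds up at every step: avg(3,0) = 2, avg(0,1) = 1, avg(2,1) = 2.
 * Working in 16-bit lanes and adding the bias "two" once avoids that.
 */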
  188. /* Split the pixel vectors into shorts */
  189. pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
  190. pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
  191. pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
  192. pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);
  193. /* Do the averaging on them */
  194. t3 = vec_add(pix3hv, pix3ihv);
  195. t4 = vec_add(pix3lv, pix3ilv);
  196. avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
  197. avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);
  198. /* Pack the shorts back into a result */
  199. avgv = vec_pack(avghv, avglv);
  200. /* Calculate a sum of abs differences vector */
  201. t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));
  202. /* Add each 4 pixel group together and put 4 results into sad */
  203. sad = vec_sum4s(t5, sad);
  204. pix1 += line_size;
  205. pix3 += line_size;
  206. /* Transfer the calculated values for pix3 into pix2 */
  207. t1 = t3;
  208. t2 = t4;
  209. }
  210. /* Sum up the four partial sums, and put the result into s */
  211. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  212. sumdiffs = vec_splat(sumdiffs, 3);
  213. vec_ste(sumdiffs, 0, &s);
  214. return s;
  215. }
  216. int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  217. {
  218. int i;
  219. int s __attribute__((aligned(16)));
  220. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  221. vector unsigned char perm1, perm2, *pix1v, *pix2v;
  222. vector unsigned char t1, t2, t3,t4, t5;
  223. vector unsigned int sad;
  224. vector signed int sumdiffs;
  225. sad = (vector unsigned int)vec_splat_u32(0);
  226. for(i=0;i<h;i++) {
  227. /* Read potentially unaligned pixels into t1 and t2 */
  228. perm1 = vec_lvsl(0, pix1);
  229. pix1v = (vector unsigned char *) pix1;
  230. perm2 = vec_lvsl(0, pix2);
  231. pix2v = (vector unsigned char *) pix2;
  232. t1 = vec_perm(pix1v[0], pix1v[1], perm1);
  233. t2 = vec_perm(pix2v[0], pix2v[1], perm2);
  234. /* Calculate a sum of abs differences vector */
  235. t3 = vec_max(t1, t2);
  236. t4 = vec_min(t1, t2);
  237. t5 = vec_sub(t3, t4);
  238. /* Add each 4 pixel group together and put 4 results into sad */
  239. sad = vec_sum4s(t5, sad);
  240. pix1 += line_size;
  241. pix2 += line_size;
  242. }
  243. /* Sum up the four partial sums, and put the result into s */
  244. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  245. sumdiffs = vec_splat(sumdiffs, 3);
  246. vec_ste(sumdiffs, 0, &s);
  247. return s;
  248. }
  249. int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  250. {
  251. int i;
  252. int s __attribute__((aligned(16)));
  253. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  254. vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
  255. vector unsigned char t1, t2, t3,t4, t5;
  256. vector unsigned int sad;
  257. vector signed int sumdiffs;
  258. sad = (vector unsigned int)vec_splat_u32(0);
  259. permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
  260. for(i=0;i<h;i++) {
  261. /* Read potentially unaligned pixels into t1 and t2
  262. Since we're reading 16 pixels, and actually only want 8,
  263. mask out the last 8 pixels. The 0s don't change the sum. */
  264. perm1 = vec_lvsl(0, pix1);
  265. pix1v = (vector unsigned char *) pix1;
  266. perm2 = vec_lvsl(0, pix2);
  267. pix2v = (vector unsigned char *) pix2;
  268. t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
  269. t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
  270. /* Calculate a sum of abs differences vector */
  271. t3 = vec_max(t1, t2);
  272. t4 = vec_min(t1, t2);
  273. t5 = vec_sub(t3, t4);
  274. /* Add each 4 pixel group together and put 4 results into sad */
  275. sad = vec_sum4s(t5, sad);
  276. pix1 += line_size;
  277. pix2 += line_size;
  278. }
  279. /* Sum up the four partial sums, and put the result into s */
  280. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  281. sumdiffs = vec_splat(sumdiffs, 3);
  282. vec_ste(sumdiffs, 0, &s);
  283. return s;
  284. }
  285. int pix_norm1_altivec(uint8_t *pix, int line_size)
  286. {
  287. int i;
  288. int s __attribute__((aligned(16)));
  289. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  290. vector unsigned char *tv;
  291. vector unsigned char pixv;
  292. vector unsigned int sv;
  293. vector signed int sum;
  294. sv = (vector unsigned int)vec_splat_u32(0);
  295. s = 0;
  296. for (i = 0; i < 16; i++) {
  297. /* Read in the potentially unaligned pixels */
  298. tv = (vector unsigned char *) pix;
  299. pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));
  300. /* Square the values, and add them to our sum */
  301. sv = vec_msum(pixv, pixv, sv);
  302. pix += line_size;
  303. }
  304. /* Sum up the four partial sums, and put the result into s */
  305. sum = vec_sums((vector signed int) sv, (vector signed int) zero);
  306. sum = vec_splat(sum, 3);
  307. vec_ste(sum, 0, &s);
  308. return s;
  309. }
  310. /**
  311. * Sum of Squared Errors for a 8x8 block.
  312. * AltiVec-enhanced.
  313. * It's the sad8_altivec code above w/ squaring added.
  314. */
  315. int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  316. {
  317. int i;
  318. int s __attribute__((aligned(16)));
  319. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  320. vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
  321. vector unsigned char t1, t2, t3,t4, t5;
  322. vector unsigned int sum;
  323. vector signed int sumsqr;
  324. sum = (vector unsigned int)vec_splat_u32(0);
  325. permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);
  326. for(i=0;i<h;i++) {
  327. /* Read potentially unaligned pixels into t1 and t2
  328. Since we're reading 16 pixels, and actually only want 8,
  329. mask out the last 8 pixels. The 0s don't change the sum. */
  330. perm1 = vec_lvsl(0, pix1);
  331. pix1v = (vector unsigned char *) pix1;
  332. perm2 = vec_lvsl(0, pix2);
  333. pix2v = (vector unsigned char *) pix2;
  334. t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
  335. t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);
  336. /*
  337. Since we want to use unsigned chars, we can take advantage
  338. of the fact that abs(a-b)^2 = (a-b)^2.
  339. */
  340. /* Calculate abs differences vector */
  341. t3 = vec_max(t1, t2);
  342. t4 = vec_min(t1, t2);
  343. t5 = vec_sub(t3, t4);
  344. /* Square the values and add them to our sum */
  345. sum = vec_msum(t5, t5, sum);
  346. pix1 += line_size;
  347. pix2 += line_size;
  348. }
  349. /* Sum up the four partial sums, and put the result into s */
  350. sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
  351. sumsqr = vec_splat(sumsqr, 3);
  352. vec_ste(sumsqr, 0, &s);
  353. return s;
  354. }
  355. /**
  356. * Sum of Squared Errors for a 16x16 block.
  357. * AltiVec-enhanced.
  358. * It's the sad16_altivec code above w/ squaring added.
  359. */
  360. int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
  361. {
  362. int i;
  363. int s __attribute__((aligned(16)));
  364. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  365. vector unsigned char perm1, perm2, *pix1v, *pix2v;
  366. vector unsigned char t1, t2, t3,t4, t5;
  367. vector unsigned int sum;
  368. vector signed int sumsqr;
  369. sum = (vector unsigned int)vec_splat_u32(0);
  370. for(i=0;i<h;i++) {
  371. /* Read potentially unaligned pixels into t1 and t2 */
  372. perm1 = vec_lvsl(0, pix1);
  373. pix1v = (vector unsigned char *) pix1;
  374. perm2 = vec_lvsl(0, pix2);
  375. pix2v = (vector unsigned char *) pix2;
  376. t1 = vec_perm(pix1v[0], pix1v[1], perm1);
  377. t2 = vec_perm(pix2v[0], pix2v[1], perm2);
  378. /*
  379. Since we want to use unsigned chars, we can take advantage
  380. of the fact that abs(a-b)^2 = (a-b)^2.
  381. */
  382. /* Calculate abs differences vector */
  383. t3 = vec_max(t1, t2);
  384. t4 = vec_min(t1, t2);
  385. t5 = vec_sub(t3, t4);
  386. /* Square the values and add them to our sum */
  387. sum = vec_msum(t5, t5, sum);
  388. pix1 += line_size;
  389. pix2 += line_size;
  390. }
  391. /* Sum up the four partial sums, and put the result into s */
  392. sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
  393. sumsqr = vec_splat(sumsqr, 3);
  394. vec_ste(sumsqr, 0, &s);
  395. return s;
  396. }
  397. int pix_sum_altivec(uint8_t * pix, int line_size)
  398. {
  399. const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
  400. vector unsigned char perm, *pixv;
  401. vector unsigned char t1;
  402. vector unsigned int sad;
  403. vector signed int sumdiffs;
  404. int i;
  405. int s __attribute__((aligned(16)));
  406. sad = (vector unsigned int)vec_splat_u32(0);
  407. for (i = 0; i < 16; i++) {
  408. /* Read the potentially unaligned 16 pixels into t1 */
  409. perm = vec_lvsl(0, pix);
  410. pixv = (vector unsigned char *) pix;
  411. t1 = vec_perm(pixv[0], pixv[1], perm);
  412. /* Add each 4 pixel group together and put 4 results into sad */
  413. sad = vec_sum4s(t1, sad);
  414. pix += line_size;
  415. }
  416. /* Sum up the four partial sums, and put the result into s */
  417. sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
  418. sumdiffs = vec_splat(sumdiffs, 3);
  419. vec_ste(sumdiffs, 0, &s);
  420. return s;
  421. }
  422. void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
  423. {
  424. int i;
  425. vector unsigned char perm, bytes, *pixv;
  426. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  427. vector signed short shorts;
  428. for(i=0;i<8;i++)
  429. {
  430. // Read potentially unaligned pixels.
  431. // We're reading 16 pixels, and actually only want 8,
  432. // but we simply ignore the extras.
  433. perm = vec_lvsl(0, pixels);
  434. pixv = (vector unsigned char *) pixels;
  435. bytes = vec_perm(pixv[0], pixv[1], perm);
  436. // convert the bytes into shorts
  437. shorts = (vector signed short)vec_mergeh(zero, bytes);
  438. // save the data to the block, we assume the block is 16-byte aligned
  439. vec_st(shorts, i*16, (vector signed short*)block);
  440. pixels += line_size;
  441. }
  442. }
  443. void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
  444. const uint8_t *s2, int stride)
  445. {
  446. int i;
  447. vector unsigned char perm, bytes, *pixv;
  448. const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
  449. vector signed short shorts1, shorts2;
  450. for(i=0;i<4;i++)
  451. {
  452. // Read potentially unaligned pixels
  453. // We're reading 16 pixels, and actually only want 8,
  454. // but we simply ignore the extras.
  455. perm = vec_lvsl(0, s1);
  456. pixv = (vector unsigned char *) s1;
  457. bytes = vec_perm(pixv[0], pixv[1], perm);
  458. // convert the bytes into shorts
  459. shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  460. // Do the same for the second block of pixels
  461. perm = vec_lvsl(0, s2);
  462. pixv = (vector unsigned char *) s2;
  463. bytes = vec_perm(pixv[0], pixv[1], perm);
  464. // convert the bytes into shorts
  465. shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  466. // Do the subtraction
  467. shorts1 = vec_sub(shorts1, shorts2);
  468. // save the data to the block, we assume the block is 16-byte aligned
  469. vec_st(shorts1, 0, (vector signed short*)block);
  470. s1 += stride;
  471. s2 += stride;
  472. block += 8;
  473. // The code below is a copy of the code above... This is a manual
  474. // unroll.
  475. // Read potentially unaligned pixels
  476. // We're reading 16 pixels, and actually only want 8,
  477. // but we simply ignore the extras.
  478. perm = vec_lvsl(0, s1);
  479. pixv = (vector unsigned char *) s1;
  480. bytes = vec_perm(pixv[0], pixv[1], perm);
  481. // convert the bytes into shorts
  482. shorts1 = (vector signed short)vec_mergeh(zero, bytes);
  483. // Do the same for the second block of pixels
  484. perm = vec_lvsl(0, s2);
  485. pixv = (vector unsigned char *) s2;
  486. bytes = vec_perm(pixv[0], pixv[1], perm);
  487. // convert the bytes into shorts
  488. shorts2 = (vector signed short)vec_mergeh(zero, bytes);
  489. // Do the subtraction
  490. shorts1 = vec_sub(shorts1, shorts2);
  491. // save the data to the block, we assume the block is 16-byte aligned
  492. vec_st(shorts1, 0, (vector signed short*)block);
  493. s1 += stride;
  494. s2 += stride;
  495. block += 8;
  496. }
  497. }
  498. void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
  499. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  500. int i;
  501. for(i=0; i+7<w; i+=8){
  502. dst[i+0] += src[i+0];
  503. dst[i+1] += src[i+1];
  504. dst[i+2] += src[i+2];
  505. dst[i+3] += src[i+3];
  506. dst[i+4] += src[i+4];
  507. dst[i+5] += src[i+5];
  508. dst[i+6] += src[i+6];
  509. dst[i+7] += src[i+7];
  510. }
  511. for(; i<w; i++)
  512. dst[i+0] += src[i+0];
  513. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  514. register int i;
  515. register vector unsigned char vdst, vsrc;
  516. /* dst and src are 16 bytes-aligned (guaranteed) */
  517. for(i = 0 ; (i + 15) < w ; i += 16)
  518. {
  519. vdst = vec_ld(i, (unsigned char*)dst);
  520. vsrc = vec_ld(i, (unsigned char*)src);
  521. vdst = vec_add(vsrc, vdst);
  522. vec_st(vdst, i, (unsigned char*)dst);
  523. }
  524. /* if w is not a multiple of 16 */
  525. for (; (i < w) ; i++)
  526. {
  527. dst[i] += src[i];
  528. }
  529. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  530. }
  531. /* next one assumes that ((line_size % 16) == 0) */
  532. void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  533. {
  534. POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
  535. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  536. int i;
  537. POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
  538. for(i=0; i<h; i++) {
  539. *((uint32_t*)(block)) = LD32(pixels);
  540. *((uint32_t*)(block+4)) = LD32(pixels+4);
  541. *((uint32_t*)(block+8)) = LD32(pixels+8);
  542. *((uint32_t*)(block+12)) = LD32(pixels+12);
  543. pixels+=line_size;
  544. block +=line_size;
  545. }
  546. POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
  547. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  548. register vector unsigned char pixelsv1, pixelsv2;
  549. register vector unsigned char pixelsv1B, pixelsv2B;
  550. register vector unsigned char pixelsv1C, pixelsv2C;
  551. register vector unsigned char pixelsv1D, pixelsv2D;
  552. register vector unsigned char perm = vec_lvsl(0, pixels);
  553. int i;
  554. register int line_size_2 = line_size << 1;
  555. register int line_size_3 = line_size + line_size_2;
  556. register int line_size_4 = line_size << 2;
  557. POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
  558. // hand-unrolling the loop by 4 gains about 15%
  559. // minimum execution time goes from 74 to 60 cycles
  560. // it's faster than -funroll-loops, but using
  561. // -funroll-loops w/ this is bad - 74 cycles again.
  562. // all this is on a 7450, tuning for the 7450
  563. #if 0
  564. for(i=0; i<h; i++) {
  565. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  566. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  567. vec_st(vec_perm(pixelsv1, pixelsv2, perm),
  568. 0, (unsigned char*)block);
  569. pixels+=line_size;
  570. block +=line_size;
  571. }
  572. #else
  573. for(i=0; i<h; i+=4) {
  574. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  575. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  576. pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
  577. pixelsv2B = vec_ld(16 + line_size, (unsigned char*)pixels);
  578. pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
  579. pixelsv2C = vec_ld(16 + line_size_2, (unsigned char*)pixels);
  580. pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
  581. pixelsv2D = vec_ld(16 + line_size_3, (unsigned char*)pixels);
  582. vec_st(vec_perm(pixelsv1, pixelsv2, perm),
  583. 0, (unsigned char*)block);
  584. vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
  585. line_size, (unsigned char*)block);
  586. vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
  587. line_size_2, (unsigned char*)block);
  588. vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
  589. line_size_3, (unsigned char*)block);
  590. pixels+=line_size_4;
  591. block +=line_size_4;
  592. }
  593. #endif
  594. POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);
  595. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  596. }
  597. /* next one assumes that ((line_size % 16) == 0) */
  598. #define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
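/*
 * Editorial note on op_avg (not from the original source): for unsigned bytes
 * a and b, a + b = 2*(a & b) + (a ^ b), so the rounded-up average
 * (a + b + 1) >> 1 equals (a | b) - ((a ^ b) >> 1). The 0xFEFEFEFE mask clears
 * each byte's low bit before the shift so no bit bleeds into the neighbouring
 * byte, which lets the macro average four packed bytes at once (SWAR).
 * Example for one byte: a = 3, b = 6: (3|6) - (((3^6) & 0xFE) >> 1) = 7 - 2 = 5,
 * which matches (3 + 6 + 1) >> 1 = 5.
 */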
  599. void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  600. {
  601. POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
  602. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  603. int i;
  604. POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
  605. for(i=0; i<h; i++) {
  606. op_avg(*((uint32_t*)(block)),LD32(pixels));
  607. op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
  608. op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
  609. op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
  610. pixels+=line_size;
  611. block +=line_size;
  612. }
  613. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
  614. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  615. register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  616. register vector unsigned char perm = vec_lvsl(0, pixels);
  617. int i;
  618. POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);
  619. for(i=0; i<h; i++) {
  620. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  621. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  622. blockv = vec_ld(0, block);
  623. pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
  624. blockv = vec_avg(blockv,pixelsv);
  625. vec_st(blockv, 0, (unsigned char*)block);
  626. pixels+=line_size;
  627. block +=line_size;
  628. }
  629. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);
  630. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  631. }
  632. /* next one assumes that ((line_size % 8) == 0) */
  633. void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
  634. {
  635. POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
  636. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  637. int i;
  638. POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
  639. for (i = 0; i < h; i++) {
  640. *((uint32_t *) (block)) =
  641. (((*((uint32_t *) (block))) |
  642. ((((const struct unaligned_32 *) (pixels))->l))) -
  643. ((((*((uint32_t *) (block))) ^
  644. ((((const struct unaligned_32 *) (pixels))->
  645. l))) & 0xFEFEFEFEUL) >> 1));
  646. *((uint32_t *) (block + 4)) =
  647. (((*((uint32_t *) (block + 4))) |
  648. ((((const struct unaligned_32 *) (pixels + 4))->l))) -
  649. ((((*((uint32_t *) (block + 4))) ^
  650. ((((const struct unaligned_32 *) (pixels +
  651. 4))->
  652. l))) & 0xFEFEFEFEUL) >> 1));
  653. pixels += line_size;
  654. block += line_size;
  655. }
  656. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
  657. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  658. register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  659. int i;
  660. POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);
  661. for (i = 0; i < h; i++) {
  662. /*
  663. block is 8 bytes-aligned, so we're either in the
  664. left block (16 bytes-aligned) or in the right block (not)
  665. */
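/*
 * Editorial note: vcprm() below builds a word permute. If the 8 destination
 * bytes are the right half of the 16-byte line ("rightside"), 32-bit words
 * 0,1 are kept from blockv and words s0,s1 are taken from the freshly loaded
 * pixels; otherwise the reverse. The half filled with blockv itself is left
 * unchanged by the later vec_avg (avg(x,x) = x), so only the intended 8 bytes
 * are actually averaged and stored.
 */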
  666. int rightside = ((unsigned long)block & 0x0000000F);
  667. blockv = vec_ld(0, block);
  668. pixelsv1 = vec_ld(0, (unsigned char*)pixels);
  669. pixelsv2 = vec_ld(16, (unsigned char*)pixels);
  670. pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
  671. if (rightside)
  672. {
  673. pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
  674. }
  675. else
  676. {
  677. pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
  678. }
  679. blockv = vec_avg(blockv, pixelsv);
  680. vec_st(blockv, 0, block);
  681. pixels += line_size;
  682. block += line_size;
  683. }
  684. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);
  685. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  686. }
  687. /* next one assumes that ((line_size % 8) == 0) */
  688. void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  689. {
  690. POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
  691. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  692. int j;
  693. POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
  694. for (j = 0; j < 2; j++) {
  695. int i;
  696. const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  697. const uint32_t b =
  698. (((const struct unaligned_32 *) (pixels + 1))->l);
  699. uint32_t l0 =
  700. (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
  701. uint32_t h0 =
  702. ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  703. uint32_t l1, h1;
  704. pixels += line_size;
  705. for (i = 0; i < h; i += 2) {
  706. uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  707. uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
  708. l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
  709. h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  710. *((uint32_t *) block) =
  711. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  712. pixels += line_size;
  713. block += line_size;
  714. a = (((const struct unaligned_32 *) (pixels))->l);
  715. b = (((const struct unaligned_32 *) (pixels + 1))->l);
  716. l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
  717. h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  718. *((uint32_t *) block) =
  719. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  720. pixels += line_size;
  721. block += line_size;
  722. } pixels += 4 - line_size * (h + 1);
  723. block += 4 - line_size * h;
  724. }
  725. POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
  726. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  727. register int i;
  728. register vector unsigned char
  729. pixelsv1, pixelsv2,
  730. pixelsavg;
  731. register vector unsigned char
  732. blockv, temp1, temp2;
  733. register vector unsigned short
  734. pixelssum1, pixelssum2, temp3;
  735. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  736. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  737. temp1 = vec_ld(0, pixels);
  738. temp2 = vec_ld(16, pixels);
  739. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  740. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  741. {
  742. pixelsv2 = temp2;
  743. }
  744. else
  745. {
  746. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  747. }
  748. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  749. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  750. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  751. (vector unsigned short)pixelsv2);
  752. pixelssum1 = vec_add(pixelssum1, vctwo);
  753. POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
  754. for (i = 0; i < h ; i++) {
  755. int rightside = ((unsigned long)block & 0x0000000F);
  756. blockv = vec_ld(0, block);
  757. temp1 = vec_ld(line_size, pixels);
  758. temp2 = vec_ld(line_size + 16, pixels);
  759. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  760. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  761. {
  762. pixelsv2 = temp2;
  763. }
  764. else
  765. {
  766. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  767. }
  768. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  769. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  770. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  771. (vector unsigned short)pixelsv2);
  772. temp3 = vec_add(pixelssum1, pixelssum2);
  773. temp3 = vec_sra(temp3, vctwo);
  774. pixelssum1 = vec_add(pixelssum2, vctwo);
  775. pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  776. if (rightside)
  777. {
  778. blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  779. }
  780. else
  781. {
  782. blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  783. }
  784. vec_st(blockv, 0, block);
  785. block += line_size;
  786. pixels += line_size;
  787. }
  788. POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
  789. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  790. }
  791. /* next one assumes that ((line_size % 8) == 0) */
  792. void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  793. {
  794. POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
  795. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  796. int j;
  797. POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
  798. for (j = 0; j < 2; j++) {
  799. int i;
  800. const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  801. const uint32_t b =
  802. (((const struct unaligned_32 *) (pixels + 1))->l);
  803. uint32_t l0 =
  804. (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
  805. uint32_t h0 =
  806. ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  807. uint32_t l1, h1;
  808. pixels += line_size;
  809. for (i = 0; i < h; i += 2) {
  810. uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  811. uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
  812. l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
  813. h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  814. *((uint32_t *) block) =
  815. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  816. pixels += line_size;
  817. block += line_size;
  818. a = (((const struct unaligned_32 *) (pixels))->l);
  819. b = (((const struct unaligned_32 *) (pixels + 1))->l);
  820. l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
  821. h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  822. *((uint32_t *) block) =
  823. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  824. pixels += line_size;
  825. block += line_size;
  826. } pixels += 4 - line_size * (h + 1);
  827. block += 4 - line_size * h;
  828. }
  829. POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
  830. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  831. register int i;
  832. register vector unsigned char
  833. pixelsv1, pixelsv2,
  834. pixelsavg;
  835. register vector unsigned char
  836. blockv, temp1, temp2;
  837. register vector unsigned short
  838. pixelssum1, pixelssum2, temp3;
  839. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  840. register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
  841. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  842. temp1 = vec_ld(0, pixels);
  843. temp2 = vec_ld(16, pixels);
  844. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  845. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  846. {
  847. pixelsv2 = temp2;
  848. }
  849. else
  850. {
  851. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  852. }
  853. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  854. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  855. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  856. (vector unsigned short)pixelsv2);
  857. pixelssum1 = vec_add(pixelssum1, vcone);
  858. POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
  859. for (i = 0; i < h ; i++) {
  860. int rightside = ((unsigned long)block & 0x0000000F);
  861. blockv = vec_ld(0, block);
  862. temp1 = vec_ld(line_size, pixels);
  863. temp2 = vec_ld(line_size + 16, pixels);
  864. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  865. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  866. {
  867. pixelsv2 = temp2;
  868. }
  869. else
  870. {
  871. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  872. }
  873. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  874. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  875. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  876. (vector unsigned short)pixelsv2);
  877. temp3 = vec_add(pixelssum1, pixelssum2);
  878. temp3 = vec_sra(temp3, vctwo);
  879. pixelssum1 = vec_add(pixelssum2, vcone);
  880. pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  881. if (rightside)
  882. {
  883. blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  884. }
  885. else
  886. {
  887. blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  888. }
  889. vec_st(blockv, 0, block);
  890. block += line_size;
  891. pixels += line_size;
  892. }
  893. POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
  894. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  895. }
  896. /* next one assumes that ((line_size % 16) == 0) */
  897. void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
  898. {
  899. POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
  900. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  901. int j;
  902. POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
  903. for (j = 0; j < 4; j++) {
  904. int i;
  905. const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  906. const uint32_t b =
  907. (((const struct unaligned_32 *) (pixels + 1))->l);
  908. uint32_t l0 =
  909. (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
  910. uint32_t h0 =
  911. ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  912. uint32_t l1, h1;
  913. pixels += line_size;
  914. for (i = 0; i < h; i += 2) {
  915. uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  916. uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
  917. l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
  918. h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  919. *((uint32_t *) block) =
  920. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  921. pixels += line_size;
  922. block += line_size;
  923. a = (((const struct unaligned_32 *) (pixels))->l);
  924. b = (((const struct unaligned_32 *) (pixels + 1))->l);
  925. l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
  926. h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  927. *((uint32_t *) block) =
  928. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  929. pixels += line_size;
  930. block += line_size;
  931. } pixels += 4 - line_size * (h + 1);
  932. block += 4 - line_size * h;
  933. }
  934. POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
  935. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  936. register int i;
  937. register vector unsigned char
  938. pixelsv1, pixelsv2, pixelsv3, pixelsv4;
  939. register vector unsigned char
  940. blockv, temp1, temp2;
  941. register vector unsigned short
  942. pixelssum1, pixelssum2, temp3,
  943. pixelssum3, pixelssum4, temp4;
  944. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  945. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  946. POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);
  947. temp1 = vec_ld(0, pixels);
  948. temp2 = vec_ld(16, pixels);
  949. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  950. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  951. {
  952. pixelsv2 = temp2;
  953. }
  954. else
  955. {
  956. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  957. }
  958. pixelsv3 = vec_mergel(vczero, pixelsv1);
  959. pixelsv4 = vec_mergel(vczero, pixelsv2);
  960. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  961. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  962. pixelssum3 = vec_add((vector unsigned short)pixelsv3,
  963. (vector unsigned short)pixelsv4);
  964. pixelssum3 = vec_add(pixelssum3, vctwo);
  965. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  966. (vector unsigned short)pixelsv2);
  967. pixelssum1 = vec_add(pixelssum1, vctwo);
  968. for (i = 0; i < h ; i++) {
  969. blockv = vec_ld(0, block);
  970. temp1 = vec_ld(line_size, pixels);
  971. temp2 = vec_ld(line_size + 16, pixels);
  972. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  973. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  974. {
  975. pixelsv2 = temp2;
  976. }
  977. else
  978. {
  979. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  980. }
  981. pixelsv3 = vec_mergel(vczero, pixelsv1);
  982. pixelsv4 = vec_mergel(vczero, pixelsv2);
  983. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  984. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  985. pixelssum4 = vec_add((vector unsigned short)pixelsv3,
  986. (vector unsigned short)pixelsv4);
  987. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  988. (vector unsigned short)pixelsv2);
  989. temp4 = vec_add(pixelssum3, pixelssum4);
  990. temp4 = vec_sra(temp4, vctwo);
  991. temp3 = vec_add(pixelssum1, pixelssum2);
  992. temp3 = vec_sra(temp3, vctwo);
  993. pixelssum3 = vec_add(pixelssum4, vctwo);
  994. pixelssum1 = vec_add(pixelssum2, vctwo);
  995. blockv = vec_packsu(temp3, temp4);
  996. vec_st(blockv, 0, block);
  997. block += line_size;
  998. pixels += line_size;
  999. }
  1000. POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
  1001. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  1002. }
  1003. /* next one assumes that ((line_size % 16) == 0) */
  1004. void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
  1005. {
  1006. POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
  1007. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  1008. int j;
  1009. POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
  1010. for (j = 0; j < 4; j++) {
  1011. int i;
  1012. const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  1013. const uint32_t b =
  1014. (((const struct unaligned_32 *) (pixels + 1))->l);
  1015. uint32_t l0 =
  1016. (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
  1017. uint32_t h0 =
  1018. ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  1019. uint32_t l1, h1;
  1020. pixels += line_size;
  1021. for (i = 0; i < h; i += 2) {
  1022. uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  1023. uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
  1024. l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
  1025. h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  1026. *((uint32_t *) block) =
  1027. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  1028. pixels += line_size;
  1029. block += line_size;
  1030. a = (((const struct unaligned_32 *) (pixels))->l);
  1031. b = (((const struct unaligned_32 *) (pixels + 1))->l);
  1032. l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
  1033. h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  1034. *((uint32_t *) block) =
  1035. h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
  1036. pixels += line_size;
  1037. block += line_size;
  1038. } pixels += 4 - line_size * (h + 1);
  1039. block += 4 - line_size * h;
  1040. }
  1041. POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
  1042. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  1043. register int i;
  1044. register vector unsigned char
  1045. pixelsv1, pixelsv2, pixelsv3, pixelsv4;
  1046. register vector unsigned char
  1047. blockv, temp1, temp2;
  1048. register vector unsigned short
  1049. pixelssum1, pixelssum2, temp3,
  1050. pixelssum3, pixelssum4, temp4;
  1051. register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
  1052. register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
  1053. register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
  1054. POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
  1055. temp1 = vec_ld(0, pixels);
  1056. temp2 = vec_ld(16, pixels);
  1057. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
  1058. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
  1059. {
  1060. pixelsv2 = temp2;
  1061. }
  1062. else
  1063. {
  1064. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  1065. }
  1066. pixelsv3 = vec_mergel(vczero, pixelsv1);
  1067. pixelsv4 = vec_mergel(vczero, pixelsv2);
  1068. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  1069. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  1070. pixelssum3 = vec_add((vector unsigned short)pixelsv3,
  1071. (vector unsigned short)pixelsv4);
  1072. pixelssum3 = vec_add(pixelssum3, vcone);
  1073. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  1074. (vector unsigned short)pixelsv2);
  1075. pixelssum1 = vec_add(pixelssum1, vcone);
  1076. for (i = 0; i < h ; i++) {
  1077. blockv = vec_ld(0, block);
  1078. temp1 = vec_ld(line_size, pixels);
  1079. temp2 = vec_ld(line_size + 16, pixels);
  1080. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  1081. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  1082. {
  1083. pixelsv2 = temp2;
  1084. }
  1085. else
  1086. {
  1087. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  1088. }
  1089. pixelsv3 = vec_mergel(vczero, pixelsv1);
  1090. pixelsv4 = vec_mergel(vczero, pixelsv2);
  1091. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  1092. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  1093. pixelssum4 = vec_add((vector unsigned short)pixelsv3,
  1094. (vector unsigned short)pixelsv4);
  1095. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  1096. (vector unsigned short)pixelsv2);
  1097. temp4 = vec_add(pixelssum3, pixelssum4);
  1098. temp4 = vec_sra(temp4, vctwo);
  1099. temp3 = vec_add(pixelssum1, pixelssum2);
  1100. temp3 = vec_sra(temp3, vctwo);
  1101. pixelssum3 = vec_add(pixelssum4, vcone);
  1102. pixelssum1 = vec_add(pixelssum2, vcone);
  1103. blockv = vec_packsu(temp3, temp4);
  1104. vec_st(blockv, 0, block);
  1105. block += line_size;
  1106. pixels += line_size;
  1107. }
  1108. POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
  1109. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  1110. }
  1111. int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
  1112. POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
  1113. int sum;
  1114. register const_vector unsigned char vzero = (const_vector unsigned char)vec_splat_u8(0);
  1115. register vector signed short temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  1116. POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
  1117. {
  1118. register const_vector signed short vprod1 = (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
  1119. register const_vector signed short vprod2 = (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
  1120. register const_vector signed short vprod3 = (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
  1121. register const_vector unsigned char perm1 = (const_vector unsigned char)
  1122. AVV(0x02, 0x03, 0x00, 0x01,
  1123. 0x06, 0x07, 0x04, 0x05,
  1124. 0x0A, 0x0B, 0x08, 0x09,
  1125. 0x0E, 0x0F, 0x0C, 0x0D);
  1126. register const_vector unsigned char perm2 = (const_vector unsigned char)
  1127. AVV(0x04, 0x05, 0x06, 0x07,
  1128. 0x00, 0x01, 0x02, 0x03,
  1129. 0x0C, 0x0D, 0x0E, 0x0F,
  1130. 0x08, 0x09, 0x0A, 0x0B);
  1131. register const_vector unsigned char perm3 = (const_vector unsigned char)
  1132. AVV(0x08, 0x09, 0x0A, 0x0B,
  1133. 0x0C, 0x0D, 0x0E, 0x0F,
  1134. 0x00, 0x01, 0x02, 0x03,
  1135. 0x04, 0x05, 0x06, 0x07);
  1136. #define ONEITERBUTTERFLY(i, res) \
  1137. { \
  1138. register vector unsigned char src1, src2, srcO; \
  1139. register vector unsigned char dst1, dst2, dstO; \
  1140. register vector signed short srcV, dstV; \
  1141. register vector signed short but0, but1, but2, op1, op2, op3; \
  1142. src1 = vec_ld(stride * i, src); \
  1143. if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
  1144. src2 = vec_ld((stride * i) + 16, src); \
  1145. srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
  1146. dst1 = vec_ld(stride * i, dst); \
  1147. if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
  1148. dst2 = vec_ld((stride * i) + 16, dst); \
  1149. dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
  1150. /* promote the unsigned chars to signed shorts */ \
  1151. /* we're in the 8x8 function, we only care for the first 8 */ \
  1152. srcV = \
  1153. (vector signed short)vec_mergeh((vector signed char)vzero, \
  1154. (vector signed char)srcO); \
  1155. dstV = \
  1156. (vector signed short)vec_mergeh((vector signed char)vzero, \
  1157. (vector signed char)dstO); \
  1158. /* subtractions inside the first butterfly */ \
  1159. but0 = vec_sub(srcV, dstV); \
  1160. op1 = vec_perm(but0, but0, perm1); \
  1161. but1 = vec_mladd(but0, vprod1, op1); \
  1162. op2 = vec_perm(but1, but1, perm2); \
  1163. but2 = vec_mladd(but1, vprod2, op2); \
  1164. op3 = vec_perm(but2, but2, perm3); \
  1165. res = vec_mladd(but2, vprod3, op3); \
  1166. }
  1167. ONEITERBUTTERFLY(0, temp0);
  1168. ONEITERBUTTERFLY(1, temp1);
  1169. ONEITERBUTTERFLY(2, temp2);
  1170. ONEITERBUTTERFLY(3, temp3);
  1171. ONEITERBUTTERFLY(4, temp4);
  1172. ONEITERBUTTERFLY(5, temp5);
  1173. ONEITERBUTTERFLY(6, temp6);
  1174. ONEITERBUTTERFLY(7, temp7);
  1175. }
  1176. #undef ONEITERBUTTERFLY
  1177. {
  1178. register vector signed int vsum;
  1179. register vector signed short line0 = vec_add(temp0, temp1);
  1180. register vector signed short line1 = vec_sub(temp0, temp1);
  1181. register vector signed short line2 = vec_add(temp2, temp3);
  1182. register vector signed short line3 = vec_sub(temp2, temp3);
  1183. register vector signed short line4 = vec_add(temp4, temp5);
  1184. register vector signed short line5 = vec_sub(temp4, temp5);
  1185. register vector signed short line6 = vec_add(temp6, temp7);
  1186. register vector signed short line7 = vec_sub(temp6, temp7);
  1187. register vector signed short line0B = vec_add(line0, line2);
  1188. register vector signed short line2B = vec_sub(line0, line2);
  1189. register vector signed short line1B = vec_add(line1, line3);
  1190. register vector signed short line3B = vec_sub(line1, line3);
  1191. register vector signed short line4B = vec_add(line4, line6);
  1192. register vector signed short line6B = vec_sub(line4, line6);
  1193. register vector signed short line5B = vec_add(line5, line7);
  1194. register vector signed short line7B = vec_sub(line5, line7);
  1195. register vector signed short line0C = vec_add(line0B, line4B);
  1196. register vector signed short line4C = vec_sub(line0B, line4B);
  1197. register vector signed short line1C = vec_add(line1B, line5B);
  1198. register vector signed short line5C = vec_sub(line1B, line5B);
  1199. register vector signed short line2C = vec_add(line2B, line6B);
  1200. register vector signed short line6C = vec_sub(line2B, line6B);
  1201. register vector signed short line3C = vec_add(line3B, line7B);
  1202. register vector signed short line7C = vec_sub(line3B, line7B);
  1203. vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
  1204. vsum = vec_sum4s(vec_abs(line1C), vsum);
  1205. vsum = vec_sum4s(vec_abs(line2C), vsum);
  1206. vsum = vec_sum4s(vec_abs(line3C), vsum);
  1207. vsum = vec_sum4s(vec_abs(line4C), vsum);
  1208. vsum = vec_sum4s(vec_abs(line5C), vsum);
  1209. vsum = vec_sum4s(vec_abs(line6C), vsum);
  1210. vsum = vec_sum4s(vec_abs(line7C), vsum);
  1211. vsum = vec_sums(vsum, (vector signed int)vzero);
  1212. vsum = vec_splat(vsum, 3);
  1213. vec_ste(vsum, 0, &sum);
  1214. }
  1215. POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
  1216. return sum;
  1217. }
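/*
 * Editorial sketch (not part of the original file): a compact plain-C
 * reference for what hadamard8_diff8x8_altivec computes: an 8-point Hadamard
 * transform of each row of the (src - dst) block, the same transform down
 * each column, then the sum of absolute values of all coefficients. The
 * function name is a hypothetical addition.
 */
static int hadamard8_diff8x8_scalar_ref(uint8_t *dst, uint8_t *src, int stride)
{
    int d[8][8], i, j, k, sum = 0;

    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            d[i][j] = src[i * stride + j] - dst[i * stride + j];

    /* three butterfly passes across each row, then across each column */
    for (k = 1; k < 8; k <<= 1)
        for (i = 0; i < 8; i++)
            for (j = 0; j < 8; j++)
                if ((j & k) == 0) {
                    int a = d[i][j], b = d[i][j + k];
                    d[i][j]     = a + b;
                    d[i][j + k] = a - b;
                }
    for (k = 1; k < 8; k <<= 1)
        for (j = 0; j < 8; j++)
            for (i = 0; i < 8; i++)
                if ((i & k) == 0) {
                    int a = d[i][j], b = d[i + k][j];
                    d[i][j]     = a + b;
                    d[i + k][j] = a - b;
                }

    for (i = 0; i < 8; i++)
        for (j = 0; j < 8; j++)
            sum += d[i][j] < 0 ? -d[i][j] : d[i][j];
    return sum;
}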
  1218. /*
  1219. 16x8 works with 16 elements; it avoids replicating loads
  1220. and gives the compiler more room for scheduling.
  1221. It's only used from inside hadamard8_diff16_altivec.
  1222. Unfortunately, gcc-3.3 is a bit dumb and the compiled code
  1223. has a LOT of spill code; it seems gcc (unlike xlc) cannot
  1224. keep everything in registers by itself. The following code
  1225. therefore includes hand-made register allocation. It's not
  1226. clean, but on a 7450 the resulting code is much faster
  1227. (the best case falls from 700+ cycles to 550).
  1228. xlc doesn't add spill code, but it doesn't know how to
  1229. schedule for the 7450, and its code isn't much faster than
  1230. gcc-3.3's on the 7450 (though it uses 25% fewer
  1231. instructions).
  1232. On the 970, the hand-made register allocation is still a
  1233. win (around 690 cycles vs. around 780), but there xlc gets
  1234. down to around 660 on the regular C code.
  1235. */
  1236. static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
  1237. int sum;
  1238. register vector signed short
  1239. temp0 REG_v(v0),
  1240. temp1 REG_v(v1),
  1241. temp2 REG_v(v2),
  1242. temp3 REG_v(v3),
  1243. temp4 REG_v(v4),
  1244. temp5 REG_v(v5),
  1245. temp6 REG_v(v6),
  1246. temp7 REG_v(v7);
  1247. register vector signed short
  1248. temp0S REG_v(v8),
  1249. temp1S REG_v(v9),
  1250. temp2S REG_v(v10),
  1251. temp3S REG_v(v11),
  1252. temp4S REG_v(v12),
  1253. temp5S REG_v(v13),
  1254. temp6S REG_v(v14),
  1255. temp7S REG_v(v15);
  1256. register const_vector unsigned char vzero REG_v(v31)= (const_vector unsigned char)vec_splat_u8(0);
  1257. {
  1258. register const_vector signed short vprod1 REG_v(v16)= (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
  1259. register const_vector signed short vprod2 REG_v(v17)= (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
  1260. register const_vector signed short vprod3 REG_v(v18)= (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
  1261. register const_vector unsigned char perm1 REG_v(v19)= (const_vector unsigned char)
  1262. AVV(0x02, 0x03, 0x00, 0x01,
  1263. 0x06, 0x07, 0x04, 0x05,
  1264. 0x0A, 0x0B, 0x08, 0x09,
  1265. 0x0E, 0x0F, 0x0C, 0x0D);
  1266. register const_vector unsigned char perm2 REG_v(v20)= (const_vector unsigned char)
  1267. AVV(0x04, 0x05, 0x06, 0x07,
  1268. 0x00, 0x01, 0x02, 0x03,
  1269. 0x0C, 0x0D, 0x0E, 0x0F,
  1270. 0x08, 0x09, 0x0A, 0x0B);
  1271. register const_vector unsigned char perm3 REG_v(v21)= (const_vector unsigned char)
  1272. AVV(0x08, 0x09, 0x0A, 0x0B,
  1273. 0x0C, 0x0D, 0x0E, 0x0F,
  1274. 0x00, 0x01, 0x02, 0x03,
  1275. 0x04, 0x05, 0x06, 0x07);
  1276. #define ONEITERBUTTERFLY(i, res1, res2) \
  1277. { \
  1278. register vector unsigned char src1 REG_v(v22), \
  1279. src2 REG_v(v23), \
  1280. dst1 REG_v(v24), \
  1281. dst2 REG_v(v25), \
  1282. srcO REG_v(v22), \
  1283. dstO REG_v(v23); \
  1284. \
  1285. register vector signed short srcV REG_v(v24), \
  1286. dstV REG_v(v25), \
  1287. srcW REG_v(v26), \
  1288. dstW REG_v(v27), \
  1289. but0 REG_v(v28), \
  1290. but0S REG_v(v29), \
  1291. op1 REG_v(v30), \
  1292. but1 REG_v(v22), \
  1293. op1S REG_v(v23), \
  1294. but1S REG_v(v24), \
  1295. op2 REG_v(v25), \
  1296. but2 REG_v(v26), \
  1297. op2S REG_v(v27), \
  1298. but2S REG_v(v28), \
  1299. op3 REG_v(v29), \
  1300. op3S REG_v(v30); \
  1301. \
  1302. src1 = vec_ld(stride * i, src); \
  1303. src2 = vec_ld((stride * i) + 16, src); \
  1304. srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
  1305. dst1 = vec_ld(stride * i, dst); \
  1306. dst2 = vec_ld((stride * i) + 16, dst); \
  1307. dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
  1308. /* promote the unsigned chars to signed shorts */ \
  1309. srcV = \
  1310. (vector signed short)vec_mergeh((vector signed char)vzero, \
  1311. (vector signed char)srcO); \
  1312. dstV = \
  1313. (vector signed short)vec_mergeh((vector signed char)vzero, \
  1314. (vector signed char)dstO); \
  1315. srcW = \
  1316. (vector signed short)vec_mergel((vector signed char)vzero, \
  1317. (vector signed char)srcO); \
  1318. dstW = \
  1319. (vector signed short)vec_mergel((vector signed char)vzero, \
  1320. (vector signed char)dstO); \
  1321. /* subtractions inside the first butterfly */ \
        but0 = vec_sub(srcV, dstV); \
        but0S = vec_sub(srcW, dstW); \
        op1 = vec_perm(but0, but0, perm1); \
        but1 = vec_mladd(but0, vprod1, op1); \
        op1S = vec_perm(but0S, but0S, perm1); \
        but1S = vec_mladd(but0S, vprod1, op1S); \
        op2 = vec_perm(but1, but1, perm2); \
        but2 = vec_mladd(but1, vprod2, op2); \
        op2S = vec_perm(but1S, but1S, perm2); \
        but2S = vec_mladd(but1S, vprod2, op2S); \
        op3 = vec_perm(but2, but2, perm3); \
        res1 = vec_mladd(but2, vprod3, op3); \
        op3S = vec_perm(but2S, but2S, perm3); \
        res2 = vec_mladd(but2S, vprod3, op3S); \
    }
    ONEITERBUTTERFLY(0, temp0, temp0S);
    ONEITERBUTTERFLY(1, temp1, temp1S);
    ONEITERBUTTERFLY(2, temp2, temp2S);
    ONEITERBUTTERFLY(3, temp3, temp3S);
    ONEITERBUTTERFLY(4, temp4, temp4S);
    ONEITERBUTTERFLY(5, temp5, temp5S);
    ONEITERBUTTERFLY(6, temp6, temp6S);
    ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
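    /* Second pass: the rows are already transformed horizontally, so the
       same three butterfly stages applied to whole row vectors (lineN,
       lineNB, lineNC) perform the vertical transform.  The absolute values
       of all coefficients are accumulated with vec_abs/vec_sum4s, the *S
       variables covering the right 8x8 half, and vec_sums, vec_splat and
       vec_ste reduce the four partial sums to the scalar result. */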
    {
    register vector signed int vsum;
    register vector signed short line0S, line1S, line2S, line3S, line4S,
                                 line5S, line6S, line7S, line0BS, line2BS,
                                 line1BS, line3BS, line4BS, line6BS, line5BS,
                                 line7BS, line0CS, line4CS, line1CS, line5CS,
                                 line2CS, line6CS, line3CS, line7CS;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);
    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);
    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);
    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    line0S = vec_add(temp0S, temp1S);
    line1S = vec_sub(temp0S, temp1S);
    line2S = vec_add(temp2S, temp3S);
    line3S = vec_sub(temp2S, temp3S);
    line4S = vec_add(temp4S, temp5S);
    line5S = vec_sub(temp4S, temp5S);
    line6S = vec_add(temp6S, temp7S);
    line7S = vec_sub(temp6S, temp7S);
    line0BS = vec_add(line0S, line2S);
    line2BS = vec_sub(line0S, line2S);
    line1BS = vec_add(line1S, line3S);
    line3BS = vec_sub(line1S, line3S);
    line4BS = vec_add(line4S, line6S);
    line6BS = vec_sub(line4S, line6S);
    line5BS = vec_add(line5S, line7S);
    line7BS = vec_sub(line5S, line7S);
    line0CS = vec_add(line0BS, line4BS);
    line4CS = vec_sub(line0BS, line4BS);
    line1CS = vec_add(line1BS, line5BS);
    line5CS = vec_sub(line1BS, line5BS);
    line2CS = vec_add(line2BS, line6BS);
    line6CS = vec_sub(line2BS, line6BS);
    line3CS = vec_add(line3BS, line7BS);
    line7CS = vec_sub(line3BS, line7BS);
    vsum = vec_sum4s(vec_abs(line0CS), vsum);
    vsum = vec_sum4s(vec_abs(line1CS), vsum);
    vsum = vec_sum4s(vec_abs(line2CS), vsum);
    vsum = vec_sum4s(vec_abs(line3CS), vsum);
    vsum = vec_sum4s(vec_abs(line4CS), vsum);
    vsum = vec_sum4s(vec_abs(line5CS), vsum);
    vsum = vec_sum4s(vec_abs(line6CS), vsum);
    vsum = vec_sum4s(vec_abs(line7CS), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    return sum;
}
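/* Top-level Hadamard-SATD comparison for a 16-pixel-wide block: it
   transforms the top 8 rows and, when h == 16, runs the 16x8 helper again
   on the bottom 8 rows. */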
int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
    int score;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h==16) {
        dst += 8*stride;
        src += 8*stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
    return score;
}
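/* Runtime AltiVec detection.  Three strategies: the AmigaOS4 exec
   interface, the CTL_HW/HW_VECTORUNIT sysctl on Darwin, and, on other
   systems, a brute-force probe that executes an AltiVec instruction under
   a SIGILL handler. */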
int has_altivec(void)
{
#ifdef __AMIGAOS4__
    ULONG result = 0;
    extern struct ExecIFace *IExec;
    IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
    if (result == VECTORTYPE_ALTIVEC) return 1;
    return 0;
#else /* __AMIGAOS4__ */
#ifdef CONFIG_DARWIN
    int sels[2] = {CTL_HW, HW_VECTORUNIT};
    int has_vu = 0;
    size_t len = sizeof(has_vu);
    int err;
    err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
    if (err == 0) return (has_vu != 0);
#else /* CONFIG_DARWIN */
    /* no Darwin, do it the brute-force way */
    /* this is borrowed from the libmpeg2 library */
    {
        signal (SIGILL, sigill_handler);
        if (sigsetjmp (jmpbuf, 1)) {
            signal (SIGILL, SIG_DFL);
        } else {
            canjump = 1;
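            /* Enable vector use by writing VRSAVE (SPR 256) and execute a
               harmless AltiVec instruction.  On a CPU without AltiVec this
               raises SIGILL; sigill_handler() then longjmps back to the
               sigsetjmp above and we fall through to return 0. */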
            asm volatile ("mtspr 256, %0\n\t"
                          "vand %%v0, %%v0, %%v0"
                          :
                          : "r" (-1));
            signal (SIGILL, SIG_DFL);
            return 1;
        }
    }
#endif /* CONFIG_DARWIN */
    return 0;
#endif /* __AMIGAOS4__ */
}
/* next one assumes that ((line_size % 8) == 0) */
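/* avg_pixels8_xy2: for each of the 8 bytes per output row, compute the
   half-pel interpolation (A + B + C + D + 2) >> 2 of the four
   neighbouring source pixels and average it with the byte already stored
   in block (rnd_avg32 / vec_avg round upwards). */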
void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
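    /* Scalar reference: each byte is split into its low 2 bits (l) and
       high 6 bits (h) so that four pixels plus the rounding constant can
       be summed within one 32-bit word without carries crossing byte
       borders; h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0F) equals
       (A + B + C + D + 2) >> 2 for each byte. */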
    int j;
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;
        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2, blocktemp;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
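    /* Special case: when pixels & 0xF == 0xF, (pixels + 1) is 16-byte
       aligned, so vec_lvsl(1, pixels) would give a zero shift and the
       permute would return temp1; the bytes we want are exactly temp2.
       The same test is repeated inside the loop for pixels + line_size. */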
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);
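    /* pixelssum1 now holds pix[x] + pix[x+1] + 2 (as 16-bit sums) for the
       first source row; each loop iteration adds the next row's horizontal
       sums, shifts right by 2 to get the rounded four-pixel average, and
       carries the new sums (plus the rounding 2) forward in pixelssum1. */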
    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
        if (rightside)
        {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }
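        /* blocktemp now has the 8 averaged bytes in whichever half of the
           16-byte line block points into; averaging with blockv and
           storing the whole vector leaves the other 8 bytes unchanged. */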
        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);
        block += line_size;
        pixels += line_size;
    }
    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}