/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../dsputil.h"

#include "gcc_fixes.h"

#include "dsputil_altivec.h"

#ifdef CONFIG_DARWIN
#include <sys/sysctl.h>
#else /* CONFIG_DARWIN */
#ifdef __AMIGAOS4__
#include <exec/exec.h>
#include <interfaces/exec.h>
#include <proto/exec.h>
#else /* __AMIGAOS4__ */
#include <signal.h>
#include <setjmp.h>
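
/* Note: on platforms without a sysctl or exec.library query, AltiVec support
   is usually detected by executing a vector instruction and catching the
   SIGILL it raises on non-AltiVec CPUs; the sigsetjmp/siglongjmp machinery
   below is the customary escape hatch for that probe (presumably used by the
   runtime-detection code elsewhere in this file). */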

static sigjmp_buf jmpbuf;
static volatile sig_atomic_t canjump = 0;

static void sigill_handler (int sig)
{
    if (!canjump) {
        signal (sig, SIG_DFL);
        raise (sig);
    }

    canjump = 0;
    siglongjmp (jmpbuf, 1);
}
#endif /* __AMIGAOS4__ */
#endif /* CONFIG_DARWIN */

int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
        */
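        /* Note: the vec_lvsl/vec_perm pair below is the standard AltiVec idiom
           for an unaligned load: vec_lvsl builds a permute mask from the low
           four bits of the address, and vec_perm applied to the two aligned
           quadwords that straddle the data reassembles the 16 unaligned
           bytes. */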
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    uint8_t *pix3 = pix2 + line_size;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    const_vector unsigned short two = (const_vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);
    s = 0;

    /*
       Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting, and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts
    */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for(i=0;i<h;i++) {
        /*
           Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16]
        */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /*
           Note that Altivec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b),avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts,
           and do the averaging by hand.
        */
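        /* Note, worked example of the rounding issue: for the four pixels
           (3,0,0,1) the correctly rounded average is (3+0+0+1+2)>>2 = 1,
           while the nested form avg(avg(3,0), avg(0,1)) = avg(2,1) = 2,
           because each vec_avg rounds up. Hence the 16-bit sums with an
           explicit +2 bias below. */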

        /* Split the pixel vectors into shorts */
        pix3hv = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);
        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);
    s = 0;

    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for an 8x8 block.
 * AltiVec-enhanced.
 * It's the sad8_altivec code above w/ squaring added.
 */
int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char)AVV(255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

/**
 * Sum of Squared Errors for a 16x16 block.
 * AltiVec-enhanced.
 * It's the sad16_altivec code above w/ squaring added.
 */
int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s __attribute__((aligned(16)));
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for(i=0;i<h;i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /*
          Since we want to use unsigned chars, we can take advantage
          of the fact that abs(a-b)^2 = (a-b)^2.
        */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const_vector unsigned int zero = (const_vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s __attribute__((aligned(16)));

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for(i=0;i<8;i++)
    {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}

void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                         const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const_vector unsigned char zero = (const_vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for(i=0;i<4;i++)
    {
        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual
        // unroll.

        // Read potentially unaligned pixels
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // convert the bytes into shorts
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction
        shorts1 = vec_sub(shorts1, shorts2);

        // save the data to the block, we assume the block is 16-byte aligned
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w) {
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;
    for(i=0; i+7<w; i+=8){
        dst[i+0] += src[i+0];
        dst[i+1] += src[i+1];
        dst[i+2] += src[i+2];
        dst[i+3] += src[i+3];
        dst[i+4] += src[i+4];
        dst[i+5] += src[i+5];
        dst[i+6] += src[i+6];
        dst[i+7] += src[i+7];
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for(i = 0 ; (i + 15) < w ; i+=16)
    {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* if w is not a multiple of 16 */
    for (; (i < w) ; i++)
    {
        dst[i] += src[i];
    }
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);

    for(i=0; i<h; i++) {
        *((uint32_t*)(block)) = LD32(pixels);
        *((uint32_t*)(block+4)) = LD32(pixels+4);
        *((uint32_t*)(block+8)) = LD32(pixels+8);
        *((uint32_t*)(block+12)) = LD32(pixels+12);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_num, 1);
    // hand-unrolling the loop by 4 gains about 15%
    // minimum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
#if 0
    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }
#else
    for(i=0; i<h; i+=4) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(15, (unsigned char*)pixels);
        pixelsv1B = vec_ld(line_size, (unsigned char*)pixels);
        pixelsv2B = vec_ld(15 + line_size, (unsigned char*)pixels);
        pixelsv1C = vec_ld(line_size_2, (unsigned char*)pixels);
        pixelsv2C = vec_ld(15 + line_size_2, (unsigned char*)pixels);
        pixelsv1D = vec_ld(line_size_3, (unsigned char*)pixels);
        pixelsv2D = vec_ld(15 + line_size_3, (unsigned char*)pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
#endif
    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
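/* Note: op_avg is the usual SWAR trick for a packed, rounded-up byte-wise
   average. Per byte, (a|b) - (((a^b) & 0xFE) >> 1) equals (a+b+1) >> 1; the
   0xFEFEFEFE mask keeps the shifted bits from leaking across byte lanes, so
   four pixels are averaged in one 32-bit operation. */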

void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels16_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        op_avg(*((uint32_t*)(block)),LD32(pixels));
        op_avg(*((uint32_t*)(block+4)),LD32(pixels+4));
        op_avg(*((uint32_t*)(block+8)),LD32(pixels+8));
        op_avg(*((uint32_t*)(block+12)),LD32(pixels+12));
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels16_num, 1);

    for(i=0; i<h; i++) {
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv,pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels+=line_size;
        block +=line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels16_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_avg_pixels8_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        *((uint32_t *) (block)) =
            (((*((uint32_t *) (block))) |
              ((((const struct unaligned_32 *) (pixels))->l))) -
             ((((*((uint32_t *) (block))) ^
                ((((const struct unaligned_32 *) (pixels))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        *((uint32_t *) (block + 4)) =
            (((*((uint32_t *) (block + 4))) |
              ((((const struct unaligned_32 *) (pixels + 4))->l))) -
             ((((*((uint32_t *) (block + 4))) ^
                ((((const struct unaligned_32 *) (pixels +
                                                  4))->
                  l))) & 0xFEFEFEFEUL) >> 1));
        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    POWERPC_PERF_START_COUNT(altivec_avg_pixels8_num, 1);

    for (i = 0; i < h; i++) {
        /*
           block is 8 bytes-aligned, so we're either in the
           left block (16 bytes-aligned) or in the right block (not)
        */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld(0, (unsigned char*)pixels);
        pixelsv2 = vec_ld(16, (unsigned char*)pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside)
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        }
        else
        {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_num, 1);

#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    POWERPC_PERF_START_COUNT(altivec_put_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 8) == 0) */
void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels8_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

    for (j = 0; j < 2; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2,
        pixelsavg;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside)
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        }
        else
        {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels8_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

/* next one assumes that ((line_size % 16) == 0) */
void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    POWERPC_PERF_DECLARE(altivec_put_no_rnd_pixels16_xy2_num, 1);
#ifdef ALTIVEC_USE_REFERENCE_C_CODE
    int j;

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    for (j = 0; j < 4; j++) {
        int i;
        const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
        const uint32_t b =
            (((const struct unaligned_32 *) (pixels + 1))->l);
        uint32_t l0 =
            (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
        uint32_t h0 =
            ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
        uint32_t l1, h1;

        pixels += line_size;
        for (i = 0; i < h; i += 2) {
            uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
            uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
            h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
            a = (((const struct unaligned_32 *) (pixels))->l);
            b = (((const struct unaligned_32 *) (pixels + 1))->l);
            l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x01010101UL;
            h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
            *((uint32_t *) block) =
                h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL);
            pixels += line_size;
            block += line_size;
        }
        pixels += 4 - line_size * (h + 1);
        block += 4 - line_size * h;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

#else /* ALTIVEC_USE_REFERENCE_C_CODE */
    register int i;
    register vector unsigned char
        pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char
        blockv, temp1, temp2;
    register vector unsigned short
        pixelssum1, pixelssum2, temp3,
        pixelssum3, pixelssum4, temp4;
    register const_vector unsigned char vczero = (const_vector unsigned char)vec_splat_u8(0);
    register const_vector unsigned short vcone = (const_vector unsigned short)vec_splat_u16(1);
    register const_vector unsigned short vctwo = (const_vector unsigned short)vec_splat_u16(2);

    POWERPC_PERF_START_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F)
    {
        pixelsv2 = temp2;
    }
    else
    {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
        {
            pixelsv2 = temp2;
        }
        else
        {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }

    POWERPC_PERF_STOP_COUNT(altivec_put_no_rnd_pixels16_xy2_num, 1);
#endif /* ALTIVEC_USE_REFERENCE_C_CODE */
}

int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    POWERPC_PERF_DECLARE(altivec_hadamard8_diff8x8_num, 1);
    int sum;
    register const_vector unsigned char vzero =
        (const_vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    POWERPC_PERF_START_COUNT(altivec_hadamard8_diff8x8_num, 1);
    {
    register const_vector signed short vprod1 =(const_vector signed short)
        AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 =(const_vector signed short)
        AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 =(const_vector signed short)
        AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 = (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 = (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 = (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res) \
    { \
    register vector unsigned char src1, src2, srcO; \
    register vector unsigned char dst1, dst2, dstO; \
    register vector signed short srcV, dstV; \
    register vector signed short but0, but1, but2, op1, op2, op3; \
    src1 = vec_ld(stride * i, src); \
    if ((((stride * i) + (unsigned long)src) & 0x0000000F) > 8) \
        src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    if ((((stride * i) + (unsigned long)dst) & 0x0000000F) > 8) \
        dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    /* we're in the 8x8 function, we only care for the first 8 */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op3 = vec_perm(but2, but2, perm3); \
    res = vec_mladd(but2, vprod3, op3); \
    }
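
    /* Note: each ONEITERBUTTERFLY invocation loads one row of src and dst,
       forms the 16-bit differences, and applies the three horizontal butterfly
       stages of an 8-point Walsh-Hadamard transform: the vprodN constants
       provide the +/- signs and the permN constants pair elements at distances
       1, 2 and 4, so one vec_mladd per stage yields both the sums and the
       differences. The vertical stages and the final sum of absolute values
       are performed after the macro calls. */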
    ONEITERBUTTERFLY(0, temp0);
    ONEITERBUTTERFLY(1, temp1);
    ONEITERBUTTERFLY(2, temp2);
    ONEITERBUTTERFLY(3, temp3);
    ONEITERBUTTERFLY(4, temp4);
    ONEITERBUTTERFLY(5, temp5);
    ONEITERBUTTERFLY(6, temp6);
    ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
    register vector signed int vsum;
    register vector signed short line0 = vec_add(temp0, temp1);
    register vector signed short line1 = vec_sub(temp0, temp1);
    register vector signed short line2 = vec_add(temp2, temp3);
    register vector signed short line3 = vec_sub(temp2, temp3);
    register vector signed short line4 = vec_add(temp4, temp5);
    register vector signed short line5 = vec_sub(temp4, temp5);
    register vector signed short line6 = vec_add(temp6, temp7);
    register vector signed short line7 = vec_sub(temp6, temp7);

    register vector signed short line0B = vec_add(line0, line2);
    register vector signed short line2B = vec_sub(line0, line2);
    register vector signed short line1B = vec_add(line1, line3);
    register vector signed short line3B = vec_sub(line1, line3);
    register vector signed short line4B = vec_add(line4, line6);
    register vector signed short line6B = vec_sub(line4, line6);
    register vector signed short line5B = vec_add(line5, line7);
    register vector signed short line7B = vec_sub(line5, line7);

    register vector signed short line0C = vec_add(line0B, line4B);
    register vector signed short line4C = vec_sub(line0B, line4B);
    register vector signed short line1C = vec_add(line1B, line5B);
    register vector signed short line5C = vec_sub(line1B, line5B);
    register vector signed short line2C = vec_add(line2B, line6B);
    register vector signed short line6C = vec_sub(line2B, line6B);
    register vector signed short line3C = vec_add(line3B, line7B);
    register vector signed short line7C = vec_sub(line3B, line7B);

    vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
    vsum = vec_sum4s(vec_abs(line1C), vsum);
    vsum = vec_sum4s(vec_abs(line2C), vsum);
    vsum = vec_sum4s(vec_abs(line3C), vsum);
    vsum = vec_sum4s(vec_abs(line4C), vsum);
    vsum = vec_sum4s(vec_abs(line5C), vsum);
    vsum = vec_sum4s(vec_abs(line6C), vsum);
    vsum = vec_sum4s(vec_abs(line7C), vsum);
    vsum = vec_sums(vsum, (vector signed int)vzero);
    vsum = vec_splat(vsum, 3);
    vec_ste(vsum, 0, &sum);
    }
    POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff8x8_num, 1);
    return sum;
}

/*
  16x8 works with 16 elements; it allows the code to avoid replicating
  loads, and gives the compiler more room for scheduling.
  It's only used from inside hadamard8_diff16_altivec.
  Unfortunately, gcc-3.3 seems to be a bit dumb: the compiled code has a
  LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
  registers by itself. The following code includes hand-made register
  allocation. It's not clean, but on a 7450 the resulting code is much
  faster (best case falls from 700+ cycles to 550).
  xlc doesn't add spill code, but it doesn't know how to schedule for the
  7450, and its code isn't much faster than gcc-3.3's on the 7450 (though
  it uses 25% fewer instructions...).
  On the 970, the hand-made RA is still a win (around 690 cycles vs.
  around 780), but xlc gets down to around 660 on the regular C code...
*/
static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h) {
    int sum;
    register vector signed short
        temp0 REG_v(v0),
        temp1 REG_v(v1),
        temp2 REG_v(v2),
        temp3 REG_v(v3),
        temp4 REG_v(v4),
        temp5 REG_v(v5),
        temp6 REG_v(v6),
        temp7 REG_v(v7);
    register vector signed short
        temp0S REG_v(v8),
        temp1S REG_v(v9),
        temp2S REG_v(v10),
        temp3S REG_v(v11),
        temp4S REG_v(v12),
        temp5S REG_v(v13),
        temp6S REG_v(v14),
        temp7S REG_v(v15);
    register const_vector unsigned char vzero REG_v(v31)=
        (const_vector unsigned char)vec_splat_u8(0);
    {
    register const_vector signed short vprod1 REG_v(v16)=
        (const_vector signed short)AVV( 1,-1, 1,-1, 1,-1, 1,-1);
    register const_vector signed short vprod2 REG_v(v17)=
        (const_vector signed short)AVV( 1, 1,-1,-1, 1, 1,-1,-1);
    register const_vector signed short vprod3 REG_v(v18)=
        (const_vector signed short)AVV( 1, 1, 1, 1,-1,-1,-1,-1);
    register const_vector unsigned char perm1 REG_v(v19)=
        (const_vector unsigned char)
        AVV(0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
            0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D);
    register const_vector unsigned char perm2 REG_v(v20)=
        (const_vector unsigned char)
        AVV(0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
            0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B);
    register const_vector unsigned char perm3 REG_v(v21)=
        (const_vector unsigned char)
        AVV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
            0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07);

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
    register vector unsigned char src1 REG_v(v22), \
                                  src2 REG_v(v23), \
                                  dst1 REG_v(v24), \
                                  dst2 REG_v(v25), \
                                  srcO REG_v(v22), \
                                  dstO REG_v(v23); \
    \
    register vector signed short srcV REG_v(v24), \
                                 dstV REG_v(v25), \
                                 srcW REG_v(v26), \
                                 dstW REG_v(v27), \
                                 but0 REG_v(v28), \
                                 but0S REG_v(v29), \
                                 op1 REG_v(v30), \
                                 but1 REG_v(v22), \
                                 op1S REG_v(v23), \
                                 but1S REG_v(v24), \
                                 op2 REG_v(v25), \
                                 but2 REG_v(v26), \
                                 op2S REG_v(v27), \
                                 but2S REG_v(v28), \
                                 op3 REG_v(v29), \
                                 op3S REG_v(v30); \
    \
    src1 = vec_ld(stride * i, src); \
    src2 = vec_ld((stride * i) + 16, src); \
    srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
    dst1 = vec_ld(stride * i, dst); \
    dst2 = vec_ld((stride * i) + 16, dst); \
    dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
    /* promote the unsigned chars to signed shorts */ \
    srcV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)srcO); \
    dstV = \
        (vector signed short)vec_mergeh((vector signed char)vzero, \
        (vector signed char)dstO); \
    srcW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
        (vector signed char)srcO); \
    dstW = \
        (vector signed short)vec_mergel((vector signed char)vzero, \
        (vector signed char)dstO); \
    /* subtractions inside the first butterfly */ \
    but0 = vec_sub(srcV, dstV); \
    but0S = vec_sub(srcW, dstW); \
    op1 = vec_perm(but0, but0, perm1); \
    but1 = vec_mladd(but0, vprod1, op1); \
    op1S = vec_perm(but0S, but0S, perm1); \
    but1S = vec_mladd(but0S, vprod1, op1S); \
    op2 = vec_perm(but1, but1, perm2); \
    but2 = vec_mladd(but1, vprod2, op2); \
    op2S = vec_perm(but1S, but1S, perm2); \
    but2S = vec_mladd(but1S, vprod2, op2S); \
    op3 = vec_perm(but2, but2, perm3); \
    res1 = vec_mladd(but2, vprod3, op3); \
    op3S = vec_perm(but2S, but2S, perm3); \
    res2 = vec_mladd(but2S, vprod3, op3S); \
    }
  1339. ONEITERBUTTERFLY(0, temp0, temp0S);
  1340. ONEITERBUTTERFLY(1, temp1, temp1S);
  1341. ONEITERBUTTERFLY(2, temp2, temp2S);
  1342. ONEITERBUTTERFLY(3, temp3, temp3S);
  1343. ONEITERBUTTERFLY(4, temp4, temp4S);
  1344. ONEITERBUTTERFLY(5, temp5, temp5S);
  1345. ONEITERBUTTERFLY(6, temp6, temp6S);
  1346. ONEITERBUTTERFLY(7, temp7, temp7S);
  1347. }
  1348. #undef ONEITERBUTTERFLY
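/*
 * The horizontal pass above left one transformed row in each of
 * temp0..temp7 (left half) and temp0S..temp7S (right half). The block
 * below applies the same three butterfly stages vertically across the
 * rows (no suffix, B and C mark the three stages), then accumulates the
 * absolute values of all coefficients with vec_sum4s()/vec_sums() and
 * stores the scalar total in sum.
 */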
  1349. {
  1350. register vector signed int vsum;
1351. register vector signed short line0S, line1S, line2S, line3S, line4S,
1352. line5S, line6S, line7S, line0BS, line2BS,
1353. line1BS, line3BS, line4BS, line6BS, line5BS,
1354. line7BS, line0CS, line4CS, line1CS, line5CS,
1355. line2CS, line6CS, line3CS, line7CS;
  1356. register vector signed short line0 = vec_add(temp0, temp1);
  1357. register vector signed short line1 = vec_sub(temp0, temp1);
  1358. register vector signed short line2 = vec_add(temp2, temp3);
  1359. register vector signed short line3 = vec_sub(temp2, temp3);
  1360. register vector signed short line4 = vec_add(temp4, temp5);
  1361. register vector signed short line5 = vec_sub(temp4, temp5);
  1362. register vector signed short line6 = vec_add(temp6, temp7);
  1363. register vector signed short line7 = vec_sub(temp6, temp7);
  1364. register vector signed short line0B = vec_add(line0, line2);
  1365. register vector signed short line2B = vec_sub(line0, line2);
  1366. register vector signed short line1B = vec_add(line1, line3);
  1367. register vector signed short line3B = vec_sub(line1, line3);
  1368. register vector signed short line4B = vec_add(line4, line6);
  1369. register vector signed short line6B = vec_sub(line4, line6);
  1370. register vector signed short line5B = vec_add(line5, line7);
  1371. register vector signed short line7B = vec_sub(line5, line7);
  1372. register vector signed short line0C = vec_add(line0B, line4B);
  1373. register vector signed short line4C = vec_sub(line0B, line4B);
  1374. register vector signed short line1C = vec_add(line1B, line5B);
  1375. register vector signed short line5C = vec_sub(line1B, line5B);
  1376. register vector signed short line2C = vec_add(line2B, line6B);
  1377. register vector signed short line6C = vec_sub(line2B, line6B);
  1378. register vector signed short line3C = vec_add(line3B, line7B);
  1379. register vector signed short line7C = vec_sub(line3B, line7B);
  1380. vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
  1381. vsum = vec_sum4s(vec_abs(line1C), vsum);
  1382. vsum = vec_sum4s(vec_abs(line2C), vsum);
  1383. vsum = vec_sum4s(vec_abs(line3C), vsum);
  1384. vsum = vec_sum4s(vec_abs(line4C), vsum);
  1385. vsum = vec_sum4s(vec_abs(line5C), vsum);
  1386. vsum = vec_sum4s(vec_abs(line6C), vsum);
  1387. vsum = vec_sum4s(vec_abs(line7C), vsum);
  1388. line0S = vec_add(temp0S, temp1S);
  1389. line1S = vec_sub(temp0S, temp1S);
  1390. line2S = vec_add(temp2S, temp3S);
  1391. line3S = vec_sub(temp2S, temp3S);
  1392. line4S = vec_add(temp4S, temp5S);
  1393. line5S = vec_sub(temp4S, temp5S);
  1394. line6S = vec_add(temp6S, temp7S);
  1395. line7S = vec_sub(temp6S, temp7S);
  1396. line0BS = vec_add(line0S, line2S);
  1397. line2BS = vec_sub(line0S, line2S);
  1398. line1BS = vec_add(line1S, line3S);
  1399. line3BS = vec_sub(line1S, line3S);
  1400. line4BS = vec_add(line4S, line6S);
  1401. line6BS = vec_sub(line4S, line6S);
  1402. line5BS = vec_add(line5S, line7S);
  1403. line7BS = vec_sub(line5S, line7S);
  1404. line0CS = vec_add(line0BS, line4BS);
  1405. line4CS = vec_sub(line0BS, line4BS);
  1406. line1CS = vec_add(line1BS, line5BS);
  1407. line5CS = vec_sub(line1BS, line5BS);
  1408. line2CS = vec_add(line2BS, line6BS);
  1409. line6CS = vec_sub(line2BS, line6BS);
  1410. line3CS = vec_add(line3BS, line7BS);
  1411. line7CS = vec_sub(line3BS, line7BS);
  1412. vsum = vec_sum4s(vec_abs(line0CS), vsum);
  1413. vsum = vec_sum4s(vec_abs(line1CS), vsum);
  1414. vsum = vec_sum4s(vec_abs(line2CS), vsum);
  1415. vsum = vec_sum4s(vec_abs(line3CS), vsum);
  1416. vsum = vec_sum4s(vec_abs(line4CS), vsum);
  1417. vsum = vec_sum4s(vec_abs(line5CS), vsum);
  1418. vsum = vec_sum4s(vec_abs(line6CS), vsum);
  1419. vsum = vec_sum4s(vec_abs(line7CS), vsum);
  1420. vsum = vec_sums(vsum, (vector signed int)vzero);
  1421. vsum = vec_splat(vsum, 3);
  1422. vec_ste(vsum, 0, &sum);
  1423. }
  1424. return sum;
  1425. }
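/*
 * Exported entry point: computes the Hadamard difference of a
 * 16-pixel-wide block by handing both 8-pixel halves of 8 rows to the
 * 16x8 kernel above, and runs a second 8-row pass when h == 16.
 */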
  1426. int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
  1427. POWERPC_PERF_DECLARE(altivec_hadamard8_diff16_num, 1);
  1428. int score;
  1429. POWERPC_PERF_START_COUNT(altivec_hadamard8_diff16_num, 1);
  1430. score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
  1431. if (h==16) {
  1432. dst += 8*stride;
  1433. src += 8*stride;
  1434. score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
  1435. }
  1436. POWERPC_PERF_STOP_COUNT(altivec_hadamard8_diff16_num, 1);
  1437. return score;
  1438. }
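/*
 * Runtime AltiVec detection, with one strategy per platform chosen at
 * compile time: AmigaOS4 asks exec.library via GetCPUInfoTags(), Darwin
 * queries the CTL_HW/HW_VECTORUNIT sysctl, and everything else executes
 * an AltiVec instruction under a SIGILL handler (the libmpeg2-derived
 * probe below).
 */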
  1439. int has_altivec(void)
  1440. {
  1441. #ifdef __AMIGAOS4__
  1442. ULONG result = 0;
  1443. extern struct ExecIFace *IExec;
  1444. IExec->GetCPUInfoTags(GCIT_VectorUnit, &result, TAG_DONE);
  1445. if (result == VECTORTYPE_ALTIVEC) return 1;
  1446. return 0;
  1447. #else /* __AMIGAOS4__ */
  1448. #ifdef CONFIG_DARWIN
  1449. int sels[2] = {CTL_HW, HW_VECTORUNIT};
  1450. int has_vu = 0;
  1451. size_t len = sizeof(has_vu);
  1452. int err;
  1453. err = sysctl(sels, 2, &has_vu, &len, NULL, 0);
  1454. if (err == 0) return (has_vu != 0);
  1455. #else /* CONFIG_DARWIN */
1456. /* not Darwin: detect AltiVec the brute-force way */
  1457. /* this is borrowed from the libmpeg2 library */
  1458. {
  1459. signal (SIGILL, sigill_handler);
  1460. if (sigsetjmp (jmpbuf, 1)) {
  1461. signal (SIGILL, SIG_DFL);
  1462. } else {
  1463. canjump = 1;
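/* mtspr 256 writes VRSAVE and "vand v0,v0,v0" is a harmless AltiVec
   instruction; on a CPU without AltiVec this sequence is expected to
   raise SIGILL, which the handler above turns into a longjmp so that
   the function falls through and returns 0. */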
  1464. asm volatile ("mtspr 256, %0\n\t"
  1465. "vand %%v0, %%v0, %%v0"
  1466. :
  1467. : "r" (-1));
  1468. signal (SIGILL, SIG_DFL);
  1469. return 1;
  1470. }
  1471. }
  1472. #endif /* CONFIG_DARWIN */
  1473. return 0;
  1474. #endif /* __AMIGAOS4__ */
  1475. }
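/*
 * Vectorized magnitude/angle decoupling for the Vorbis decoder: four
 * float pairs per iteration, using sign-bit arithmetic instead of
 * branches. For reference, the scalar logic this vectorizes is roughly
 * the following sketch:
 *
 *     for (i = 0; i < blocksize; i++) {
 *         if (mag[i] > 0.0) {
 *             if (ang[i] > 0.0) { ang[i] = mag[i] - ang[i]; }
 *             else { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
 *         } else {
 *             if (ang[i] > 0.0) { ang[i] += mag[i]; }
 *             else { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
 *         }
 *     }
 */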
  1476. static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
  1477. int blocksize)
  1478. {
  1479. int i;
  1480. vector float m, a;
  1481. vector bool int t0, t1;
1482. const vector unsigned int v_31 = //XXX: splat the constant 31; vec_splat_u32() only accepts 5-bit immediates, hence 15+15+1
  1483. vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
  1484. for(i=0; i<blocksize; i+=4) {
  1485. m = vec_ld(0, mag+i);
  1486. a = vec_ld(0, ang+i);
  1487. t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
  1488. t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
  1489. a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
  1490. t0 = (vector bool int)vec_and(a, t1);
  1491. t1 = (vector bool int)vec_andc(a, t1);
  1492. a = vec_sub(m, (vector float)t1);
  1493. m = vec_add(m, (vector float)t0);
  1494. vec_stl(a, 0, ang+i);
  1495. vec_stl(m, 0, mag+i);
  1496. }
  1497. }
  1498. /* next one assumes that ((line_size % 8) == 0) */
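/* avg_pixels8_xy2: half-pel interpolation in both x and y (rounded average
   of each 2x2 neighbourhood) over an 8-pixel-wide block, averaged into the
   existing contents of block. */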
  1499. void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
  1500. {
  1501. POWERPC_PERF_DECLARE(altivec_avg_pixels8_xy2_num, 1);
  1502. #ifdef ALTIVEC_USE_REFERENCE_C_CODE
  1503. int j;
  1504. POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
  1505. for (j = 0; j < 2; j++) {
  1506. int i;
  1507. const uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  1508. const uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
  1509. uint32_t l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
  1510. uint32_t h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  1511. uint32_t l1, h1;
  1512. pixels += line_size;
  1513. for (i = 0; i < h; i += 2) {
  1514. uint32_t a = (((const struct unaligned_32 *) (pixels))->l);
  1515. uint32_t b = (((const struct unaligned_32 *) (pixels + 1))->l);
  1516. l1 = (a & 0x03030303UL) + (b & 0x03030303UL);
  1517. h1 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  1518. *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
  1519. pixels += line_size;
  1520. block += line_size;
  1521. a = (((const struct unaligned_32 *) (pixels))->l);
  1522. b = (((const struct unaligned_32 *) (pixels + 1))->l);
  1523. l0 = (a & 0x03030303UL) + (b & 0x03030303UL) + 0x02020202UL;
  1524. h0 = ((a & 0xFCFCFCFCUL) >> 2) + ((b & 0xFCFCFCFCUL) >> 2);
  1525. *((uint32_t *) block) = rnd_avg32(*((uint32_t *) block), h0 + h1 + (((l0 + l1) >> 2) & 0x0F0F0F0FUL));
  1526. pixels += line_size;
  1527. block += line_size;
1528. }
pixels += 4 - line_size * (h + 1);
  1529. block += 4 - line_size * h;
  1530. }
  1531. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
  1532. #else /* ALTIVEC_USE_REFERENCE_C_CODE */
  1533. register int i;
  1534. register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
  1535. register vector unsigned char blockv, temp1, temp2, blocktemp;
  1536. register vector unsigned short pixelssum1, pixelssum2, temp3;
  1537. register const_vector unsigned char vczero = (const_vector unsigned char)
  1538. vec_splat_u8(0);
  1539. register const_vector unsigned short vctwo = (const_vector unsigned short)
  1540. vec_splat_u16(2);
  1541. temp1 = vec_ld(0, pixels);
  1542. temp2 = vec_ld(16, pixels);
  1543. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
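/* When pixels sits at offset 15 of a 16-byte block, pixels+1 is aligned and
   vec_lvsl() returns a zero shift, so vec_perm() would select temp1; the
   bytes we actually want are then exactly temp2, so use it directly. The
   per-row check inside the loop below applies the same reasoning. */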
  1544. if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
  1545. pixelsv2 = temp2;
  1546. } else {
  1547. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
  1548. }
  1549. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  1550. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  1551. pixelssum1 = vec_add((vector unsigned short)pixelsv1,
  1552. (vector unsigned short)pixelsv2);
  1553. pixelssum1 = vec_add(pixelssum1, vctwo);
  1554. POWERPC_PERF_START_COUNT(altivec_avg_pixels8_xy2_num, 1);
  1555. for (i = 0; i < h ; i++) {
  1556. int rightside = ((unsigned long)block & 0x0000000F);
  1557. blockv = vec_ld(0, block);
  1558. temp1 = vec_ld(line_size, pixels);
  1559. temp2 = vec_ld(line_size + 16, pixels);
  1560. pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
  1561. if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F)
  1562. {
  1563. pixelsv2 = temp2;
  1564. } else {
  1565. pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
  1566. }
  1567. pixelsv1 = vec_mergeh(vczero, pixelsv1);
  1568. pixelsv2 = vec_mergeh(vczero, pixelsv2);
  1569. pixelssum2 = vec_add((vector unsigned short)pixelsv1,
  1570. (vector unsigned short)pixelsv2);
  1571. temp3 = vec_add(pixelssum1, pixelssum2);
  1572. temp3 = vec_sra(temp3, vctwo);
  1573. pixelssum1 = vec_add(pixelssum2, vctwo);
  1574. pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
  1575. if (rightside) {
  1576. blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
  1577. } else {
  1578. blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
  1579. }
  1580. blockv = vec_avg(blocktemp, blockv);
  1581. vec_st(blockv, 0, block);
  1582. block += line_size;
  1583. pixels += line_size;
  1584. }
  1585. POWERPC_PERF_STOP_COUNT(altivec_avg_pixels8_xy2_num, 1);
  1586. #endif /* ALTIVEC_USE_REFERENCE_C_CODE */
  1587. }
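/* Install the AltiVec implementations into the DSPContext function-pointer
   table. The PowerPC init code is expected to call this only after
   has_altivec() has reported AltiVec support. */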
  1588. void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
  1589. {
  1590. c->pix_abs[0][1] = sad16_x2_altivec;
  1591. c->pix_abs[0][2] = sad16_y2_altivec;
  1592. c->pix_abs[0][3] = sad16_xy2_altivec;
  1593. c->pix_abs[0][0] = sad16_altivec;
  1594. c->pix_abs[1][0] = sad8_altivec;
  1595. c->sad[0]= sad16_altivec;
  1596. c->sad[1]= sad8_altivec;
  1597. c->pix_norm1 = pix_norm1_altivec;
  1598. c->sse[1]= sse8_altivec;
  1599. c->sse[0]= sse16_altivec;
  1600. c->pix_sum = pix_sum_altivec;
  1601. c->diff_pixels = diff_pixels_altivec;
  1602. c->get_pixels = get_pixels_altivec;
  1603. c->add_bytes= add_bytes_altivec;
  1604. c->put_pixels_tab[0][0] = put_pixels16_altivec;
  1605. /* the two functions do the same thing, so use the same code */
  1606. c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
  1607. c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
  1608. c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
  1609. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
  1610. c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
  1611. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
  1612. c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
  1613. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
  1614. c->hadamard8_diff[0] = hadamard8_diff16_altivec;
  1615. c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
  1616. #ifdef CONFIG_VORBIS_DECODER
  1617. c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
  1618. #endif
  1619. }