/*
 * Altivec optimized snow DSP utils
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "../dsputil.h"

#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#include "../snow.h"

#undef NDEBUG
#include <assert.h>

//FIXME remove this replication
#define slice_buffer_get_line(slice_buf, line_num) ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] : slice_buffer_load_line((slice_buf), (line_num)))
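/* Return the buffer for 'line', popping a spare one off the data stack on a
 * miss; this appears to duplicate the slice_buffer code in snow.c (see the
 * FIXME above). */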
static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    int offset;
    DWTELEM * buffer;

//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);

    assert(buf->data_stack_top >= 0);
//  assert(!buf->line[line]);
    if (buf->line[line])
        return buf->line[line];

    offset = buf->line_width * line;
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);

    return buffer;
}
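/* Inverse horizontal 9/7 lifting for one line of 'width' coefficients.
 * Each of the four lifting steps below keeps its scalar reference in an
 * #if 0 branch; the AltiVec versions process four DWTELEMs per vector and
 * hand the tail over to the snow_horizontal_compose_*_lead_out() helpers. */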
//altivec code

void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
{
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed int t1, t2, x, y, tmp1, tmp2;
    vector signed int *vbuf, *vtmp;
    vector unsigned char align;

    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf = (vector signed int *)b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;
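        /* Each iteration handles 16 coefficients as four 4-int vectors:
         * vec_lvsl/vec_perm realign the unaligned ref[] reads, and
         * vec_sld(t1,t2,4) shifts by one element so ref[i] and ref[i+1]
         * line up for the lane-wise add. */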
        for (i=0; i<w_l-15; i+=16) {
#if 0
            b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
            b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
            b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
            b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            tmp1 = vec_ld (0, ref+8+i);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            tmp2 = vec_ld (15, ref+8+i);
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            tmp1 = vec_ld (0, ref+12+i);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            tmp2 = vec_ld (15, ref+12+i);
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            tmp1 = vec_ld (0, ref+16+i);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            tmp2 = vec_ld (15, ref+16+i);
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);
        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_r-3; i+=4) {
#if 0
            dst[i]   = dst[i]   - (b[i]   + b[i + 1]);
            dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
            dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
            dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else
            tmp1 = vec_ld(0, b+4+i);
            tmp2 = vec_ld(15, b+4+i);
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1, vec_sld(t1,t2,4));
            *vbuf = vec_sub (*vbuf, y);
            vbuf++;
            t1 = t2;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf= (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;
        for (; i<w_l-15; i+=16) {
#if 0
            b[i]   = b[i]   - (((8 -(ref[i]   + ref[i+1])) - (b[i]  <<2)) >> 4);
            b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
            b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
            b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            tmp1 = vec_ld (0, ref+8+i);
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            tmp2 = vec_ld (15, ref+8+i);
            *vbuf = vec_sub( *vbuf, y);
            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            tmp1 = vec_ld (0, ref+12+i);
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            tmp2 = vec_ld (15, ref+12+i);
            *vbuf = vec_sub( *vbuf, y);
            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            tmp1 = vec_ld (0, ref+16+i);
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            tmp2 = vec_ld (15, ref+16+i);
            *vbuf = vec_sub( *vbuf, y);
            t1 = t2;
            vbuf++;

            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            t1 = t2;
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);
            vbuf++;
#endif
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        for (; i<w_r-3; i+=4) {
#if 0
            temp[i]   = src[i]   - ((-3*(b[i]   + b[i+1]))>>1);
            temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
            temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
            temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            t1 = vec_sub(vec_splat_s32(0),t1); //bad!
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1 ,tmp2 ,align);
            t1 = vec_sra(t1,vec_splat_u32(1));
            vbuf++;
            *vtmp = vec_sub(t2,t1);
            vtmp++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }
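    /* Interleave: merge the first half of b[] (even positions) with the
     * temp[] line (odd positions) back into b[], sixteen coefficients per
     * iteration via vec_mergeh/vec_mergel; the scalar loop covers the
     * upper tail left by snow_interleave_line_header(). */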
    {
        //Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            a=i/4;
            v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
            v[a]=vec_mergeh(v[a>>1],t[a>>1]);
        }
    }
}
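/* Inverse vertical 9/7 lifting: all four lifting steps are applied across six
 * consecutive lines (b0..b5) at once, four DWTELEMs per AltiVec iteration,
 * with the scalar loop at the end covering the width%4 tail. */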
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2, DWTELEM *b3, DWTELEM *b4, DWTELEM *b5, int width)
{
    int i, w4 = width/4;
    vector signed int *v0, *v1,*v2,*v3,*v4,*v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< w4;i++)
    {
#if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
        b3[i] -= ((b2[i] + b4[i]));
        b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
        b1[i] += (3*(b0[i] + b2[i]))>>1;
#else
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));

        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));
#endif
    }

    for(i*=4; i < width; i++)
    {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}
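/* OBMC inner loop for 8-pixel-wide blocks (16-wide obmc table): per row, the
 * four obmc weights and the four predicted blocks are accumulated with
 * vec_msum; with 'add' set the result is added to the slice-buffer line,
 * rounded, clipped to 0..255 and stored to dst8, otherwise it is subtracted
 * from the line. */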
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                  const int obmc_stride,
                                                  uint8_t * * block, int b_w,
                                                  int b_h, int src_x, int src_y,
                                                  int src_stride, slice_buffer * sb,
                                                  int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
#if 1
        vector unsigned char ob1;
        vector unsigned char ob2;
        vector unsigned char ob3;
        vector unsigned char ob4;
#endif
        DECLARE_ALIGNED_16(int, vbuf[16]);
        vector signed int *v = (vector signed int *)vbuf, *d;

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

#if 0
        for(x=0; x<b_w; x++){
            vbuf[x] = obmc1[x] * block[3][x + y*src_stride]
                     +obmc2[x] * block[2][x + y*src_stride]
                     +obmc3[x] * block[1][x + y*src_stride]
                     +obmc4[x] * block[0][x + y*src_stride];
        }
#else
        // load blocks
        //FIXME i could avoid some loads!
        tmp1 = vec_ld(0, &block[3][y*src_stride]);
        align = vec_lvsl(0, &block[3][y*src_stride]);
        tmp2 = vec_ld(15, &block[3][y*src_stride]);
        b3 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, &block[2][y*src_stride]);
        align = vec_lvsl(0, &block[2][y*src_stride]);
        tmp2 = vec_ld(15, &block[2][y*src_stride]);
        b2 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, &block[1][y*src_stride]);
        align = vec_lvsl(0, &block[1][y*src_stride]);
        tmp2 = vec_ld(15, &block[1][y*src_stride]);
        b1 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, &block[0][y*src_stride]);
        align = vec_lvsl(0, &block[0][y*src_stride]);
        tmp2 = vec_ld(15, &block[0][y*src_stride]);
        b0 = vec_perm(tmp1,tmp2,align);

        // load obmcs
        tmp1 = vec_ld(0, obmc1);
        align = vec_lvsl(0, obmc1);
        tmp2 = vec_ld(15, obmc1);
        ob1 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, obmc2);
        align = vec_lvsl(0, obmc2);
        tmp2 = vec_ld(15, obmc2);
        ob2 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, obmc3);
        align = vec_lvsl(0, obmc3);
        tmp2 = vec_ld(15, obmc3);
        ob3 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, obmc4);
        align = vec_lvsl(0, obmc4);
        tmp2 = vec_ld(15, obmc4);
        ob4 = vec_perm(tmp1,tmp2,align);
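        /* ih holds the four obmc weights (a,b,c,d) for each of four pixels,
         * il the matching block bytes; vec_msum multiplies and sums the four
         * byte products per 32-bit lane, yielding four weighted pixels per
         * step. */
        //step0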
        h1 = vec_mergeh(ob1, ob2); /* h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ] */
        h2 = vec_mergeh(ob3, ob4); /* h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ] */
        ih = vec_mergeh(h1,h2);    /* ih <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ] */
        l1 = vec_mergeh(b3, b2);
        l2 = vec_mergeh(b1, b0);
        il = vec_mergeh(l1,l2);
        v[0] = vec_msum(ih, il, vec_splat_u32(0));
        //step1
        h1 = vec_mergeh(ob1, ob2);
        h2 = vec_mergeh(ob3, ob4);
        ih = vec_mergel(h1,h2);
        l1 = vec_mergeh(b3, b2);
        l2 = vec_mergeh(b1, b0);
        il = vec_mergel(l1,l2);
        v[1] = vec_msum(ih, il, vec_splat_u32(0));
#endif

        if(add)
        {
            for(x=0; x<b_w/4; x++)
            {
                v[x] = vec_add(v[x], d[x]);
                v[x] = vec_sra(vec_add(v[x],
                                       vec_sl( vec_splat_s32(1),
                                               vec_splat_u32(7))),
                               vec_splat_u32(8));
                mask = vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),
                              vec_splat_u32(8));
                mask = vec_and(v[x],vec_nor(mask,mask));
                mask = (vector signed int) vec_cmpeq((vector signed int)mask,
                                                     (vector signed int)vec_splat_u32(0));
                vs = vec_sra(v[x],vec_splat_u32(8));
                vs = vec_sra(v[x],vec_splat_u32(8));
                vs = vec_sra(v[x],vec_splat_u32(15));
                vs = vec_nor(vs,vs);
                v[x]= vec_sel(v[x],vs,mask);
            }
            for(x=0; x<b_w; x++)
                dst8[x + y*src_stride] = vbuf[x];
        }
        else
            for(x=0; x<b_w/4; x++)
                d[x] = vec_sub(d[x], v[x]);
    }
}
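/* Same idea for 16-pixel-wide blocks (32-wide obmc table): four vec_msum
 * steps cover the 16 pixels of a row; rounding/clipping (add) and the
 * subtraction path are currently handled by the scalar #if 1 branch, the
 * vectorized variant below it being disabled. */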
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
                                                   uint8_t * * block, int b_w,
                                                   int b_h, int src_x, int src_y,
                                                   int src_stride, slice_buffer * sb,
                                                   int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);
        vector unsigned char ob1;
        vector unsigned char ob2;
        vector unsigned char ob3;
        vector unsigned char ob4;
        DECLARE_ALIGNED_16(int, vbuf[b_w]);
        vector signed int *v = (vector signed int *)vbuf, *d;

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        tmp1 = vec_ld(0, &block[3][y*src_stride]);
        align = vec_lvsl(0, &block[3][y*src_stride]);
        tmp2 = vec_ld(15, &block[3][y*src_stride]);
        b3 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, &block[2][y*src_stride]);
        align = vec_lvsl(0, &block[2][y*src_stride]);
        tmp2 = vec_ld(15, &block[2][y*src_stride]);
        b2 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, &block[1][y*src_stride]);
        align = vec_lvsl(0, &block[1][y*src_stride]);
        tmp2 = vec_ld(15, &block[1][y*src_stride]);
        b1 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, &block[0][y*src_stride]);
        align = vec_lvsl(0, &block[0][y*src_stride]);
        tmp2 = vec_ld(15, &block[0][y*src_stride]);
        b0 = vec_perm(tmp1,tmp2,align);

        // load obmcs
        tmp1 = vec_ld(0, obmc1);
        align = vec_lvsl(0, obmc1);
        tmp2 = vec_ld(15, obmc1);
        ob1 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, obmc2);
        align = vec_lvsl(0, obmc2);
        tmp2 = vec_ld(15, obmc2);
        ob2 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, obmc3);
        align = vec_lvsl(0, obmc3);
        tmp2 = vec_ld(15, obmc3);
        ob3 = vec_perm(tmp1,tmp2,align);

        tmp1 = vec_ld(0, obmc4);
        align = vec_lvsl(0, obmc4);
        tmp2 = vec_ld(15, obmc4);
        ob4 = vec_perm(tmp1,tmp2,align);

        //step0
        h1 = vec_mergeh(ob1, ob2); /* h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ] */
        h2 = vec_mergeh(ob3, ob4); /* h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ] */
        ih = vec_mergeh(h1,h2);    /* ih <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ] */
        l1 = vec_mergeh(b3, b2);
        l2 = vec_mergeh(b1, b0);
        il = vec_mergeh(l1,l2);
        v[0] = vec_msum(ih, il, vec_splat_u32(0));
        //step1
        h1 = vec_mergeh(ob1, ob2);
        h2 = vec_mergeh(ob3, ob4);
        ih = vec_mergel(h1,h2);
        l1 = vec_mergeh(b3, b2);
        l2 = vec_mergeh(b1, b0);
        il = vec_mergel(l1,l2);
        v[1] = vec_msum(ih, il, vec_splat_u32(0));
        //step2
        h1 = vec_mergel(ob1, ob2);
        h2 = vec_mergel(ob3, ob4);
        ih = vec_mergeh(h1,h2);
        l1 = vec_mergel(b3, b2);
        l2 = vec_mergel(b1, b0);
        il = vec_mergeh(l1,l2);
        v[2] = vec_msum(ih, il, vec_splat_u32(0));
        //step3
        h1 = vec_mergel(ob1, ob2);
        h2 = vec_mergel(ob3, ob4);
        ih = vec_mergel(h1,h2);
        l1 = vec_mergel(b3, b2);
        l2 = vec_mergel(b1, b0);
        il = vec_mergel(l1,l2);
        v[3] = vec_msum(ih, il, vec_splat_u32(0));

#if 1
        for(x=0; x<b_w; x++)
            if(add){
                vbuf[x] += dst[x + src_x];
                vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;
                if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);
                dst8[x + y*src_stride] = vbuf[x];
            }else{
                dst[x + src_x] -= vbuf[x];
            }
#else
        if(add)
        {
            for(x=0; x<b_w/4; x++)
            {
                v[x] = vec_add(v[x], d[x]);
                v[x] = vec_sra(vec_add(v[x],
                                       vec_sl( vec_splat_s32(1),
                                               vec_splat_u32(7))),
                               vec_splat_u32(8));
                mask = vec_sl((vector signed int) vec_cmpeq(v[x],v[x]),vec_splat_u32(8));
                mask = vec_and(v[x],vec_nor(mask,mask));
                mask = (vector signed int) vec_cmpeq((vector signed int)mask,(vector signed int)vec_splat_u32(0));
                vs = vec_sra(v[x],vec_splat_u32(8));
                vs = vec_sra(v[x],vec_splat_u32(8));
                vs = vec_sra(v[x],vec_splat_u32(15));
                vs = vec_nor(vs,vs);
                v[x]= vec_sel(v[x],vs,mask);
            }
            for(x=0; x<b_w; x++)
                dst8[x + y*src_stride] = vbuf[x];
        }
        else
            for(x=0; x<b_w/4; x++)
                d[x] = vec_sub(d[x], v[x]);
#endif
    }
}
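/* Width dispatcher: 16-wide blocks and 8-wide blocks with aligned src_x use
 * the AltiVec paths above; everything else falls back to the C version. */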
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    if (b_w == 16)
        inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block, b_w,
                                               b_h, src_x, src_y, src_stride,
                                               sb, add, dst8);
    else if (b_w == 8 && !(src_x & 15)) // the 8-wide path needs a 16-byte-aligned src_x
        inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                              b_w, b_h, src_x, src_y,
                                              src_stride, sb, add, dst8);
    else
        ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                 src_y, src_stride, sb, add, dst8);
}