/*
 * Altivec optimized snow DSP utils
 * Copyright (c) 2006 Luca Barbato <lu_zero@gentoo.org>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "dsputil.h"
#include "gcc_fixes.h"
#include "dsputil_altivec.h"
#include "snow.h"

#undef NDEBUG
#include <assert.h>
//FIXME remove this replication
#define slice_buffer_get_line(slice_buf, line_num) \
    ((slice_buf)->line[line_num] ? (slice_buf)->line[line_num] \
                                 : slice_buffer_load_line((slice_buf), (line_num)))

static DWTELEM * slice_buffer_load_line(slice_buffer * buf, int line)
{
    int offset;
    DWTELEM * buffer;

//  av_log(NULL, AV_LOG_DEBUG, "Cache hit: %d\n", line);

    assert(buf->data_stack_top >= 0);
//  assert(!buf->line[line]);
    if (buf->line[line])
        return buf->line[line];

    offset = buf->line_width * line;
    buffer = buf->data_stack[buf->data_stack_top];
    buf->data_stack_top--;
    buf->line[line] = buffer;

//  av_log(NULL, AV_LOG_DEBUG, "slice_buffer_load_line: line: %d remaining: %d\n", line, buf->data_stack_top + 1);

    return buffer;
}
//altivec code
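/* Horizontal part of the inverse 9/7-like integer lifting wavelet used by
 * Snow: four lifting passes over the low/high halves of one line, processing
 * four 32-bit DWTELEMs per vector (unaligned loads via vec_lvsl()/vec_perm()),
 * followed by interleaving the two halves back into b[].  The #if 0 blocks
 * keep the scalar reference for each vectorized lifting step. */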
void ff_snow_horizontal_compose97i_altivec(DWTELEM *b, int width)
{
    const int w2= (width+1)>>1;
    DECLARE_ALIGNED_16(DWTELEM, temp[(width>>1)]);
    const int w_l= (width>>1);
    const int w_r= w2 - 1;
    int i;
    vector signed int t1, t2, x, y, tmp1, tmp2;
    vector signed int *vbuf, *vtmp;
    vector unsigned char align;
    { // Lift 0
        DWTELEM * const ref = b + w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf = (vector signed int *)b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;

        for (i=0; i<w_l-15; i+=16) {
#if 0
            b[i+0] = b[i+0] - ((3 * (ref[i+0] + ref[i+1]) + 4) >> 3);
            b[i+1] = b[i+1] - ((3 * (ref[i+1] + ref[i+2]) + 4) >> 3);
            b[i+2] = b[i+2] - ((3 * (ref[i+2] + ref[i+3]) + 4) >> 3);
            b[i+3] = b[i+3] - ((3 * (ref[i+3] + ref[i+4]) + 4) >> 3);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            tmp1 = vec_ld (0, ref+8+i);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            tmp2 = vec_ld (15, ref+8+i);
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            tmp1 = vec_ld (0, ref+12+i);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            tmp2 = vec_ld (15, ref+12+i);
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            tmp1 = vec_ld (0, ref+16+i);
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            tmp2 = vec_ld (15, ref+16+i);
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_add(vec_add(y,y),y);
            vbuf++;
            y = vec_add(y, vec_splat_s32(4));
            y = vec_sra(y, vec_splat_u32(3));
            *vbuf = vec_sub(*vbuf, y);
            t1=t2;
            vbuf++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
        b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
    }
    { // Lift 1
        DWTELEM * const dst = b+w2;

        i = 0;
        for(; (((long)&dst[i]) & 0xF) && i<w_r; i++){
            dst[i] = dst[i] - (b[i] + b[i + 1]);
        }

        align = vec_lvsl(0, b+i);
        tmp1 = vec_ld(0, b+i);
        vbuf = (vector signed int*) (dst + i);
        tmp2 = vec_ld(15, b+i);
        t1 = vec_perm(tmp1, tmp2, align);

        for (; i<w_r-3; i+=4) {
#if 0
            dst[i]   = dst[i]   - (b[i]   + b[i + 1]);
            dst[i+1] = dst[i+1] - (b[i+1] + b[i + 2]);
            dst[i+2] = dst[i+2] - (b[i+2] + b[i + 3]);
            dst[i+3] = dst[i+3] - (b[i+3] + b[i + 4]);
#else
            tmp1 = vec_ld(0, b+4+i);
            tmp2 = vec_ld(15, b+4+i);
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1, vec_sld(t1,t2,4));
            *vbuf = vec_sub (*vbuf, y);
            vbuf++;
            t1 = t2;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
    }
    { // Lift 2
        DWTELEM * const ref = b+w2 - 1;
        DWTELEM b_0 = b[0];
        vbuf= (vector signed int *) b;

        tmp1 = vec_ld (0, ref);
        align = vec_lvsl (0, ref);
        tmp2 = vec_ld (15, ref);
        t1= vec_perm(tmp1, tmp2, align);

        i = 0;
        for (; i<w_l-15; i+=16) {
#if 0
            b[i]   = b[i]   - (((8 -(ref[i]   + ref[i+1])) - (b[i]  <<2)) >> 4);
            b[i+1] = b[i+1] - (((8 -(ref[i+1] + ref[i+2])) - (b[i+1]<<2)) >> 4);
            b[i+2] = b[i+2] - (((8 -(ref[i+2] + ref[i+3])) - (b[i+2]<<2)) >> 4);
            b[i+3] = b[i+3] - (((8 -(ref[i+3] + ref[i+4])) - (b[i+3]<<2)) >> 4);
#else
            tmp1 = vec_ld (0, ref+4+i);
            tmp2 = vec_ld (15, ref+4+i);
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            tmp1 = vec_ld (0, ref+8+i);
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            tmp2 = vec_ld (15, ref+8+i);
            *vbuf = vec_sub( *vbuf, y);
            t1 = t2;
            vbuf++;
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            tmp1 = vec_ld (0, ref+12+i);
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            tmp2 = vec_ld (15, ref+12+i);
            *vbuf = vec_sub( *vbuf, y);
            t1 = t2;
            vbuf++;
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            tmp1 = vec_ld (0, ref+16+i);
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            tmp2 = vec_ld (15, ref+16+i);
            *vbuf = vec_sub( *vbuf, y);
            t1 = t2;
            vbuf++;
            t2 = vec_perm(tmp1, tmp2, align);
            y = vec_add(t1,vec_sld(t1,t2,4));
            y = vec_sub(vec_splat_s32(8),y);
            t1 = t2;
            x = vec_sl(*vbuf,vec_splat_u32(2));
            y = vec_sra(vec_sub(y,x),vec_splat_u32(4));
            *vbuf = vec_sub( *vbuf, y);
            vbuf++;
#endif
        }

        snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
        b[0] = b_0 - (((-2 * ref[1] + W_BO) - 4 * b_0) >> W_BS);
    }
    { // Lift 3
        DWTELEM * const src = b+w2;

        vbuf = (vector signed int *)b;
        vtmp = (vector signed int *)temp;

        i = 0;
        align = vec_lvsl(0, src);

        for (; i<w_r-3; i+=4) {
#if 0
            temp[i]   = src[i]   - ((-3*(b[i]   + b[i+1]))>>1);
            temp[i+1] = src[i+1] - ((-3*(b[i+1] + b[i+2]))>>1);
            temp[i+2] = src[i+2] - ((-3*(b[i+2] + b[i+3]))>>1);
            temp[i+3] = src[i+3] - ((-3*(b[i+3] + b[i+4]))>>1);
#else
            tmp1 = vec_ld(0,src+i);
            t1 = vec_add(vbuf[0],vec_sld(vbuf[0],vbuf[1],4));
            tmp2 = vec_ld(15,src+i);
            t1 = vec_sub(vec_splat_s32(0),t1); //bad!
            t1 = vec_add(t1,vec_add(t1,t1));
            t2 = vec_perm(tmp1 ,tmp2 ,align);
            t1 = vec_sra(t1,vec_splat_u32(1));
            vbuf++;
            *vtmp = vec_sub(t2,t1);
            vtmp++;
#endif
        }

        snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -3, 0, 1);
    }
    {
        //Interleave
        int a;
        vector signed int *t = (vector signed int *)temp,
                          *v = (vector signed int *)b;

        snow_interleave_line_header(&i, width, b, temp);

        for (; (i & 0xE) != 0xE; i-=2){
            b[i+1] = temp[i>>1];
            b[i] = b[i>>1];
        }
        for (i-=14; i>=0; i-=16){
            a=i/4;

            v[a+3]=vec_mergel(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+2]=vec_mergeh(v[(a>>1)+1],t[(a>>1)+1]);
            v[a+1]=vec_mergel(v[a>>1],t[a>>1]);
            v[a]=vec_mergeh(v[a>>1],t[a>>1]);
        }
    }
}
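/* Vertical part of the inverse lifting: applies all four lifting steps to one
 * column position of six consecutive lines (b0..b5), four columns per vector
 * iteration, with a scalar loop handling the remaining width % 4 columns. */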
void ff_snow_vertical_compose97i_altivec(DWTELEM *b0, DWTELEM *b1, DWTELEM *b2,
                                         DWTELEM *b3, DWTELEM *b4, DWTELEM *b5,
                                         int width)
{
    int i, w4 = width/4;
    vector signed int *v0, *v1, *v2, *v3, *v4, *v5;
    vector signed int t1, t2;

    v0=(vector signed int *)b0;
    v1=(vector signed int *)b1;
    v2=(vector signed int *)b2;
    v3=(vector signed int *)b3;
    v4=(vector signed int *)b4;
    v5=(vector signed int *)b5;

    for (i=0; i< w4; i++) {
#if 0
        b4[i] -= (3*(b3[i] + b5[i])+4)>>3;
        b3[i] -= ((b2[i] + b4[i]));
        b2[i] += ((b1[i] + b3[i])+4*b2[i]+8)>>4;
        b1[i] += (3*(b0[i] + b2[i]))>>1;
#else
        t1 = vec_add(v3[i], v5[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        t1 = vec_add(t2, vec_splat_s32(4));
        v4[i] = vec_sub(v4[i], vec_sra(t1,vec_splat_u32(3)));

        v3[i] = vec_sub(v3[i], vec_add(v2[i], v4[i]));

        t1 = vec_add(vec_splat_s32(8), vec_add(v1[i], v3[i]));
        t2 = vec_sl(v2[i], vec_splat_u32(2));
        v2[i] = vec_add(v2[i], vec_sra(vec_add(t1,t2),vec_splat_u32(4)));

        t1 = vec_add(v0[i], v2[i]);
        t2 = vec_add(t1, vec_add(t1,t1));
        v1[i] = vec_add(v1[i], vec_sra(t2,vec_splat_u32(1)));
#endif
    }

    for (i*=4; i < width; i++) {
        b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
        b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
        b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
        b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
    }
}
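/* Helper macros for the OBMC block add:
 * LOAD_BLOCKS / LOAD_OBMCS do unaligned loads (vec_ld + vec_lvsl/vec_perm) of
 * one row of the four source blocks and of the four OBMC weight rows;
 * STEPS_0_1 / STEPS_2_3 interleave weights and samples so that each vec_msum()
 * accumulates the four weighted contributions for four pixels at once;
 * FINAL_STEP_SCALAR / FINAL_STEP_VEC then either combine the result with the
 * slice-buffer line and store the rounded, clipped 8-bit output (add), or
 * subtract it from the slice-buffer line (!add). */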
#define LOAD_BLOCKS \
    tmp1 = vec_ld(0, &block[3][y*src_stride]);\
    align = vec_lvsl(0, &block[3][y*src_stride]);\
    tmp2 = vec_ld(15, &block[3][y*src_stride]);\
\
    b3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[2][y*src_stride]);\
    align = vec_lvsl(0, &block[2][y*src_stride]);\
    tmp2 = vec_ld(15, &block[2][y*src_stride]);\
\
    b2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[1][y*src_stride]);\
    align = vec_lvsl(0, &block[1][y*src_stride]);\
    tmp2 = vec_ld(15, &block[1][y*src_stride]);\
\
    b1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, &block[0][y*src_stride]);\
    align = vec_lvsl(0, &block[0][y*src_stride]);\
    tmp2 = vec_ld(15, &block[0][y*src_stride]);\
\
    b0 = vec_perm(tmp1,tmp2,align);
#define LOAD_OBMCS \
    tmp1 = vec_ld(0, obmc1);\
    align = vec_lvsl(0, obmc1);\
    tmp2 = vec_ld(15, obmc1);\
\
    ob1 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc2);\
    align = vec_lvsl(0, obmc2);\
    tmp2 = vec_ld(15, obmc2);\
\
    ob2 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc3);\
    align = vec_lvsl(0, obmc3);\
    tmp2 = vec_ld(15, obmc3);\
\
    ob3 = vec_perm(tmp1,tmp2,align);\
\
    tmp1 = vec_ld(0, obmc4);\
    align = vec_lvsl(0, obmc4);\
    tmp2 = vec_ld(15, obmc4);\
\
    ob4 = vec_perm(tmp1,tmp2,align);
/* interleave logic
 * h1 <- [ a,b,a,b, a,b,a,b, a,b,a,b, a,b,a,b ]
 * h2 <- [ c,d,c,d, c,d,c,d, c,d,c,d, c,d,c,d ]
 * h  <- [ a,b,c,d, a,b,c,d, a,b,c,d, a,b,c,d ]
 */

#define STEPS_0_1\
    h1 = (vector unsigned short)\
         vec_mergeh(ob1, ob2);\
\
    h2 = (vector unsigned short)\
         vec_mergeh(ob3, ob4);\
\
    ih = (vector unsigned char)\
         vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergeh(b3, b2);\
\
    ih1 = (vector unsigned char) vec_mergel(h1, h2);\
\
    l2 = (vector unsigned short) vec_mergeh(b1, b0);\
\
    il = (vector unsigned char) vec_mergeh(l1, l2);\
\
    v[0] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1, l2);\
\
    v[1] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
#define FINAL_STEP_SCALAR\
    for(x=0; x<b_w; x++)\
        if(add){\
            vbuf[x] += dst[x + src_x];\
            vbuf[x] = (vbuf[x] + (1<<(FRAC_BITS-1))) >> FRAC_BITS;\
            if(vbuf[x]&(~255)) vbuf[x]= ~(vbuf[x]>>31);\
            dst8[x + y*src_stride] = vbuf[x];\
        }else{\
            dst[x + src_x] -= vbuf[x];\
        }
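/* Unaligned variants (used when src_x is not a multiple of 16): the OBMC
 * blend for one row is accumulated into the aligned scratch buffer vbuf[],
 * and FINAL_STEP_SCALAR does the add/round/clip into dst8[] (or the subtract
 * from the slice buffer) in scalar code. */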
static void inner_add_yblock_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                  const int obmc_stride,
                                                  uint8_t * * block, int b_w,
                                                  int b_h, int src_x, int src_y,
                                                  int src_stride, slice_buffer * sb,
                                                  int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

//FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_SCALAR
    }
}
#define STEPS_2_3\
    h1 = (vector unsigned short) vec_mergel(ob1, ob2);\
\
    h2 = (vector unsigned short) vec_mergel(ob3, ob4);\
\
    ih = (vector unsigned char) vec_mergeh(h1,h2);\
\
    l1 = (vector unsigned short) vec_mergel(b3, b2);\
\
    l2 = (vector unsigned short) vec_mergel(b1, b0);\
\
    ih1 = (vector unsigned char) vec_mergel(h1,h2);\
\
    il = (vector unsigned char) vec_mergeh(l1,l2);\
\
    v[2] = (vector signed int) vec_msum(ih, il, vec_splat_u32(0));\
\
    il1 = (vector unsigned char) vec_mergel(l1,l2);\
\
    v[3] = (vector signed int) vec_msum(ih1, il1, vec_splat_u32(0));
static void inner_add_yblock_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                   const int obmc_stride,
                                                   uint8_t * * block, int b_w,
                                                   int b_h, int src_x, int src_y,
                                                   int src_stride, slice_buffer * sb,
                                                   int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1
        STEPS_2_3

        FINAL_STEP_SCALAR
    }
}
#define FINAL_STEP_VEC \
\
    if(add)\
    {\
        for(x=0; x<b_w/4; x++)\
        {\
            v[x] = vec_add(v[x], d[x]);\
            v[x] = vec_sra(vec_add(v[x],\
                                   vec_sl( vec_splat_s32(1),\
                                           vec_splat_u32(7))),\
                           vec_splat_u32(8));\
\
            mask = (vector bool int) vec_sl((vector signed int)\
                   vec_cmpeq(v[x],v[x]),vec_splat_u32(8));\
            mask = (vector bool int) vec_and(v[x],vec_nor(mask,mask));\
\
            mask = (vector bool int)\
                   vec_cmpeq((vector signed int)mask,\
                             (vector signed int)vec_splat_u32(0));\
\
            /* arithmetic shift right by 8+8+15 = 31 bits, chained because  */\
            /* vec_splat_u32() only covers 0..15; together with the vec_nor */\
            /* below this matches the scalar clamp value ~(vbuf[x]>>31)     */\
            vs = vec_sra(v[x],vec_splat_u32(8));\
            vs = vec_sra(vs,  vec_splat_u32(8));\
            vs = vec_sra(vs,  vec_splat_u32(15));\
\
            vs = vec_nor(vs,vs);\
\
            v[x]= vec_sel(v[x],vs,mask);\
        }\
\
        for(x=0; x<b_w; x++)\
            dst8[x + y*src_stride] = vbuf[x];\
\
    }\
    else\
        for(x=0; x<b_w/4; x++)\
            d[x] = vec_sub(d[x], v[x]);
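/* Aligned variants (src_x a multiple of 16): the slice-buffer line can be
 * accessed through the vector pointer d directly, so the add, rounding and
 * 0..255 clip are done entirely with vector operations in FINAL_STEP_VEC
 * instead of the scalar tail. */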
static void inner_add_yblock_a_bw_8_obmc_16_altivec(uint8_t *obmc,
                                                    const int obmc_stride,
                                                    uint8_t * * block, int b_w,
                                                    int b_h, int src_x, int src_y,
                                                    int src_stride, slice_buffer * sb,
                                                    int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[16]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

//FIXME i could avoid some loads!

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1
        STEPS_0_1

        FINAL_STEP_VEC
    }
}
static void inner_add_yblock_a_bw_16_obmc_32_altivec(uint8_t *obmc,
                                                     const int obmc_stride,
                                                     uint8_t * * block, int b_w,
                                                     int b_h, int src_x, int src_y,
                                                     int src_stride, slice_buffer * sb,
                                                     int add, uint8_t * dst8)
{
    int y, x;
    DWTELEM * dst;
    vector bool int mask;
    vector signed int vs;
    vector unsigned short h1, h2, l1, l2;
    vector unsigned char ih, il, ih1, il1, tmp1, tmp2, align;
    vector unsigned char b0,b1,b2,b3;
    vector unsigned char ob1,ob2,ob3,ob4;

    DECLARE_ALIGNED_16(int, vbuf[b_w]);
    vector signed int *v = (vector signed int *)vbuf, *d;

    for(y=0; y<b_h; y++){
        //FIXME ugly misuse of obmc_stride
        uint8_t *obmc1= obmc + y*obmc_stride;
        uint8_t *obmc2= obmc1+ (obmc_stride>>1);
        uint8_t *obmc3= obmc1+ obmc_stride*(obmc_stride>>1);
        uint8_t *obmc4= obmc3+ (obmc_stride>>1);

        dst = slice_buffer_get_line(sb, src_y + y);
        d = (vector signed int *)(dst + src_x);

        // load blocks
        LOAD_BLOCKS

        // load obmcs
        LOAD_OBMCS

        // steps 0 1 2 3
        STEPS_0_1
        STEPS_2_3

        FINAL_STEP_VEC
    }
}
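/* Dispatch on the alignment of src_x and on the block width: 8-wide and
 * 16-wide blocks take the AltiVec paths (scalar or vector final step
 * depending on alignment); any other width falls back to the C
 * implementation ff_snow_inner_add_yblock(). */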
void ff_snow_inner_add_yblock_altivec(uint8_t *obmc, const int obmc_stride,
                                      uint8_t * * block, int b_w, int b_h,
                                      int src_x, int src_y, int src_stride,
                                      slice_buffer * sb, int add,
                                      uint8_t * dst8)
{
    if (src_x&15) {
        if (b_w == 16)
            inner_add_yblock_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                   b_w, b_h, src_x, src_y,
                                                   src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                  b_w, b_h, src_x, src_y,
                                                  src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    } else {
        if (b_w == 16)
            inner_add_yblock_a_bw_16_obmc_32_altivec(obmc, obmc_stride, block,
                                                     b_w, b_h, src_x, src_y,
                                                     src_stride, sb, add, dst8);
        else if (b_w == 8)
            inner_add_yblock_a_bw_8_obmc_16_altivec(obmc, obmc_stride, block,
                                                    b_w, b_h, src_x, src_y,
                                                    src_stride, sb, add, dst8);
        else
            ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,
                                     src_y, src_stride, sb, add, dst8);
    }
}
void snow_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    c->horizontal_compose97i = ff_snow_horizontal_compose97i_altivec;
    c->vertical_compose97i = ff_snow_vertical_compose97i_altivec;
    c->inner_add_yblock = ff_snow_inner_add_yblock_altivec;
}