You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

466 lines
18KB

  1. /*
  2. * Copyright (c) 2002 Brian Foley
  3. * Copyright (c) 2002 Dieter Shirley
  4. * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
  5. *
  6. * This file is part of Libav.
  7. *
  8. * Libav is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * Libav is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with Libav; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. #include "config.h"
  23. #include "libavutil/attributes.h"
  24. #include "libavutil/cpu.h"
  25. #include "libavutil/ppc/cpu.h"
  26. #include "libavutil/ppc/util_altivec.h"
  27. #include "libavcodec/hpeldsp.h"
  28. #include "hpeldsp_altivec.h"
  29. #if HAVE_ALTIVEC && HAVE_BIGENDIAN
/* next one assumes that ((line_size % 16) == 0) */
/*
 * Copy a 16-wide by h-high block of pixels, no interpolation.
 *
 * block:     destination; must be 16-byte aligned (vec_st truncates the
 *            effective address to a 16-byte boundary, so a misaligned
 *            block would be stored to the wrong place) — guaranteed by
 *            hpeldsp callers for the [0][x] table slots, TODO confirm.
 * pixels:    source, any alignment.
 * line_size: stride for both src and dst; must be a multiple of 16 so
 *            that the alignment of pixels (and hence perm) is the same
 *            for every row, letting us hoist vec_lvsl out of the loop.
 * h:         number of rows; must be a multiple of 4 (loop unrolled x4).
 */
void ff_put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;
    /* Permute mask to realign an unaligned 32-byte window onto pixels;
     * alignment is row-invariant because line_size % 16 == 0. */
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register ptrdiff_t line_size_2 = line_size << 1;
    register ptrdiff_t line_size_3 = line_size + line_size_2;
    register ptrdiff_t line_size_4 = line_size << 2;

    // hand-unrolling the loop by 4 gains about 15%
    // mininum execution time goes from 74 to 60 cycles
    // it's faster than -funroll-loops, but using
    // -funroll-loops w/ this is bad - 74 cycles again.
    // all this is on a 7450, tuning for the 7450
    for (i = 0; i < h; i += 4) {
        /* Each row: load the aligned line containing pixels[0] and the
         * aligned line containing pixels[15], then vec_perm stitches the
         * 16 bytes starting at pixels.  Offset 15 (not 16) means that
         * when pixels is already aligned both loads hit the same line
         * and we never read past pixels[15]. */
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels+=line_size_4;
        block +=line_size_4;
    }
}
/* next one assumes that ((line_size % 16) == 0) */
/* Scalar byte-wise average of four packed bytes, rounding up:
 * avg_ceil(a,b) = (a|b) - ((a^b)>>1), with the 0xFEFEFEFE mask keeping
 * the per-byte shift from borrowing across byte lanes.
 * NOTE(review): op_avg is not referenced anywhere in this file as
 * visible here — presumably a leftover from a removed scalar fallback;
 * confirm before deleting. */
#define op_avg(a,b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
  70. void ff_avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
  71. {
  72. register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  73. register vector unsigned char perm = vec_lvsl(0, pixels);
  74. int i;
  75. for (i = 0; i < h; i++) {
  76. pixelsv1 = vec_ld( 0, pixels);
  77. pixelsv2 = vec_ld(16,pixels);
  78. blockv = vec_ld(0, block);
  79. pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
  80. blockv = vec_avg(blockv,pixelsv);
  81. vec_st(blockv, 0, (unsigned char*)block);
  82. pixels+=line_size;
  83. block +=line_size;
  84. }
  85. }
  86. /* next one assumes that ((line_size % 8) == 0) */
  87. static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
  88. {
  89. register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
  90. int i;
  91. for (i = 0; i < h; i++) {
  92. /* block is 8 bytes-aligned, so we're either in the
  93. left block (16 bytes-aligned) or in the right block (not) */
  94. int rightside = ((unsigned long)block & 0x0000000F);
  95. blockv = vec_ld(0, block);
  96. pixelsv1 = vec_ld( 0, pixels);
  97. pixelsv2 = vec_ld(16, pixels);
  98. pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));
  99. if (rightside) {
  100. pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
  101. } else {
  102. pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
  103. }
  104. blockv = vec_avg(blockv, pixelsv);
  105. vec_st(blockv, 0, block);
  106. pixels += line_size;
  107. block += line_size;
  108. }
  109. }
/* next one assumes that ((line_size % 8) == 0) */
/*
 * 8-wide 2D half-pel interpolation (xy2), rounding variant:
 *   dst[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 2) >> 2
 * Implemented with a rolling row sum: pixelssum1 always holds
 * (row[x] + row[x+1]) widened to u16 with the +2 rounding bias already
 * folded in, so each loop iteration only computes the next row's sum,
 * adds, shifts right by 2, and packs.  Reads h+1 rows of 9 pixels.
 * block must be 8-byte aligned; the store splices the 8 result bytes
 * into the correct half of the 16-byte destination line.
 */
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime the pipeline with row 0: pixelsv1 = pixels[0..15],
     * pixelsv2 = pixels[1..16] (the one-pixel-right shift). */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* vec_lvsl(1, pixels) wraps to 0 when pixels is at offset 15 of its
     * aligned line (it only uses the low 4 address bits), which would
     * wrongly select temp1; in that case pixels+1 is exactly temp2. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* mergeh with zero: zero-extend the first 8 bytes to u16 (BE). */
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo); /* fold in the +2 bias */
    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: low nibble nonzero => right half. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        /* Next row: same load/realign/shift dance as the prologue. */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* four-neighbour sum (+2 already inside pixelssum1), then >>2 */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* roll: next iteration's "previous row" sum, re-biased */
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
        /* splice the 8 result bytes into the correct half of the line */
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }
        vec_st(blockv, 0, block);
        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
/*
 * 8-wide 2D half-pel interpolation (xy2), no-rounding variant:
 *   dst[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 1) >> 2
 * Identical in structure to put_pixels8_xy2_altivec; the only
 * difference is the rounding bias vcone (+1) instead of vctwo (+2),
 * which rounds the average down more often.  See that function for
 * detailed comments on the rolling-sum scheme, the vec_lvsl wrap-around
 * special case, and the half-line destination splice.
 */
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime with row 0: pixels[0..15] and the right-shifted pixels[1..16]. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* lvsl(1, ...) wraps when pixels sits at offset 15 of its line. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone); /* +1 bias: no-rounding mode */
    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo); /* vctwo doubles as the >>2 shift count */
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }
        vec_st(blockv, 0, block);
        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 16) == 0) */
/*
 * 16-wide 2D half-pel interpolation (xy2), rounding variant:
 *   dst[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 2) >> 2
 * Same rolling-sum scheme as put_pixels8_xy2_altivec, but the 16 pixels
 * are processed as two u16 halves: pixelssum1/2 cover pixels 0..7
 * (vec_mergeh zero-extends the first 8 bytes on BE), pixelssum3/4 cover
 * pixels 8..15 (vec_mergel).  Reads h+1 rows of 17 pixels; block must
 * be 16-byte aligned for the full-line vec_st.
 */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    /* Prime with row 0: pixelsv1 = pixels[0..15], pixelsv2 = pixels[1..16]. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* lvsl(1, ...) wraps to 0 when pixels is at offset 15 of its aligned
     * line; in that case pixels+1 is exactly the second load. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* zero-extend: v3/v4 = pixels 8..15, v1/v2 = pixels 0..7 (as u16) */
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo); /* +2 rounding bias folded in */
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);
        /* next row, realigned and shifted exactly as in the prologue */
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        /* four-neighbour sums (+2 already in pixelssum1/3), then >>2 */
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        /* roll both halves' row sums forward, re-biased */
        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        /* pack low half (temp3) then high half (temp4) back to bytes */
        blockv = vec_packsu(temp3, temp4);
        vec_st(blockv, 0, block);
        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 16) == 0) */
/*
 * 16-wide 2D half-pel interpolation (xy2), no-rounding variant:
 *   dst[x] = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 1) >> 2
 * Identical in structure to put_pixels16_xy2_altivec; only the rounding
 * bias differs (vcone = +1 instead of vctwo = +2).  See that function
 * for detailed comments on the two-half rolling-sum scheme and the
 * vec_lvsl wrap-around special case.
 */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* lvsl(1, ...) wraps when pixels sits at offset 15 of its line. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    /* zero-extend: v3/v4 = pixels 8..15, v1/v2 = pixels 0..7 (as u16) */
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone); /* +1 bias: no-rounding mode */
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);
    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo); /* vctwo doubles as the >>2 shift count */
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);
        blockv = vec_packsu(temp3, temp4);
        vec_st(blockv, 0, block);
        block += line_size;
        pixels += line_size;
    }
}
/* next one assumes that ((line_size % 8) == 0) */
/*
 * 8-wide 2D half-pel interpolation (xy2) averaged into the destination:
 *   interp  = (p[x] + p[x+1] + p_next[x] + p_next[x+1] + 2) >> 2
 *   dst[x]  = vec_avg(dst[x], interp)   (vec_avg rounds up)
 * Same rolling-sum scheme as put_pixels8_xy2_altivec (see there for the
 * detailed comments); the only addition is the final vec_avg of the
 * interpolated bytes with the existing destination bytes.
 */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)
        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
        vec_splat_u16(2);

    /* Prime with row 0: pixels[0..15] and the right-shifted pixels[1..16]. */
    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    /* lvsl(1, ...) wraps when pixels sits at offset 15 of its line. */
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo); /* fold in the +2 bias */
    for (i = 0; i < h ; i++) {
        /* block is 8-byte aligned: low nibble nonzero => right half. */
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);
        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo); /* roll the row sum forward */
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);
        /* splice interp bytes into the correct half; the other half of
         * blocktemp keeps blockv's bytes, so vec_avg leaves it unchanged */
        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }
        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);
        block += line_size;
        pixels += line_size;
    }
}
  384. #endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
  385. av_cold void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags)
  386. {
  387. #if HAVE_ALTIVEC && HAVE_BIGENDIAN
  388. if (!PPC_ALTIVEC(av_get_cpu_flags()))
  389. return;
  390. c->avg_pixels_tab[0][0] = ff_avg_pixels16_altivec;
  391. c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
  392. c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
  393. c->put_pixels_tab[0][0] = ff_put_pixels16_altivec;
  394. c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
  395. c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
  396. c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_altivec;
  397. c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
  398. c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
  399. #endif /* HAVE_ALTIVEC && HAVE_BIGENDIAN */
  400. }