You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

1011 lines
35KB

  1. /*
  2. * DSP utils
  3. * Copyright (c) 2000, 2001 Fabrice Bellard
  4. * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
  5. *
  6. * This file is part of FFmpeg.
  7. *
  8. * FFmpeg is free software; you can redistribute it and/or
  9. * modify it under the terms of the GNU Lesser General Public
  10. * License as published by the Free Software Foundation; either
  11. * version 2.1 of the License, or (at your option) any later version.
  12. *
  13. * FFmpeg is distributed in the hope that it will be useful,
  14. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  15. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  16. * Lesser General Public License for more details.
  17. *
  18. * You should have received a copy of the GNU Lesser General Public
  19. * License along with FFmpeg; if not, write to the Free Software
  20. * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21. */
  22. /**
  23. * @file
  24. * DSP utils
  25. */
  26. #include "libavutil/attributes.h"
  27. #include "libavutil/internal.h"
  28. #include "avcodec.h"
  29. #include "copy_block.h"
  30. #include "dsputil.h"
  31. #include "simple_idct.h"
  32. #include "mpegvideo.h"
  33. #include "config.h"
/* Table of squares: entry [i] holds (i - 256)^2, so code indexes it as
 * ff_square_tab + 256 with a signed pixel difference in -256..255.
 * Filled at runtime by ff_dsputil_static_init(). */
uint32_t ff_square_tab[512] = { 0, };
  35. static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  36. int line_size, int h)
  37. {
  38. int s = 0, i;
  39. uint32_t *sq = ff_square_tab + 256;
  40. for (i = 0; i < h; i++) {
  41. s += sq[pix1[0] - pix2[0]];
  42. s += sq[pix1[1] - pix2[1]];
  43. s += sq[pix1[2] - pix2[2]];
  44. s += sq[pix1[3] - pix2[3]];
  45. pix1 += line_size;
  46. pix2 += line_size;
  47. }
  48. return s;
  49. }
  50. static int sse8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  51. int line_size, int h)
  52. {
  53. int s = 0, i;
  54. uint32_t *sq = ff_square_tab + 256;
  55. for (i = 0; i < h; i++) {
  56. s += sq[pix1[0] - pix2[0]];
  57. s += sq[pix1[1] - pix2[1]];
  58. s += sq[pix1[2] - pix2[2]];
  59. s += sq[pix1[3] - pix2[3]];
  60. s += sq[pix1[4] - pix2[4]];
  61. s += sq[pix1[5] - pix2[5]];
  62. s += sq[pix1[6] - pix2[6]];
  63. s += sq[pix1[7] - pix2[7]];
  64. pix1 += line_size;
  65. pix2 += line_size;
  66. }
  67. return s;
  68. }
  69. static int sse16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  70. int line_size, int h)
  71. {
  72. int s = 0, i;
  73. uint32_t *sq = ff_square_tab + 256;
  74. for (i = 0; i < h; i++) {
  75. s += sq[pix1[0] - pix2[0]];
  76. s += sq[pix1[1] - pix2[1]];
  77. s += sq[pix1[2] - pix2[2]];
  78. s += sq[pix1[3] - pix2[3]];
  79. s += sq[pix1[4] - pix2[4]];
  80. s += sq[pix1[5] - pix2[5]];
  81. s += sq[pix1[6] - pix2[6]];
  82. s += sq[pix1[7] - pix2[7]];
  83. s += sq[pix1[8] - pix2[8]];
  84. s += sq[pix1[9] - pix2[9]];
  85. s += sq[pix1[10] - pix2[10]];
  86. s += sq[pix1[11] - pix2[11]];
  87. s += sq[pix1[12] - pix2[12]];
  88. s += sq[pix1[13] - pix2[13]];
  89. s += sq[pix1[14] - pix2[14]];
  90. s += sq[pix1[15] - pix2[15]];
  91. pix1 += line_size;
  92. pix2 += line_size;
  93. }
  94. return s;
  95. }
  96. static int sum_abs_dctelem_c(int16_t *block)
  97. {
  98. int sum = 0, i;
  99. for (i = 0; i < 64; i++)
  100. sum += FFABS(block[i]);
  101. return sum;
  102. }
  103. #define avg2(a, b) ((a + b + 1) >> 1)
  104. #define avg4(a, b, c, d) ((a + b + c + d + 2) >> 2)
  105. static inline int pix_abs16_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  106. int line_size, int h)
  107. {
  108. int s = 0, i;
  109. for (i = 0; i < h; i++) {
  110. s += abs(pix1[0] - pix2[0]);
  111. s += abs(pix1[1] - pix2[1]);
  112. s += abs(pix1[2] - pix2[2]);
  113. s += abs(pix1[3] - pix2[3]);
  114. s += abs(pix1[4] - pix2[4]);
  115. s += abs(pix1[5] - pix2[5]);
  116. s += abs(pix1[6] - pix2[6]);
  117. s += abs(pix1[7] - pix2[7]);
  118. s += abs(pix1[8] - pix2[8]);
  119. s += abs(pix1[9] - pix2[9]);
  120. s += abs(pix1[10] - pix2[10]);
  121. s += abs(pix1[11] - pix2[11]);
  122. s += abs(pix1[12] - pix2[12]);
  123. s += abs(pix1[13] - pix2[13]);
  124. s += abs(pix1[14] - pix2[14]);
  125. s += abs(pix1[15] - pix2[15]);
  126. pix1 += line_size;
  127. pix2 += line_size;
  128. }
  129. return s;
  130. }
  131. static int pix_abs16_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  132. int line_size, int h)
  133. {
  134. int s = 0, i;
  135. for (i = 0; i < h; i++) {
  136. s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
  137. s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
  138. s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
  139. s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
  140. s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
  141. s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
  142. s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
  143. s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
  144. s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
  145. s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
  146. s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
  147. s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
  148. s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
  149. s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
  150. s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
  151. s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
  152. pix1 += line_size;
  153. pix2 += line_size;
  154. }
  155. return s;
  156. }
  157. static int pix_abs16_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  158. int line_size, int h)
  159. {
  160. int s = 0, i;
  161. uint8_t *pix3 = pix2 + line_size;
  162. for (i = 0; i < h; i++) {
  163. s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
  164. s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
  165. s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
  166. s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
  167. s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
  168. s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
  169. s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
  170. s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
  171. s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
  172. s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
  173. s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
  174. s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
  175. s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
  176. s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
  177. s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
  178. s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
  179. pix1 += line_size;
  180. pix2 += line_size;
  181. pix3 += line_size;
  182. }
  183. return s;
  184. }
  185. static int pix_abs16_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  186. int line_size, int h)
  187. {
  188. int s = 0, i;
  189. uint8_t *pix3 = pix2 + line_size;
  190. for (i = 0; i < h; i++) {
  191. s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
  192. s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
  193. s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
  194. s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
  195. s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
  196. s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
  197. s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
  198. s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
  199. s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
  200. s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
  201. s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
  202. s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
  203. s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
  204. s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
  205. s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
  206. s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
  207. pix1 += line_size;
  208. pix2 += line_size;
  209. pix3 += line_size;
  210. }
  211. return s;
  212. }
  213. static inline int pix_abs8_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  214. int line_size, int h)
  215. {
  216. int s = 0, i;
  217. for (i = 0; i < h; i++) {
  218. s += abs(pix1[0] - pix2[0]);
  219. s += abs(pix1[1] - pix2[1]);
  220. s += abs(pix1[2] - pix2[2]);
  221. s += abs(pix1[3] - pix2[3]);
  222. s += abs(pix1[4] - pix2[4]);
  223. s += abs(pix1[5] - pix2[5]);
  224. s += abs(pix1[6] - pix2[6]);
  225. s += abs(pix1[7] - pix2[7]);
  226. pix1 += line_size;
  227. pix2 += line_size;
  228. }
  229. return s;
  230. }
  231. static int pix_abs8_x2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  232. int line_size, int h)
  233. {
  234. int s = 0, i;
  235. for (i = 0; i < h; i++) {
  236. s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
  237. s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
  238. s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
  239. s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
  240. s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
  241. s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
  242. s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
  243. s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
  244. pix1 += line_size;
  245. pix2 += line_size;
  246. }
  247. return s;
  248. }
  249. static int pix_abs8_y2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  250. int line_size, int h)
  251. {
  252. int s = 0, i;
  253. uint8_t *pix3 = pix2 + line_size;
  254. for (i = 0; i < h; i++) {
  255. s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
  256. s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
  257. s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
  258. s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
  259. s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
  260. s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
  261. s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
  262. s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
  263. pix1 += line_size;
  264. pix2 += line_size;
  265. pix3 += line_size;
  266. }
  267. return s;
  268. }
  269. static int pix_abs8_xy2_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
  270. int line_size, int h)
  271. {
  272. int s = 0, i;
  273. uint8_t *pix3 = pix2 + line_size;
  274. for (i = 0; i < h; i++) {
  275. s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
  276. s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
  277. s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
  278. s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
  279. s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
  280. s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
  281. s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
  282. s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
  283. pix1 += line_size;
  284. pix2 += line_size;
  285. pix3 += line_size;
  286. }
  287. return s;
  288. }
  289. static int nsse16_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
  290. {
  291. int score1 = 0, score2 = 0, x, y;
  292. for (y = 0; y < h; y++) {
  293. for (x = 0; x < 16; x++)
  294. score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
  295. if (y + 1 < h) {
  296. for (x = 0; x < 15; x++)
  297. score2 += FFABS(s1[x] - s1[x + stride] -
  298. s1[x + 1] + s1[x + stride + 1]) -
  299. FFABS(s2[x] - s2[x + stride] -
  300. s2[x + 1] + s2[x + stride + 1]);
  301. }
  302. s1 += stride;
  303. s2 += stride;
  304. }
  305. if (c)
  306. return score1 + FFABS(score2) * c->avctx->nsse_weight;
  307. else
  308. return score1 + FFABS(score2) * 8;
  309. }
  310. static int nsse8_c(MpegEncContext *c, uint8_t *s1, uint8_t *s2, int stride, int h)
  311. {
  312. int score1 = 0, score2 = 0, x, y;
  313. for (y = 0; y < h; y++) {
  314. for (x = 0; x < 8; x++)
  315. score1 += (s1[x] - s2[x]) * (s1[x] - s2[x]);
  316. if (y + 1 < h) {
  317. for (x = 0; x < 7; x++)
  318. score2 += FFABS(s1[x] - s1[x + stride] -
  319. s1[x + 1] + s1[x + stride + 1]) -
  320. FFABS(s2[x] - s2[x + stride] -
  321. s2[x + 1] + s2[x + stride + 1]);
  322. }
  323. s1 += stride;
  324. s2 += stride;
  325. }
  326. if (c)
  327. return score1 + FFABS(score2) * c->avctx->nsse_weight;
  328. else
  329. return score1 + FFABS(score2) * 8;
  330. }
/* Trivial comparison function that always reports a perfect match.
 * Selected by FF_CMP_ZERO in ff_set_cmp(). */
static int zero_cmp(MpegEncContext *s, uint8_t *a, uint8_t *b,
                    int stride, int h)
{
    return 0;
}
/**
 * Fill an array of 6 comparison functions of the requested type.
 *
 * @param c    DSPContext providing the available implementations
 * @param cmp  destination array of 6 me_cmp_func pointers
 * @param type FF_CMP_* selector; only the low byte is examined
 */
void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type)
{
    int i;

    /* NOTE(review): assumes sizeof(me_cmp_func) == sizeof(void *) —
     * true on supported platforms, but sizeof(*cmp) would be exact */
    memset(cmp, 0, sizeof(void *) * 6);

    for (i = 0; i < 6; i++) {
        switch (type & 0xFF) {
        case FF_CMP_SAD:
            cmp[i] = c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i] = c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i] = c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i] = c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i] = c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i] = c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i] = c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i] = c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i] = c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i] = c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i] = c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i] = zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i] = c->nsse[i];
            break;
#if CONFIG_DWT
        case FF_CMP_W53:
            cmp[i] = c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i] = c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,
                   "internal error in cmp function selection\n");
        }
    }
}
/* Butterfly writing sum/difference of i1, i2 into two fresh outputs. */
#define BUTTERFLY2(o1, o2, i1, i2) \
    o1 = (i1) + (i2);              \
    o2 = (i1) - (i2);

/* In-place butterfly: x becomes x + y, y becomes x - y. */
#define BUTTERFLY1(x, y)     \
    {                        \
        int a, b;            \
        a = x;               \
        b = y;               \
        x = a + b;           \
        y = a - b;           \
    }

/* Final butterfly stage folded into the SATD sum: |x + y| + |x - y|. */
#define BUTTERFLYA(x, y) (FFABS((x) + (y)) + FFABS((x) - (y)))
/**
 * SATD of an 8x8 block: 2-D Hadamard transform of the src - dst
 * difference (horizontal passes per row, then vertical passes per
 * column), summing absolute transform coefficients.  The butterfly
 * sequence is order-sensitive; each stage pairs indices at distance
 * 1, 2 then 4.
 */
static int hadamard8_diff8x8_c(MpegEncContext *s, uint8_t *dst,
                               uint8_t *src, int stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        /* horizontal transform of row i of the difference */
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0] - dst[stride * i + 0],
                   src[stride * i + 1] - dst[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2] - dst[stride * i + 2],
                   src[stride * i + 3] - dst[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4] - dst[stride * i + 4],
                   src[stride * i + 5] - dst[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6] - dst[stride * i + 6],
                   src[stride * i + 7] - dst[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    for (i = 0; i < 8; i++) {
        /* vertical transform of column i; last stage accumulates */
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum += BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i]) +
               BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i]) +
               BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i]) +
               BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }
    return sum;
}
/**
 * Intra SATD of an 8x8 block: Hadamard transform of the pixels
 * themselves (no reference), with the DC contribution subtracted at
 * the end so the score reflects AC energy only.
 */
static int hadamard8_intra8x8_c(MpegEncContext *s, uint8_t *src,
                                uint8_t *dummy, int stride, int h)
{
    int i, temp[64], sum = 0;

    av_assert2(h == 8);

    for (i = 0; i < 8; i++) {
        // FIXME: try pointer walks
        /* horizontal transform of row i */
        BUTTERFLY2(temp[8 * i + 0], temp[8 * i + 1],
                   src[stride * i + 0], src[stride * i + 1]);
        BUTTERFLY2(temp[8 * i + 2], temp[8 * i + 3],
                   src[stride * i + 2], src[stride * i + 3]);
        BUTTERFLY2(temp[8 * i + 4], temp[8 * i + 5],
                   src[stride * i + 4], src[stride * i + 5]);
        BUTTERFLY2(temp[8 * i + 6], temp[8 * i + 7],
                   src[stride * i + 6], src[stride * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 2]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 3]);
        BUTTERFLY1(temp[8 * i + 4], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 5], temp[8 * i + 7]);

        BUTTERFLY1(temp[8 * i + 0], temp[8 * i + 4]);
        BUTTERFLY1(temp[8 * i + 1], temp[8 * i + 5]);
        BUTTERFLY1(temp[8 * i + 2], temp[8 * i + 6]);
        BUTTERFLY1(temp[8 * i + 3], temp[8 * i + 7]);
    }

    for (i = 0; i < 8; i++) {
        /* vertical transform of column i; last stage accumulates */
        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 1 + i]);
        BUTTERFLY1(temp[8 * 2 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 5 + i]);
        BUTTERFLY1(temp[8 * 6 + i], temp[8 * 7 + i]);

        BUTTERFLY1(temp[8 * 0 + i], temp[8 * 2 + i]);
        BUTTERFLY1(temp[8 * 1 + i], temp[8 * 3 + i]);
        BUTTERFLY1(temp[8 * 4 + i], temp[8 * 6 + i]);
        BUTTERFLY1(temp[8 * 5 + i], temp[8 * 7 + i]);

        sum +=
            BUTTERFLYA(temp[8 * 0 + i], temp[8 * 4 + i])
            + BUTTERFLYA(temp[8 * 1 + i], temp[8 * 5 + i])
            + BUTTERFLYA(temp[8 * 2 + i], temp[8 * 6 + i])
            + BUTTERFLYA(temp[8 * 3 + i], temp[8 * 7 + i]);
    }

    sum -= FFABS(temp[8 * 0] + temp[8 * 4]); // -mean
    return sum;
}
/**
 * DCT-domain SAD: forward-transform the src1 - src2 difference and sum
 * the absolute values of all 64 coefficients.
 */
static int dct_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                        uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64]);

    av_assert2(h == 8);

    s->pdsp.diff_pixels(temp, src1, src2, stride);
    s->fdsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
  502. #if CONFIG_GPL
/* One-dimensional 8-point integer DCT used by dct264_sad8x8_c below
 * (H.264-style 8x8 forward transform — NOTE(review): verify against the
 * spec transform if reusing elsewhere).  The caller must define SRC(x)
 * to read input element x and DST(x, v) to consume output element x. */
#define DCT8_1D                                        \
    {                                                  \
        /* even part: sums of mirrored pairs */        \
        const int s07 = SRC(0) + SRC(7);               \
        const int s16 = SRC(1) + SRC(6);               \
        const int s25 = SRC(2) + SRC(5);               \
        const int s34 = SRC(3) + SRC(4);               \
        const int a0  = s07 + s34;                     \
        const int a1  = s16 + s25;                     \
        const int a2  = s07 - s34;                     \
        const int a3  = s16 - s25;                     \
        /* odd part: differences of mirrored pairs */  \
        const int d07 = SRC(0) - SRC(7);               \
        const int d16 = SRC(1) - SRC(6);               \
        const int d25 = SRC(2) - SRC(5);               \
        const int d34 = SRC(3) - SRC(4);               \
        const int a4  = d16 + d25 + (d07 + (d07 >> 1)); \
        const int a5  = d07 - d34 - (d25 + (d25 >> 1)); \
        const int a6  = d07 + d34 - (d16 + (d16 >> 1)); \
        const int a7  = d16 - d25 + (d34 + (d34 >> 1)); \
        DST(0, a0 + a1);                               \
        DST(1, a4 + (a7 >> 2));                        \
        DST(2, a2 + (a3 >> 1));                        \
        DST(3, a5 + (a6 >> 2));                        \
        DST(4, a0 - a1);                               \
        DST(5, a6 - (a5 >> 2));                        \
        DST(6, (a2 >> 1) - a3);                        \
        DST(7, (a4 >> 2) - a7);                        \
    }
/**
 * SAD in the H.264 8x8 transform domain: rows are transformed in place,
 * then the column pass accumulates |coefficient| directly via the DST
 * macro instead of storing results.
 */
static int dct264_sad8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    int16_t dct[8][8];
    int i, sum = 0;

    s->pdsp.diff_pixels(dct[0], src1, src2, stride);

    /* row (horizontal) pass, in place */
#define SRC(x) dct[i][x]
#define DST(x, v) dct[i][x] = v
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST

    /* column (vertical) pass; DST folds into the absolute sum */
#define SRC(x) dct[x][i]
#define DST(x, v) sum += FFABS(v)
    for (i = 0; i < 8; i++)
        DCT8_1D
#undef SRC
#undef DST
    return sum;
}
  550. #endif
  551. static int dct_max8x8_c(MpegEncContext *s, uint8_t *src1,
  552. uint8_t *src2, int stride, int h)
  553. {
  554. LOCAL_ALIGNED_16(int16_t, temp, [64]);
  555. int sum = 0, i;
  556. av_assert2(h == 8);
  557. s->pdsp.diff_pixels(temp, src1, src2, stride);
  558. s->fdsp.fdct(temp);
  559. for (i = 0; i < 64; i++)
  560. sum = FFMAX(sum, FFABS(temp[i]));
  561. return sum;
  562. }
/**
 * Quantization-error metric: the difference block is DCT-quantized,
 * dequantized and inverse-transformed, then compared (SSE) against the
 * saved pre-transform difference in bak.  Approximates the coding noise
 * at the current qscale.
 */
static int quant_psnr8x8_c(MpegEncContext *s, uint8_t *src1,
                           uint8_t *src2, int stride, int h)
{
    LOCAL_ALIGNED_16(int16_t, temp, [64 * 2]);
    int16_t *const bak = temp + 64;
    int sum = 0, i;

    av_assert2(h == 8);

    s->mb_intra = 0;

    s->pdsp.diff_pixels(temp, src1, src2, stride);

    /* keep the original difference for comparison after the round trip */
    memcpy(bak, temp, 64 * sizeof(int16_t));

    s->block_last_index[0 /* FIXME */] =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);
    s->dct_unquantize_inter(s, temp, 0, s->qscale);
    ff_simple_idct_8(temp); // FIXME

    for (i = 0; i < 64; i++)
        sum += (temp[i] - bak[i]) * (temp[i] - bak[i]);

    return sum;
}
/**
 * Rate-distortion cost of an 8x8 block: quantize the difference to
 * estimate the coded bit count from the AC VLC length tables, then
 * dequantize + IDCT to measure the reconstruction distortion (SSE).
 *
 * @return distortion + lambda-scaled rate estimate
 */
static int rd8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
                   int stride, int h)
{
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
    LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
    int i, last, run, bits, level, distortion, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length, *last_length;

    av_assert2(h == 8);

    /* local copies: lsrc2 is overwritten by the IDCT reconstruction and
     * compared against the untouched lsrc1 */
    copy_block8(lsrc1, src1, 8, stride, 8);
    copy_block8(lsrc2, src2, 8, stride, 8);

    s->pdsp.diff_pixels(temp, lsrc1, lsrc2, 8);

    s->block_last_index[0 /* FIXME */] =
    last =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        start_i     = 1;            // skip DC; coded separately below
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        run = 0;
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                level += 64;        // bias level into VLC table index range
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;   // out of range: escape coding
                run = 0;
            } else
                run++;
        }
        i = scantable[last];

        level = temp[i] + 64;

        av_assert2(level - 64);     // last coefficient must be nonzero

        if ((level & (~127)) == 0) {
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        } else
            bits += esc_length;
    }

    if (last >= 0) {
        if (s->mb_intra)
            s->dct_unquantize_intra(s, temp, 0, s->qscale);
        else
            s->dct_unquantize_inter(s, temp, 0, s->qscale);
    }

    s->idsp.idct_add(lsrc2, 8, temp);

    distortion = s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);

    /* 109/128 ≈ lambda scaling of the rate term (qscale^2 weighting) */
    return distortion + ((bits * s->qscale * s->qscale * 109 + 64) >> 7);
}
/**
 * Rate-only cost of an 8x8 block: quantize the src1 - src2 difference
 * and estimate the coded bit count from the AC VLC length tables
 * (same rate model as rd8x8_c, without the distortion term).
 */
static int bit8x8_c(MpegEncContext *s, uint8_t *src1, uint8_t *src2,
                    int stride, int h)
{
    const uint8_t *scantable = s->intra_scantable.permutated;
    LOCAL_ALIGNED_16(int16_t, temp, [64]);
    int i, last, run, bits, level, start_i;
    const int esc_length = s->ac_esc_length;
    uint8_t *length, *last_length;

    av_assert2(h == 8);

    s->pdsp.diff_pixels(temp, src1, src2, stride);

    s->block_last_index[0 /* FIXME */] =
    last =
        s->fast_dct_quantize(s, temp, 0 /* FIXME */, s->qscale, &i);

    bits = 0;

    if (s->mb_intra) {
        start_i     = 1;            // skip DC; coded separately below
        length      = s->intra_ac_vlc_length;
        last_length = s->intra_ac_vlc_last_length;
        bits       += s->luma_dc_vlc_length[temp[0] + 256]; // FIXME: chroma
    } else {
        start_i     = 0;
        length      = s->inter_ac_vlc_length;
        last_length = s->inter_ac_vlc_last_length;
    }

    if (last >= start_i) {
        run = 0;
        for (i = start_i; i < last; i++) {
            int j = scantable[i];
            level = temp[j];

            if (level) {
                level += 64;        // bias level into VLC table index range
                if ((level & (~127)) == 0)
                    bits += length[UNI_AC_ENC_INDEX(run, level)];
                else
                    bits += esc_length;   // out of range: escape coding
                run = 0;
            } else
                run++;
        }
        i = scantable[last];

        level = temp[i] + 64;

        av_assert2(level - 64);     // last coefficient must be nonzero

        if ((level & (~127)) == 0)
            bits += last_length[UNI_AC_ENC_INDEX(run, level)];
        else
            bits += esc_length;
    }

    return bits;
}
/* Intra vertical SAD: sums |s[x] - s[x + stride]| between each row and
 * the row above, i.e. the vertical activity of a single block.
 * Instantiated below for widths 8 and 16. */
#define VSAD_INTRA(size)                                        \
static int vsad_intra ## size ## _c(MpegEncContext *c,          \
                                    uint8_t *s, uint8_t *dummy, \
                                    int stride, int h)          \
{                                                               \
    int score = 0, x, y;                                        \
                                                                \
    for (y = 1; y < h; y++) {                                   \
        for (x = 0; x < size; x += 4) {                         \
            score += FFABS(s[x]     - s[x + stride])     +      \
                     FFABS(s[x + 1] - s[x + stride + 1]) +      \
                     FFABS(s[x + 2] - s[x + 2 + stride]) +      \
                     FFABS(s[x + 3] - s[x + 3 + stride]);       \
        }                                                       \
        s += stride;                                            \
    }                                                           \
                                                                \
    return score;                                               \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/* Inter vertical SAD: vertical gradient of the s1 - s2 difference,
 * i.e. how much the prediction error changes between adjacent rows.
 * Instantiated below for widths 8 and 16. */
#define VSAD(size)                                                          \
static int vsad ## size ## _c(MpegEncContext *c,                            \
                              uint8_t *s1, uint8_t *s2,                     \
                              int stride, int h)                            \
{                                                                           \
    int score = 0, x, y;                                                    \
                                                                            \
    for (y = 1; y < h; y++) {                                               \
        for (x = 0; x < size; x++)                                          \
            score += FFABS(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]); \
        s1 += stride;                                                       \
        s2 += stride;                                                       \
    }                                                                       \
                                                                            \
    return score;                                                           \
}
VSAD(8)
VSAD(16)
/* Square helper for the VSSE metrics below. */
#define SQ(a) ((a) * (a))

/* Intra vertical SSE: like VSAD_INTRA but squaring the row-to-row
 * differences.  Instantiated below for widths 8 and 16. */
#define VSSE_INTRA(size)                                        \
static int vsse_intra ## size ## _c(MpegEncContext *c,          \
                                    uint8_t *s, uint8_t *dummy, \
                                    int stride, int h)          \
{                                                               \
    int score = 0, x, y;                                        \
                                                                \
    for (y = 1; y < h; y++) {                                   \
        for (x = 0; x < size; x += 4) {                         \
            score += SQ(s[x]     - s[x + stride])     +         \
                     SQ(s[x + 1] - s[x + stride + 1]) +         \
                     SQ(s[x + 2] - s[x + stride + 2]) +         \
                     SQ(s[x + 3] - s[x + stride + 3]);          \
        }                                                       \
        s += stride;                                            \
    }                                                           \
                                                                \
    return score;                                               \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/* Inter vertical SSE: squared vertical gradient of the s1 - s2
 * difference.  Instantiated below for widths 8 and 16. */
#define VSSE(size)                                                          \
static int vsse ## size ## _c(MpegEncContext *c, uint8_t *s1, uint8_t *s2,  \
                              int stride, int h)                            \
{                                                                           \
    int score = 0, x, y;                                                    \
                                                                            \
    for (y = 1; y < h; y++) {                                               \
        for (x = 0; x < size; x++)                                          \
            score += SQ(s1[x] - s2[x] - s1[x + stride] + s2[x + stride]);   \
        s1 += stride;                                                       \
        s2 += stride;                                                       \
    }                                                                       \
                                                                            \
    return score;                                                           \
}
VSSE(8)
VSSE(16)
/* Build a 16-wide comparison function name16 from an 8x8 one name8 by
 * covering the 16x8 (or, when h == 16, 16x16) area with 8x8 calls. */
#define WRAPPER8_16_SQ(name8, name16)                             \
static int name16(MpegEncContext *s, uint8_t *dst, uint8_t *src,  \
                  int stride, int h)                              \
{                                                                 \
    int score = 0;                                                \
                                                                  \
    score += name8(s, dst, src, stride, 8);                       \
    score += name8(s, dst + 8, src + 8, stride, 8);               \
    if (h == 16) {                                                \
        dst += 8 * stride;                                        \
        src += 8 * stride;                                        \
        score += name8(s, dst, src, stride, 8);                   \
        score += name8(s, dst + 8, src + 8, stride, 8);           \
    }                                                             \
    return score;                                                 \
}

WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
#if CONFIG_GPL
WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
#endif
WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
WRAPPER8_16_SQ(rd8x8_c, rd16_c)
WRAPPER8_16_SQ(bit8x8_c, bit16_c)
  795. /* init static data */
  796. av_cold void ff_dsputil_static_init(void)
  797. {
  798. int i;
  799. for (i = 0; i < 512; i++)
  800. ff_square_tab[i] = (i - 256) * (i - 256);
  801. }
/**
 * Verify that the compiler honors 16-byte stack alignment, which the
 * SIMD code paths rely on.
 *
 * @return 0 if aligned, -1 otherwise (logging once on SIMD-capable
 *         architectures)
 */
int ff_check_alignment(void)
{
    static int did_fail = 0;
    LOCAL_ALIGNED_16(int, aligned, [4]);

    if ((intptr_t)aligned & 15) {
        if (!did_fail) {
#if HAVE_MMX || HAVE_ALTIVEC
            av_log(NULL, AV_LOG_ERROR,
                "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
                "and may be very slow or crash. This is not a bug in libavcodec,\n"
                "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
                "Do not report crashes to FFmpeg developers.\n");
#endif
            did_fail = 1;
        }
        return -1;
    }
    return 0;
}
/**
 * Initialize a DSPContext with the portable C comparison functions,
 * then let the architecture-specific initializers override entries
 * with optimized versions.
 */
av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
{
    const unsigned high_bit_depth = avctx->bits_per_raw_sample > 8;

    ff_check_alignment();

    c->sum_abs_dctelem = sum_abs_dctelem_c;

    /* TODO [0] 16 [1] 8 */
    c->pix_abs[0][0] = pix_abs16_c;      // full-pel
    c->pix_abs[0][1] = pix_abs16_x2_c;   // half-pel horizontal
    c->pix_abs[0][2] = pix_abs16_y2_c;   // half-pel vertical
    c->pix_abs[0][3] = pix_abs16_xy2_c;  // half-pel diagonal
    c->pix_abs[1][0] = pix_abs8_c;
    c->pix_abs[1][1] = pix_abs8_x2_c;
    c->pix_abs[1][2] = pix_abs8_y2_c;
    c->pix_abs[1][3] = pix_abs8_xy2_c;

/* Set both the 16-wide ([0]) and 8x8 ([1]) variants of a cmp family. */
#define SET_CMP_FUNC(name)          \
    c->name[0] = name ## 16_c;      \
    c->name[1] = name ## 8x8_c;

    SET_CMP_FUNC(hadamard8_diff)
    c->hadamard8_diff[4] = hadamard8_intra16_c;
    c->hadamard8_diff[5] = hadamard8_intra8x8_c;
    SET_CMP_FUNC(dct_sad)
    SET_CMP_FUNC(dct_max)
#if CONFIG_GPL
    SET_CMP_FUNC(dct264_sad)
#endif
    c->sad[0] = pix_abs16_c;
    c->sad[1] = pix_abs8_c;
    c->sse[0] = sse16_c;
    c->sse[1] = sse8_c;
    c->sse[2] = sse4_c;
    SET_CMP_FUNC(quant_psnr)
    SET_CMP_FUNC(rd)
    SET_CMP_FUNC(bit)
    c->vsad[0] = vsad16_c;
    c->vsad[1] = vsad8_c;
    c->vsad[4] = vsad_intra16_c;
    c->vsad[5] = vsad_intra8_c;
    c->vsse[0] = vsse16_c;
    c->vsse[1] = vsse8_c;
    c->vsse[4] = vsse_intra16_c;
    c->vsse[5] = vsse_intra8_c;
    c->nsse[0] = nsse16_c;
    c->nsse[1] = nsse8_c;

#if CONFIG_SNOW_DECODER || CONFIG_SNOW_ENCODER
    ff_dsputil_init_dwt(c);
#endif

    /* architecture-specific overrides (compiled out when not targeted) */
    if (ARCH_ALPHA)
        ff_dsputil_init_alpha(c, avctx);
    if (ARCH_ARM)
        ff_dsputil_init_arm(c, avctx, high_bit_depth);
    if (ARCH_PPC)
        ff_dsputil_init_ppc(c, avctx, high_bit_depth);
    if (ARCH_X86)
        ff_dsputil_init_x86(c, avctx, high_bit_depth);
}
/* Compatibility wrapper around ff_dsputil_init() kept for older callers. */
av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
{
    ff_dsputil_init(c, avctx);
}
/* Library-private compatibility wrapper around ff_dsputil_init(). */
av_cold void avpriv_dsputil_init(DSPContext *c, AVCodecContext *avctx)
{
    ff_dsputil_init(c, avctx);
}