You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

616 lines
17KB

  1. /*
  2. * High quality image resampling with polyphase filters
  3. * Copyright (c) 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include "avcodec.h"
  20. #include "dsputil.h"
  21. #ifdef USE_FASTMEMCPY
  22. #include "fastmemcpy.h"
  23. #endif
  24. #define NB_COMPONENTS 3
  25. #define PHASE_BITS 4
  26. #define NB_PHASES (1 << PHASE_BITS)
  27. #define NB_TAPS 4
  28. #define FCENTER 1 /* index of the center of the filter */
  29. #define POS_FRAC_BITS 16
  30. #define POS_FRAC (1 << POS_FRAC_BITS)
  31. /* 6 bits precision is needed for MMX */
  32. #define FILTER_BITS 8
  33. #define LINE_BUF_HEIGHT (NB_TAPS * 4)
  34. struct ImgReSampleContext {
  35. int iwidth, iheight, owidth, oheight;
  36. int h_incr, v_incr;
  37. INT16 h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
  38. INT16 v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
  39. UINT8 *line_buf;
  40. };
  41. static inline int get_phase(int pos)
  42. {
  43. return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
  44. }
  45. /* This function must be optimized */
  46. static void h_resample_fast(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  47. int src_start, int src_incr, INT16 *filters)
  48. {
  49. int src_pos, phase, sum, i;
  50. UINT8 *s;
  51. INT16 *filter;
  52. src_pos = src_start;
  53. for(i=0;i<dst_width;i++) {
  54. #ifdef TEST
  55. /* test */
  56. if ((src_pos >> POS_FRAC_BITS) < 0 ||
  57. (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
  58. abort();
  59. #endif
  60. s = src + (src_pos >> POS_FRAC_BITS);
  61. phase = get_phase(src_pos);
  62. filter = filters + phase * NB_TAPS;
  63. #if NB_TAPS == 4
  64. sum = s[0] * filter[0] +
  65. s[1] * filter[1] +
  66. s[2] * filter[2] +
  67. s[3] * filter[3];
  68. #else
  69. {
  70. int j;
  71. sum = 0;
  72. for(j=0;j<NB_TAPS;j++)
  73. sum += s[j] * filter[j];
  74. }
  75. #endif
  76. sum = sum >> FILTER_BITS;
  77. if (sum < 0)
  78. sum = 0;
  79. else if (sum > 255)
  80. sum = 255;
  81. dst[0] = sum;
  82. src_pos += src_incr;
  83. dst++;
  84. }
  85. }
  86. /* This function must be optimized */
  87. static void v_resample(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
  88. INT16 *filter)
  89. {
  90. int sum, i;
  91. UINT8 *s;
  92. s = src;
  93. for(i=0;i<dst_width;i++) {
  94. #if NB_TAPS == 4
  95. sum = s[0 * wrap] * filter[0] +
  96. s[1 * wrap] * filter[1] +
  97. s[2 * wrap] * filter[2] +
  98. s[3 * wrap] * filter[3];
  99. #else
  100. {
  101. int j;
  102. UINT8 *s1 = s;
  103. sum = 0;
  104. for(j=0;j<NB_TAPS;j++) {
  105. sum += s1[0] * filter[j];
  106. s1 += wrap;
  107. }
  108. }
  109. #endif
  110. sum = sum >> FILTER_BITS;
  111. if (sum < 0)
  112. sum = 0;
  113. else if (sum > 255)
  114. sum = 255;
  115. dst[0] = sum;
  116. dst++;
  117. s++;
  118. }
  119. }
  120. #ifdef HAVE_MMX
  121. #include "i386/mmx.h"
  122. #define FILTER4(reg) \
  123. {\
  124. s = src + (src_pos >> POS_FRAC_BITS);\
  125. phase = get_phase(src_pos);\
  126. filter = filters + phase * NB_TAPS;\
  127. movq_m2r(*s, reg);\
  128. punpcklbw_r2r(mm7, reg);\
  129. movq_m2r(*filter, mm6);\
  130. pmaddwd_r2r(reg, mm6);\
  131. movq_r2r(mm6, reg);\
  132. psrlq_i2r(32, reg);\
  133. paddd_r2r(mm6, reg);\
  134. psrad_i2r(FILTER_BITS, reg);\
  135. src_pos += src_incr;\
  136. }
  137. #define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
  138. /* XXX: do four pixels at a time */
  139. static void h_resample_fast4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  140. int src_start, int src_incr, INT16 *filters)
  141. {
  142. int src_pos, phase;
  143. UINT8 *s;
  144. INT16 *filter;
  145. mmx_t tmp;
  146. src_pos = src_start;
  147. pxor_r2r(mm7, mm7);
  148. while (dst_width >= 4) {
  149. FILTER4(mm0);
  150. FILTER4(mm1);
  151. FILTER4(mm2);
  152. FILTER4(mm3);
  153. packuswb_r2r(mm7, mm0);
  154. packuswb_r2r(mm7, mm1);
  155. packuswb_r2r(mm7, mm3);
  156. packuswb_r2r(mm7, mm2);
  157. movq_r2m(mm0, tmp);
  158. dst[0] = tmp.ub[0];
  159. movq_r2m(mm1, tmp);
  160. dst[1] = tmp.ub[0];
  161. movq_r2m(mm2, tmp);
  162. dst[2] = tmp.ub[0];
  163. movq_r2m(mm3, tmp);
  164. dst[3] = tmp.ub[0];
  165. dst += 4;
  166. dst_width -= 4;
  167. }
  168. while (dst_width > 0) {
  169. FILTER4(mm0);
  170. packuswb_r2r(mm7, mm0);
  171. movq_r2m(mm0, tmp);
  172. dst[0] = tmp.ub[0];
  173. dst++;
  174. dst_width--;
  175. }
  176. emms();
  177. }
  178. static void v_resample4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
  179. INT16 *filter)
  180. {
  181. int sum, i, v;
  182. UINT8 *s;
  183. mmx_t tmp;
  184. mmx_t coefs[4];
  185. for(i=0;i<4;i++) {
  186. v = filter[i];
  187. coefs[i].uw[0] = v;
  188. coefs[i].uw[1] = v;
  189. coefs[i].uw[2] = v;
  190. coefs[i].uw[3] = v;
  191. }
  192. pxor_r2r(mm7, mm7);
  193. s = src;
  194. while (dst_width >= 4) {
  195. movq_m2r(s[0 * wrap], mm0);
  196. punpcklbw_r2r(mm7, mm0);
  197. movq_m2r(s[1 * wrap], mm1);
  198. punpcklbw_r2r(mm7, mm1);
  199. movq_m2r(s[2 * wrap], mm2);
  200. punpcklbw_r2r(mm7, mm2);
  201. movq_m2r(s[3 * wrap], mm3);
  202. punpcklbw_r2r(mm7, mm3);
  203. pmullw_m2r(coefs[0], mm0);
  204. pmullw_m2r(coefs[1], mm1);
  205. pmullw_m2r(coefs[2], mm2);
  206. pmullw_m2r(coefs[3], mm3);
  207. paddw_r2r(mm1, mm0);
  208. paddw_r2r(mm3, mm2);
  209. paddw_r2r(mm2, mm0);
  210. psraw_i2r(FILTER_BITS, mm0);
  211. packuswb_r2r(mm7, mm0);
  212. movq_r2m(mm0, tmp);
  213. *(UINT32 *)dst = tmp.ud[0];
  214. dst += 4;
  215. s += 4;
  216. dst_width -= 4;
  217. }
  218. while (dst_width > 0) {
  219. sum = s[0 * wrap] * filter[0] +
  220. s[1 * wrap] * filter[1] +
  221. s[2 * wrap] * filter[2] +
  222. s[3 * wrap] * filter[3];
  223. sum = sum >> FILTER_BITS;
  224. if (sum < 0)
  225. sum = 0;
  226. else if (sum > 255)
  227. sum = 255;
  228. dst[0] = sum;
  229. dst++;
  230. s++;
  231. dst_width--;
  232. }
  233. emms();
  234. }
  235. #endif
  236. /* slow version to handle limit cases. Does not need optimisation */
  237. static void h_resample_slow(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  238. int src_start, int src_incr, INT16 *filters)
  239. {
  240. int src_pos, phase, sum, j, v, i;
  241. UINT8 *s, *src_end;
  242. INT16 *filter;
  243. src_end = src + src_width;
  244. src_pos = src_start;
  245. for(i=0;i<dst_width;i++) {
  246. s = src + (src_pos >> POS_FRAC_BITS);
  247. phase = get_phase(src_pos);
  248. filter = filters + phase * NB_TAPS;
  249. sum = 0;
  250. for(j=0;j<NB_TAPS;j++) {
  251. if (s < src)
  252. v = src[0];
  253. else if (s >= src_end)
  254. v = src_end[-1];
  255. else
  256. v = s[0];
  257. sum += v * filter[j];
  258. s++;
  259. }
  260. sum = sum >> FILTER_BITS;
  261. if (sum < 0)
  262. sum = 0;
  263. else if (sum > 255)
  264. sum = 255;
  265. dst[0] = sum;
  266. src_pos += src_incr;
  267. dst++;
  268. }
  269. }
  270. static void h_resample(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  271. int src_start, int src_incr, INT16 *filters)
  272. {
  273. int n, src_end;
  274. if (src_start < 0) {
  275. n = (0 - src_start + src_incr - 1) / src_incr;
  276. h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
  277. dst += n;
  278. dst_width -= n;
  279. src_start += n * src_incr;
  280. }
  281. src_end = src_start + dst_width * src_incr;
  282. if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
  283. n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
  284. src_incr;
  285. } else {
  286. n = dst_width;
  287. }
  288. #ifdef HAVE_MMX
  289. if ((mm_flags & MM_MMX) && NB_TAPS == 4)
  290. h_resample_fast4_mmx(dst, n,
  291. src, src_width, src_start, src_incr, filters);
  292. else
  293. #endif
  294. h_resample_fast(dst, n,
  295. src, src_width, src_start, src_incr, filters);
  296. if (n < dst_width) {
  297. dst += n;
  298. dst_width -= n;
  299. src_start += n * src_incr;
  300. h_resample_slow(dst, dst_width,
  301. src, src_width, src_start, src_incr, filters);
  302. }
  303. }
  304. static void component_resample(ImgReSampleContext *s,
  305. UINT8 *output, int owrap, int owidth, int oheight,
  306. UINT8 *input, int iwrap, int iwidth, int iheight)
  307. {
  308. int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
  309. UINT8 *new_line, *src_line;
  310. last_src_y = - FCENTER - 1;
  311. /* position of the bottom of the filter in the source image */
  312. src_y = (last_src_y + NB_TAPS) * POS_FRAC;
  313. ring_y = NB_TAPS; /* position in ring buffer */
  314. for(y=0;y<oheight;y++) {
  315. /* apply horizontal filter on new lines from input if needed */
  316. src_y1 = src_y >> POS_FRAC_BITS;
  317. while (last_src_y < src_y1) {
  318. if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
  319. ring_y = NB_TAPS;
  320. last_src_y++;
  321. /* handle limit conditions : replicate line (slighly
  322. inefficient because we filter multiple times */
  323. y1 = last_src_y;
  324. if (y1 < 0) {
  325. y1 = 0;
  326. } else if (y1 >= iheight) {
  327. y1 = iheight - 1;
  328. }
  329. src_line = input + y1 * iwrap;
  330. new_line = s->line_buf + ring_y * owidth;
  331. /* apply filter and handle limit cases correctly */
  332. h_resample(new_line, owidth,
  333. src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
  334. &s->h_filters[0][0]);
  335. /* handle ring buffer wraping */
  336. if (ring_y >= LINE_BUF_HEIGHT) {
  337. memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
  338. new_line, owidth);
  339. }
  340. }
  341. /* apply vertical filter */
  342. phase_y = get_phase(src_y);
  343. #ifdef HAVE_MMX
  344. /* desactivated MMX because loss of precision */
  345. if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
  346. v_resample4_mmx(output, owidth,
  347. s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
  348. &s->v_filters[phase_y][0]);
  349. else
  350. #endif
  351. v_resample(output, owidth,
  352. s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
  353. &s->v_filters[phase_y][0]);
  354. src_y += s->v_incr;
  355. output += owrap;
  356. }
  357. }
  358. /* XXX: the following filter is quite naive, but it seems to suffice
  359. for 4 taps */
  360. static void build_filter(INT16 *filter, float factor)
  361. {
  362. int ph, i, v;
  363. float x, y, tab[NB_TAPS], norm, mult;
  364. /* if upsampling, only need to interpolate, no filter */
  365. if (factor > 1.0)
  366. factor = 1.0;
  367. for(ph=0;ph<NB_PHASES;ph++) {
  368. norm = 0;
  369. for(i=0;i<NB_TAPS;i++) {
  370. x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
  371. if (x == 0)
  372. y = 1.0;
  373. else
  374. y = sin(x) / x;
  375. tab[i] = y;
  376. norm += y;
  377. }
  378. /* normalize so that an uniform color remains the same */
  379. mult = (float)(1 << FILTER_BITS) / norm;
  380. for(i=0;i<NB_TAPS;i++) {
  381. v = (int)(tab[i] * mult);
  382. filter[ph * NB_TAPS + i] = v;
  383. }
  384. }
  385. }
  386. ImgReSampleContext *img_resample_init(int owidth, int oheight,
  387. int iwidth, int iheight)
  388. {
  389. ImgReSampleContext *s;
  390. s = av_mallocz(sizeof(ImgReSampleContext));
  391. if (!s)
  392. return NULL;
  393. s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
  394. if (!s->line_buf)
  395. goto fail;
  396. s->owidth = owidth;
  397. s->oheight = oheight;
  398. s->iwidth = iwidth;
  399. s->iheight = iheight;
  400. s->h_incr = (iwidth * POS_FRAC) / owidth;
  401. s->v_incr = (iheight * POS_FRAC) / oheight;
  402. build_filter(&s->h_filters[0][0], (float)owidth / (float)iwidth);
  403. build_filter(&s->v_filters[0][0], (float)oheight / (float)iheight);
  404. return s;
  405. fail:
  406. av_free(s);
  407. return NULL;
  408. }
  409. void img_resample(ImgReSampleContext *s,
  410. AVPicture *output, AVPicture *input)
  411. {
  412. int i, shift;
  413. for(i=0;i<3;i++) {
  414. shift = (i == 0) ? 0 : 1;
  415. component_resample(s, output->data[i], output->linesize[i],
  416. s->owidth >> shift, s->oheight >> shift,
  417. input->data[i], input->linesize[i],
  418. s->iwidth >> shift, s->iheight >> shift);
  419. }
  420. }
  421. void img_resample_close(ImgReSampleContext *s)
  422. {
  423. av_free(s->line_buf);
  424. av_free(s);
  425. }
  426. #ifdef TEST
  427. void *av_mallocz(int size)
  428. {
  429. void *ptr;
  430. ptr = malloc(size);
  431. memset(ptr, 0, size);
  432. return ptr;
  433. }
  434. /* input */
  435. #define XSIZE 256
  436. #define YSIZE 256
  437. UINT8 img[XSIZE * YSIZE];
  438. /* output */
  439. #define XSIZE1 512
  440. #define YSIZE1 512
  441. UINT8 img1[XSIZE1 * YSIZE1];
  442. UINT8 img2[XSIZE1 * YSIZE1];
  443. void save_pgm(const char *filename, UINT8 *img, int xsize, int ysize)
  444. {
  445. FILE *f;
  446. f=fopen(filename,"w");
  447. fprintf(f,"P5\n%d %d\n%d\n", xsize, ysize, 255);
  448. fwrite(img,1, xsize * ysize,f);
  449. fclose(f);
  450. }
  451. static void dump_filter(INT16 *filter)
  452. {
  453. int i, ph;
  454. for(ph=0;ph<NB_PHASES;ph++) {
  455. printf("%2d: ", ph);
  456. for(i=0;i<NB_TAPS;i++) {
  457. printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
  458. }
  459. printf("\n");
  460. }
  461. }
  462. #ifdef HAVE_MMX
  463. extern int mm_flags;
  464. #endif
  465. int main(int argc, char **argv)
  466. {
  467. int x, y, v, i, xsize, ysize;
  468. ImgReSampleContext *s;
  469. float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
  470. char buf[256];
  471. /* build test image */
  472. for(y=0;y<YSIZE;y++) {
  473. for(x=0;x<XSIZE;x++) {
  474. if (x < XSIZE/2 && y < YSIZE/2) {
  475. if (x < XSIZE/4 && y < YSIZE/4) {
  476. if ((x % 10) <= 6 &&
  477. (y % 10) <= 6)
  478. v = 0xff;
  479. else
  480. v = 0x00;
  481. } else if (x < XSIZE/4) {
  482. if (x & 1)
  483. v = 0xff;
  484. else
  485. v = 0;
  486. } else if (y < XSIZE/4) {
  487. if (y & 1)
  488. v = 0xff;
  489. else
  490. v = 0;
  491. } else {
  492. if (y < YSIZE*3/8) {
  493. if ((y+x) & 1)
  494. v = 0xff;
  495. else
  496. v = 0;
  497. } else {
  498. if (((x+3) % 4) <= 1 &&
  499. ((y+3) % 4) <= 1)
  500. v = 0xff;
  501. else
  502. v = 0x00;
  503. }
  504. }
  505. } else if (x < XSIZE/2) {
  506. v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
  507. } else if (y < XSIZE/2) {
  508. v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
  509. } else {
  510. v = ((x + y - XSIZE) * 255) / XSIZE;
  511. }
  512. img[y * XSIZE + x] = v;
  513. }
  514. }
  515. save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
  516. for(i=0;i<sizeof(factors)/sizeof(float);i++) {
  517. fact = factors[i];
  518. xsize = (int)(XSIZE * fact);
  519. ysize = (int)(YSIZE * fact);
  520. s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
  521. printf("Factor=%0.2f\n", fact);
  522. dump_filter(&s->h_filters[0][0]);
  523. component_resample(s, img1, xsize, xsize, ysize,
  524. img, XSIZE, XSIZE, YSIZE);
  525. img_resample_close(s);
  526. sprintf(buf, "/tmp/out%d.pgm", i);
  527. save_pgm(buf, img1, xsize, ysize);
  528. }
  529. /* mmx test */
  530. #ifdef HAVE_MMX
  531. printf("MMX test\n");
  532. fact = 0.72;
  533. xsize = (int)(XSIZE * fact);
  534. ysize = (int)(YSIZE * fact);
  535. mm_flags = MM_MMX;
  536. s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
  537. component_resample(s, img1, xsize, xsize, ysize,
  538. img, XSIZE, XSIZE, YSIZE);
  539. mm_flags = 0;
  540. s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
  541. component_resample(s, img2, xsize, xsize, ysize,
  542. img, XSIZE, XSIZE, YSIZE);
  543. if (memcmp(img1, img2, xsize * ysize) != 0) {
  544. fprintf(stderr, "mmx error\n");
  545. exit(1);
  546. }
  547. printf("MMX OK\n");
  548. #endif
  549. return 0;
  550. }
  551. #endif