You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

620 lines
17KB

  1. /*
  2. * High quality image resampling with polyphase filters
  3. * Copyright (c) 2001 Gerard Lantau.
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  18. */
  19. #include <stdlib.h>
  20. #include <stdio.h>
  21. #include <string.h>
  22. #include <math.h>
  23. #include "dsputil.h"
  24. #include "avcodec.h"
  25. #ifdef USE_FASTMEMCPY
  26. #include "fastmemcpy.h"
  27. #endif
  28. #define NB_COMPONENTS 3
  29. #define PHASE_BITS 4
  30. #define NB_PHASES (1 << PHASE_BITS)
  31. #define NB_TAPS 4
  32. #define FCENTER 1 /* index of the center of the filter */
  33. #define POS_FRAC_BITS 16
  34. #define POS_FRAC (1 << POS_FRAC_BITS)
  35. /* 6 bits precision is needed for MMX */
  36. #define FILTER_BITS 8
  37. #define LINE_BUF_HEIGHT (NB_TAPS * 4)
  38. struct ImgReSampleContext {
  39. int iwidth, iheight, owidth, oheight;
  40. int h_incr, v_incr;
  41. INT16 h_filters[NB_PHASES][NB_TAPS] __align8; /* horizontal filters */
  42. INT16 v_filters[NB_PHASES][NB_TAPS] __align8; /* vertical filters */
  43. UINT8 *line_buf;
  44. };
  45. static inline int get_phase(int pos)
  46. {
  47. return ((pos) >> (POS_FRAC_BITS - PHASE_BITS)) & ((1 << PHASE_BITS) - 1);
  48. }
  49. /* This function must be optimized */
  50. static void h_resample_fast(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  51. int src_start, int src_incr, INT16 *filters)
  52. {
  53. int src_pos, phase, sum, i;
  54. UINT8 *s;
  55. INT16 *filter;
  56. src_pos = src_start;
  57. for(i=0;i<dst_width;i++) {
  58. #ifdef TEST
  59. /* test */
  60. if ((src_pos >> POS_FRAC_BITS) < 0 ||
  61. (src_pos >> POS_FRAC_BITS) > (src_width - NB_TAPS))
  62. abort();
  63. #endif
  64. s = src + (src_pos >> POS_FRAC_BITS);
  65. phase = get_phase(src_pos);
  66. filter = filters + phase * NB_TAPS;
  67. #if NB_TAPS == 4
  68. sum = s[0] * filter[0] +
  69. s[1] * filter[1] +
  70. s[2] * filter[2] +
  71. s[3] * filter[3];
  72. #else
  73. {
  74. int j;
  75. sum = 0;
  76. for(j=0;j<NB_TAPS;j++)
  77. sum += s[j] * filter[j];
  78. }
  79. #endif
  80. sum = sum >> FILTER_BITS;
  81. if (sum < 0)
  82. sum = 0;
  83. else if (sum > 255)
  84. sum = 255;
  85. dst[0] = sum;
  86. src_pos += src_incr;
  87. dst++;
  88. }
  89. }
  90. /* This function must be optimized */
  91. static void v_resample(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
  92. INT16 *filter)
  93. {
  94. int sum, i;
  95. UINT8 *s;
  96. s = src;
  97. for(i=0;i<dst_width;i++) {
  98. #if NB_TAPS == 4
  99. sum = s[0 * wrap] * filter[0] +
  100. s[1 * wrap] * filter[1] +
  101. s[2 * wrap] * filter[2] +
  102. s[3 * wrap] * filter[3];
  103. #else
  104. {
  105. int j;
  106. UINT8 *s1 = s;
  107. sum = 0;
  108. for(j=0;j<NB_TAPS;j++) {
  109. sum += s1[0] * filter[j];
  110. s1 += wrap;
  111. }
  112. }
  113. #endif
  114. sum = sum >> FILTER_BITS;
  115. if (sum < 0)
  116. sum = 0;
  117. else if (sum > 255)
  118. sum = 255;
  119. dst[0] = sum;
  120. dst++;
  121. s++;
  122. }
  123. }
  124. #ifdef HAVE_MMX
  125. #include "i386/mmx.h"
  126. #define FILTER4(reg) \
  127. {\
  128. s = src + (src_pos >> POS_FRAC_BITS);\
  129. phase = get_phase(src_pos);\
  130. filter = filters + phase * NB_TAPS;\
  131. movq_m2r(*s, reg);\
  132. punpcklbw_r2r(mm7, reg);\
  133. movq_m2r(*filter, mm6);\
  134. pmaddwd_r2r(reg, mm6);\
  135. movq_r2r(mm6, reg);\
  136. psrlq_i2r(32, reg);\
  137. paddd_r2r(mm6, reg);\
  138. psrad_i2r(FILTER_BITS, reg);\
  139. src_pos += src_incr;\
  140. }
  141. #define DUMP(reg) movq_r2m(reg, tmp); printf(#reg "=%016Lx\n", tmp.uq);
  142. /* XXX: do four pixels at a time */
  143. static void h_resample_fast4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  144. int src_start, int src_incr, INT16 *filters)
  145. {
  146. int src_pos, phase;
  147. UINT8 *s;
  148. INT16 *filter;
  149. mmx_t tmp;
  150. src_pos = src_start;
  151. pxor_r2r(mm7, mm7);
  152. while (dst_width >= 4) {
  153. FILTER4(mm0);
  154. FILTER4(mm1);
  155. FILTER4(mm2);
  156. FILTER4(mm3);
  157. packuswb_r2r(mm7, mm0);
  158. packuswb_r2r(mm7, mm1);
  159. packuswb_r2r(mm7, mm3);
  160. packuswb_r2r(mm7, mm2);
  161. movq_r2m(mm0, tmp);
  162. dst[0] = tmp.ub[0];
  163. movq_r2m(mm1, tmp);
  164. dst[1] = tmp.ub[0];
  165. movq_r2m(mm2, tmp);
  166. dst[2] = tmp.ub[0];
  167. movq_r2m(mm3, tmp);
  168. dst[3] = tmp.ub[0];
  169. dst += 4;
  170. dst_width -= 4;
  171. }
  172. while (dst_width > 0) {
  173. FILTER4(mm0);
  174. packuswb_r2r(mm7, mm0);
  175. movq_r2m(mm0, tmp);
  176. dst[0] = tmp.ub[0];
  177. dst++;
  178. dst_width--;
  179. }
  180. emms();
  181. }
  182. static void v_resample4_mmx(UINT8 *dst, int dst_width, UINT8 *src, int wrap,
  183. INT16 *filter)
  184. {
  185. int sum, i, v;
  186. UINT8 *s;
  187. mmx_t tmp;
  188. mmx_t coefs[4];
  189. for(i=0;i<4;i++) {
  190. v = filter[i];
  191. coefs[i].uw[0] = v;
  192. coefs[i].uw[1] = v;
  193. coefs[i].uw[2] = v;
  194. coefs[i].uw[3] = v;
  195. }
  196. pxor_r2r(mm7, mm7);
  197. s = src;
  198. while (dst_width >= 4) {
  199. movq_m2r(s[0 * wrap], mm0);
  200. punpcklbw_r2r(mm7, mm0);
  201. movq_m2r(s[1 * wrap], mm1);
  202. punpcklbw_r2r(mm7, mm1);
  203. movq_m2r(s[2 * wrap], mm2);
  204. punpcklbw_r2r(mm7, mm2);
  205. movq_m2r(s[3 * wrap], mm3);
  206. punpcklbw_r2r(mm7, mm3);
  207. pmullw_m2r(coefs[0], mm0);
  208. pmullw_m2r(coefs[1], mm1);
  209. pmullw_m2r(coefs[2], mm2);
  210. pmullw_m2r(coefs[3], mm3);
  211. paddw_r2r(mm1, mm0);
  212. paddw_r2r(mm3, mm2);
  213. paddw_r2r(mm2, mm0);
  214. psraw_i2r(FILTER_BITS, mm0);
  215. packuswb_r2r(mm7, mm0);
  216. movq_r2m(mm0, tmp);
  217. *(UINT32 *)dst = tmp.ud[0];
  218. dst += 4;
  219. s += 4;
  220. dst_width -= 4;
  221. }
  222. while (dst_width > 0) {
  223. sum = s[0 * wrap] * filter[0] +
  224. s[1 * wrap] * filter[1] +
  225. s[2 * wrap] * filter[2] +
  226. s[3 * wrap] * filter[3];
  227. sum = sum >> FILTER_BITS;
  228. if (sum < 0)
  229. sum = 0;
  230. else if (sum > 255)
  231. sum = 255;
  232. dst[0] = sum;
  233. dst++;
  234. s++;
  235. dst_width--;
  236. }
  237. emms();
  238. }
  239. #endif
  240. /* slow version to handle limit cases. Does not need optimisation */
  241. static void h_resample_slow(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  242. int src_start, int src_incr, INT16 *filters)
  243. {
  244. int src_pos, phase, sum, j, v, i;
  245. UINT8 *s, *src_end;
  246. INT16 *filter;
  247. src_end = src + src_width;
  248. src_pos = src_start;
  249. for(i=0;i<dst_width;i++) {
  250. s = src + (src_pos >> POS_FRAC_BITS);
  251. phase = get_phase(src_pos);
  252. filter = filters + phase * NB_TAPS;
  253. sum = 0;
  254. for(j=0;j<NB_TAPS;j++) {
  255. if (s < src)
  256. v = src[0];
  257. else if (s >= src_end)
  258. v = src_end[-1];
  259. else
  260. v = s[0];
  261. sum += v * filter[j];
  262. s++;
  263. }
  264. sum = sum >> FILTER_BITS;
  265. if (sum < 0)
  266. sum = 0;
  267. else if (sum > 255)
  268. sum = 255;
  269. dst[0] = sum;
  270. src_pos += src_incr;
  271. dst++;
  272. }
  273. }
  274. static void h_resample(UINT8 *dst, int dst_width, UINT8 *src, int src_width,
  275. int src_start, int src_incr, INT16 *filters)
  276. {
  277. int n, src_end;
  278. if (src_start < 0) {
  279. n = (0 - src_start + src_incr - 1) / src_incr;
  280. h_resample_slow(dst, n, src, src_width, src_start, src_incr, filters);
  281. dst += n;
  282. dst_width -= n;
  283. src_start += n * src_incr;
  284. }
  285. src_end = src_start + dst_width * src_incr;
  286. if (src_end > ((src_width - NB_TAPS) << POS_FRAC_BITS)) {
  287. n = (((src_width - NB_TAPS + 1) << POS_FRAC_BITS) - 1 - src_start) /
  288. src_incr;
  289. } else {
  290. n = dst_width;
  291. }
  292. #ifdef HAVE_MMX
  293. if ((mm_flags & MM_MMX) && NB_TAPS == 4)
  294. h_resample_fast4_mmx(dst, n,
  295. src, src_width, src_start, src_incr, filters);
  296. else
  297. #endif
  298. h_resample_fast(dst, n,
  299. src, src_width, src_start, src_incr, filters);
  300. if (n < dst_width) {
  301. dst += n;
  302. dst_width -= n;
  303. src_start += n * src_incr;
  304. h_resample_slow(dst, dst_width,
  305. src, src_width, src_start, src_incr, filters);
  306. }
  307. }
  308. static void component_resample(ImgReSampleContext *s,
  309. UINT8 *output, int owrap, int owidth, int oheight,
  310. UINT8 *input, int iwrap, int iwidth, int iheight)
  311. {
  312. int src_y, src_y1, last_src_y, ring_y, phase_y, y1, y;
  313. UINT8 *new_line, *src_line;
  314. last_src_y = - FCENTER - 1;
  315. /* position of the bottom of the filter in the source image */
  316. src_y = (last_src_y + NB_TAPS) * POS_FRAC;
  317. ring_y = NB_TAPS; /* position in ring buffer */
  318. for(y=0;y<oheight;y++) {
  319. /* apply horizontal filter on new lines from input if needed */
  320. src_y1 = src_y >> POS_FRAC_BITS;
  321. while (last_src_y < src_y1) {
  322. if (++ring_y >= LINE_BUF_HEIGHT + NB_TAPS)
  323. ring_y = NB_TAPS;
  324. last_src_y++;
  325. /* handle limit conditions : replicate line (slighly
  326. inefficient because we filter multiple times */
  327. y1 = last_src_y;
  328. if (y1 < 0) {
  329. y1 = 0;
  330. } else if (y1 >= iheight) {
  331. y1 = iheight - 1;
  332. }
  333. src_line = input + y1 * iwrap;
  334. new_line = s->line_buf + ring_y * owidth;
  335. /* apply filter and handle limit cases correctly */
  336. h_resample(new_line, owidth,
  337. src_line, iwidth, - FCENTER * POS_FRAC, s->h_incr,
  338. &s->h_filters[0][0]);
  339. /* handle ring buffer wraping */
  340. if (ring_y >= LINE_BUF_HEIGHT) {
  341. memcpy(s->line_buf + (ring_y - LINE_BUF_HEIGHT) * owidth,
  342. new_line, owidth);
  343. }
  344. }
  345. /* apply vertical filter */
  346. phase_y = get_phase(src_y);
  347. #ifdef HAVE_MMX
  348. /* desactivated MMX because loss of precision */
  349. if ((mm_flags & MM_MMX) && NB_TAPS == 4 && 0)
  350. v_resample4_mmx(output, owidth,
  351. s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
  352. &s->v_filters[phase_y][0]);
  353. else
  354. #endif
  355. v_resample(output, owidth,
  356. s->line_buf + (ring_y - NB_TAPS + 1) * owidth, owidth,
  357. &s->v_filters[phase_y][0]);
  358. src_y += s->v_incr;
  359. output += owrap;
  360. }
  361. }
  362. /* XXX: the following filter is quite naive, but it seems to suffice
  363. for 4 taps */
  364. static void build_filter(INT16 *filter, float factor)
  365. {
  366. int ph, i, v;
  367. float x, y, tab[NB_TAPS], norm, mult;
  368. /* if upsampling, only need to interpolate, no filter */
  369. if (factor > 1.0)
  370. factor = 1.0;
  371. for(ph=0;ph<NB_PHASES;ph++) {
  372. norm = 0;
  373. for(i=0;i<NB_TAPS;i++) {
  374. x = M_PI * ((float)(i - FCENTER) - (float)ph / NB_PHASES) * factor;
  375. if (x == 0)
  376. y = 1.0;
  377. else
  378. y = sin(x) / x;
  379. tab[i] = y;
  380. norm += y;
  381. }
  382. /* normalize so that an uniform color remains the same */
  383. mult = (float)(1 << FILTER_BITS) / norm;
  384. for(i=0;i<NB_TAPS;i++) {
  385. v = (int)(tab[i] * mult);
  386. filter[ph * NB_TAPS + i] = v;
  387. }
  388. }
  389. }
  390. ImgReSampleContext *img_resample_init(int owidth, int oheight,
  391. int iwidth, int iheight)
  392. {
  393. ImgReSampleContext *s;
  394. s = av_mallocz(sizeof(ImgReSampleContext));
  395. if (!s)
  396. return NULL;
  397. s->line_buf = av_mallocz(owidth * (LINE_BUF_HEIGHT + NB_TAPS));
  398. if (!s->line_buf)
  399. goto fail;
  400. s->owidth = owidth;
  401. s->oheight = oheight;
  402. s->iwidth = iwidth;
  403. s->iheight = iheight;
  404. s->h_incr = (iwidth * POS_FRAC) / owidth;
  405. s->v_incr = (iheight * POS_FRAC) / oheight;
  406. build_filter(&s->h_filters[0][0], (float)owidth / (float)iwidth);
  407. build_filter(&s->v_filters[0][0], (float)oheight / (float)iheight);
  408. return s;
  409. fail:
  410. free(s);
  411. return NULL;
  412. }
  413. void img_resample(ImgReSampleContext *s,
  414. AVPicture *output, AVPicture *input)
  415. {
  416. int i, shift;
  417. for(i=0;i<3;i++) {
  418. shift = (i == 0) ? 0 : 1;
  419. component_resample(s, output->data[i], output->linesize[i],
  420. s->owidth >> shift, s->oheight >> shift,
  421. input->data[i], input->linesize[i],
  422. s->iwidth >> shift, s->iheight >> shift);
  423. }
  424. }
  425. void img_resample_close(ImgReSampleContext *s)
  426. {
  427. free(s->line_buf);
  428. free(s);
  429. }
  430. #ifdef TEST
  431. void *av_mallocz(int size)
  432. {
  433. void *ptr;
  434. ptr = malloc(size);
  435. memset(ptr, 0, size);
  436. return ptr;
  437. }
  438. /* input */
  439. #define XSIZE 256
  440. #define YSIZE 256
  441. UINT8 img[XSIZE * YSIZE];
  442. /* output */
  443. #define XSIZE1 512
  444. #define YSIZE1 512
  445. UINT8 img1[XSIZE1 * YSIZE1];
  446. UINT8 img2[XSIZE1 * YSIZE1];
  447. void save_pgm(const char *filename, UINT8 *img, int xsize, int ysize)
  448. {
  449. FILE *f;
  450. f=fopen(filename,"w");
  451. fprintf(f,"P5\n%d %d\n%d\n", xsize, ysize, 255);
  452. fwrite(img,1, xsize * ysize,f);
  453. fclose(f);
  454. }
  455. static void dump_filter(INT16 *filter)
  456. {
  457. int i, ph;
  458. for(ph=0;ph<NB_PHASES;ph++) {
  459. printf("%2d: ", ph);
  460. for(i=0;i<NB_TAPS;i++) {
  461. printf(" %5.2f", filter[ph * NB_TAPS + i] / 256.0);
  462. }
  463. printf("\n");
  464. }
  465. }
  466. #ifdef HAVE_MMX
  467. int mm_flags;
  468. #endif
  469. int main(int argc, char **argv)
  470. {
  471. int x, y, v, i, xsize, ysize;
  472. ImgReSampleContext *s;
  473. float fact, factors[] = { 1/2.0, 3.0/4.0, 1.0, 4.0/3.0, 16.0/9.0, 2.0 };
  474. char buf[256];
  475. /* build test image */
  476. for(y=0;y<YSIZE;y++) {
  477. for(x=0;x<XSIZE;x++) {
  478. if (x < XSIZE/2 && y < YSIZE/2) {
  479. if (x < XSIZE/4 && y < YSIZE/4) {
  480. if ((x % 10) <= 6 &&
  481. (y % 10) <= 6)
  482. v = 0xff;
  483. else
  484. v = 0x00;
  485. } else if (x < XSIZE/4) {
  486. if (x & 1)
  487. v = 0xff;
  488. else
  489. v = 0;
  490. } else if (y < XSIZE/4) {
  491. if (y & 1)
  492. v = 0xff;
  493. else
  494. v = 0;
  495. } else {
  496. if (y < YSIZE*3/8) {
  497. if ((y+x) & 1)
  498. v = 0xff;
  499. else
  500. v = 0;
  501. } else {
  502. if (((x+3) % 4) <= 1 &&
  503. ((y+3) % 4) <= 1)
  504. v = 0xff;
  505. else
  506. v = 0x00;
  507. }
  508. }
  509. } else if (x < XSIZE/2) {
  510. v = ((x - (XSIZE/2)) * 255) / (XSIZE/2);
  511. } else if (y < XSIZE/2) {
  512. v = ((y - (XSIZE/2)) * 255) / (XSIZE/2);
  513. } else {
  514. v = ((x + y - XSIZE) * 255) / XSIZE;
  515. }
  516. img[y * XSIZE + x] = v;
  517. }
  518. }
  519. save_pgm("/tmp/in.pgm", img, XSIZE, YSIZE);
  520. for(i=0;i<sizeof(factors)/sizeof(float);i++) {
  521. fact = factors[i];
  522. xsize = (int)(XSIZE * fact);
  523. ysize = (int)(YSIZE * fact);
  524. s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
  525. printf("Factor=%0.2f\n", fact);
  526. dump_filter(&s->h_filters[0][0]);
  527. component_resample(s, img1, xsize, xsize, ysize,
  528. img, XSIZE, XSIZE, YSIZE);
  529. img_resample_close(s);
  530. sprintf(buf, "/tmp/out%d.pgm", i);
  531. save_pgm(buf, img1, xsize, ysize);
  532. }
  533. /* mmx test */
  534. #ifdef HAVE_MMX
  535. printf("MMX test\n");
  536. fact = 0.72;
  537. xsize = (int)(XSIZE * fact);
  538. ysize = (int)(YSIZE * fact);
  539. mm_flags = MM_MMX;
  540. s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
  541. component_resample(s, img1, xsize, xsize, ysize,
  542. img, XSIZE, XSIZE, YSIZE);
  543. mm_flags = 0;
  544. s = img_resample_init(xsize, ysize, XSIZE, YSIZE);
  545. component_resample(s, img2, xsize, xsize, ysize,
  546. img, XSIZE, XSIZE, YSIZE);
  547. if (memcmp(img1, img2, xsize * ysize) != 0) {
  548. fprintf(stderr, "mmx error\n");
  549. exit(1);
  550. }
  551. printf("MMX OK\n");
  552. #endif
  553. return 0;
  554. }
  555. #endif