Add the slow but accurate integer DCT from IJG (this should be OK with the LGPL, as the old DCT is the fast integer DCT from IJG).

Per-context DCT selection. Originally committed as revision 878 to svn://svn.ffmpeg.org/ffmpeg/trunk
23 years ago · 28db7fce02
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -11,7 +11,7 @@ CFLAGS= $(OPTFLAGS) -Wall -g -DHAVE_AV_CONFIG_H -I.. -D_FILE_OFFSET_BITS=64 -D_L
 LDFLAGS= -g
 OBJS= common.o utils.o mem.o allcodecs.o \
      mpegvideo.o h263.o jrevdct.o jfdctfst.o \
      mpegvideo.o h263.o jrevdct.o jfdctfst.o jfdctint.o\
      mpegaudio.o ac3enc.o mjpeg.o resample.o dsputil.o \
      motion_est.o imgconvert.o imgresample.o msmpeg4.o \
      mpeg12.o h263dec.o svq1.o rv10.o mpegaudiodec.o pcm.o simple_idct.o \
--- a/libavcodec/avcodec.h
+++ b/libavcodec/avcodec.h
@@ -5,8 +5,8 @@
 #define LIBAVCODEC_VERSION_INT 0x000406
 #define LIBAVCODEC_VERSION     "0.4.6"
 #define LIBAVCODEC_BUILD       4620
 #define LIBAVCODEC_BUILD_STR   "4620"
 #define LIBAVCODEC_BUILD       4621
 #define LIBAVCODEC_BUILD_STR   "4621"
 enum CodecID {
    CODEC_ID_NONE, 
@@ -96,6 +96,7 @@ extern int motion_estimation_method;
 static const int Motion_Est_QTab[] = { ME_ZERO, ME_PHODS, ME_LOG, 
                                       ME_X1, ME_EPZS, ME_FULL };
 #define FF_MAX_B_FRAMES 4
 /* encoding support */
@@ -308,6 +309,12 @@ typedef struct AVCodecContext {
    int aspected_width;
    int aspected_height;
    int dct_algo;
 #define FF_DCT_AUTO 0
 #define FF_DCT_FASTINT 1
 #define FF_DCT_INT 2
 #define FF_DCT_MMX 3
    //FIXME this should be reordered after kabis API is finished ...
    //TODO kill kabi
    /*
@@ -338,7 +345,7 @@ typedef struct AVCodecContext {
 	    uc_res6,uc_res7,uc_res8,uc_res9,uc_res10,uc_res11,uc_res12;
    unsigned int
 	    ui_res0,ui_res1,ui_res2,ui_res3,ui_res4,ui_res5,ui_res6,ui_res7,ui_res8,ui_res9,
 	    ui_res10,ui_res11,ui_res12,ui_res13,ui_res14,ui_res15,ui_res16,ui_res17;
 	    ui_res10,ui_res11,ui_res12,ui_res13,ui_res14,ui_res15,ui_res16;
 } AVCodecContext;
 typedef struct AVCodec {
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -25,7 +25,6 @@
 void (*ff_idct)(DCTELEM *block);
 void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
 void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
 void (*av_fdct)(DCTELEM *block);
 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
@@ -1323,7 +1322,6 @@ void dsputil_init(void)
    pix_abs8x8_x2  = pix_abs8x8_x2_c;
    pix_abs8x8_y2  = pix_abs8x8_y2_c;
    pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
    av_fdct = fdct_ifast;
    use_permuted_idct = 1;
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -27,13 +27,12 @@
 typedef short DCTELEM;
 void fdct_ifast (DCTELEM *data);
 void ff_jpeg_fdct_islow (DCTELEM *data);
 void j_rev_dct (DCTELEM *data);
 void fdct_mmx(DCTELEM *block);
 extern void (*av_fdct)(DCTELEM *block);
 /* encoding scans */
 extern UINT8 ff_alternate_horizontal_scan[64];
 extern UINT8 ff_alternate_vertical_scan[64];
--- a/libavcodec/i386/dsputil_mmx.c
+++ b/libavcodec/i386/dsputil_mmx.c
@@ -424,8 +424,6 @@ void dsputil_init_mmx(void)
        pix_abs8x8_y2 = pix_abs8x8_y2_mmx;
        pix_abs8x8_xy2= pix_abs8x8_xy2_mmx;
        av_fdct = fdct_mmx;
        put_pixels_tab[0] = put_pixels_mmx;
        put_pixels_tab[1] = put_pixels_x2_mmx;
        put_pixels_tab[2] = put_pixels_y2_mmx;
--- a/libavcodec/i386/mpegvideo_mmx.c
+++ b/libavcodec/i386/mpegvideo_mmx.c
@@ -552,16 +552,21 @@ void unused_var_warning_killer(){
 void MPV_common_init_mmx(MpegEncContext *s)
 {
    if (mm_flags & MM_MMX) {
        const int dct_algo= s->avctx->dct_algo;
        s->dct_unquantize_h263 = dct_unquantize_h263_mmx;
        s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_mmx;
        s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_mmx;
        draw_edges = draw_edges_mmx;
        if(mm_flags & MM_MMXEXT){
            dct_quantize= dct_quantize_MMX2;
        } else {
            dct_quantize= dct_quantize_MMX;
        if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){
            s->fdct = fdct_mmx;
            if(mm_flags & MM_MMXEXT){
                s->dct_quantize= dct_quantize_MMX2;
            } else {
                s->dct_quantize= dct_quantize_MMX;
            }
        }
    }
 }
--- a/libavcodec/i386/mpegvideo_mmx_template.c
+++ b/libavcodec/i386/mpegvideo_mmx_template.c
@@ -40,7 +40,8 @@ static int RENAME(dct_quantize)(MpegEncContext *s,
    const UINT16 *qmat, *bias;
    static __align8 INT16 temp_block[64];
    av_fdct (block);
    //s->fdct (block);
    fdct_mmx (block); //cant be anything else ...
    if (s->mb_intra) {
        int dummy;
--- a/libavcodec/jfdctint.c
+++ b/libavcodec/jfdctint.c
@@ -0,0 +1,290 @@
 /*
 * jfdctint.c
 *
 * Copyright (C) 1991-1996, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying README file.
 *
 * This file contains a slow-but-accurate integer implementation of the
 * forward DCT (Discrete Cosine Transform).
 *
 * A 2-D DCT can be done by 1-D DCT on each row followed by 1-D DCT
 * on each column.  Direct algorithms are also available, but they are
 * much more complex and seem not to be any faster when reduced to code.
 *
 * This implementation is based on an algorithm described in
 *   C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
 *   Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
 *   Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
 * The primary algorithm described there uses 11 multiplies and 29 adds.
 * We use their alternate method with 12 multiplies and 32 adds.
 * The advantage of this method is that no data path contains more than one
 * multiplication; this allows a very simple and accurate implementation in
 * scaled fixed-point arithmetic, with a minimal number of shifts.
 */
 #include <stdlib.h>
 #include <stdio.h>
 #include "common.h"
 #include "dsputil.h"
 /* Configuration glue that the IJG headers would normally supply. */
 #define SHIFT_TEMPS            /* no temporaries are needed for RIGHT_SHIFT */
 #define DCTSIZE 8              /* the transform works on 8x8 blocks only */
 /*
 * Sample precision in bits.  Later #if directives in this file test
 * BITS_IN_JSAMPLE == 8 to pick PASS1_BITS and the MULTIPLY flavour; an
 * undefined macro evaluates as 0 inside #if, which would silently select
 * the reduced-precision fallback branches, so define it explicitly.
 */
 #ifndef BITS_IN_JSAMPLE
 #define BITS_IN_JSAMPLE 8
 #endif
 #define GLOBAL(x) x
 #define RIGHT_SHIFT(x, n) ((x) >> (n))
 /*
 * 16x16->32 bit multiply used by MULTIPLY() when BITS_IN_JSAMPLE == 8.
 * A plain C multiplication is always a valid implementation.
 */
 #ifndef MULTIPLY16C16
 #define MULTIPLY16C16(var,const)  ((var) * (const))
 #endif
 /* DESCALE: divide by 2**n, rounding to nearest (first variant). */
 #if 1 //def USE_ACCURATE_ROUNDING
 #define DESCALE(x,n)  RIGHT_SHIFT((x) + (1 << ((n) - 1)), n)
 #else
 #define DESCALE(x,n)  RIGHT_SHIFT(x, n)
 #endif
 /*
 * This module is specialized to the case DCTSIZE = 8.
 */
 #if DCTSIZE != 8
 #error "Sorry, this code only copes with 8x8 DCTs."
 #endif
 /*
 * The poop on this scaling stuff is as follows:
 *
 * Each 1-D DCT step produces outputs which are a factor of sqrt(N)
 * larger than the true DCT outputs.  The final outputs are therefore
 * a factor of N larger than desired; since N=8 this can be cured by
 * a simple right shift at the end of the algorithm.  The advantage of
 * this arrangement is that we save two multiplications per 1-D DCT,
 * because the y0 and y4 outputs need not be divided by sqrt(N).
 * In the IJG code, this factor of 8 is removed by the quantization step
 * (in jcdctmgr.c), NOT in this module.
 *
 * We have to do addition and subtraction of the integer inputs, which
 * is no problem, and multiplication by fractional constants, which is
 * a problem to do in integer arithmetic.  We multiply all the constants
 * by CONST_SCALE and convert them to integer constants (thus retaining
 * CONST_BITS bits of precision in the constants).  After doing a
 * multiplication we have to divide the product by CONST_SCALE, with proper
 * rounding, to produce the correct output.  This division can be done
 * cheaply as a right shift of CONST_BITS bits.  We postpone shifting
 * as long as possible so that partial sums can be added together with
 * full fractional precision.
 *
 * The outputs of the first pass are scaled up by PASS1_BITS bits so that
 * they are represented to better-than-integral precision.  These outputs
 * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
 * with the recommended scaling.  (For 12-bit sample data, the intermediate
 * array is INT32 anyway.)
 *
 * To avoid overflow of the 32-bit intermediate results in pass 2, we must
 * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26.  Error analysis
 * shows that the values given below are the most effective.
 */
 /*
 * CONST_BITS: fractional bits kept in the FIX_* multiplier constants.
 * PASS1_BITS: extra fractional bits carried by the row-pass outputs.
 * Note that an undefined BITS_IN_JSAMPLE evaluates as 0 in #if and thus
 * selects the #else branch (PASS1_BITS == 1).
 */
 #if BITS_IN_JSAMPLE == 8
 #define CONST_BITS  13
 #define PASS1_BITS  2
 #else
 #define CONST_BITS  13
 #define PASS1_BITS  1		/* lose a little precision to avoid overflow */
 #endif
 /* Some C compilers fail to reduce "FIX(constant)" at compile time, thus
 * causing a lot of useless floating-point operations at run time.
 * To get around this we use the following pre-calculated constants.
 * If you change CONST_BITS you may want to add appropriate values.
 * (With a reasonable C compiler, you can just rely on the FIX() macro...)
 */
 #if CONST_BITS == 13
 /* Pre-computed values of FIX(x) = round(x * 2**13). */
 #define FIX_0_298631336  ((INT32)  2446)	/* FIX(0.298631336) */
 #define FIX_0_390180644  ((INT32)  3196)	/* FIX(0.390180644) */
 #define FIX_0_541196100  ((INT32)  4433)	/* FIX(0.541196100) */
 #define FIX_0_765366865  ((INT32)  6270)	/* FIX(0.765366865) */
 #define FIX_0_899976223  ((INT32)  7373)	/* FIX(0.899976223) */
 #define FIX_1_175875602  ((INT32)  9633)	/* FIX(1.175875602) */
 #define FIX_1_501321110  ((INT32)  12299)	/* FIX(1.501321110) */
 #define FIX_1_847759065  ((INT32)  15137)	/* FIX(1.847759065) */
 #define FIX_1_961570560  ((INT32)  16069)	/* FIX(1.961570560) */
 #define FIX_2_053119869  ((INT32)  16819)	/* FIX(2.053119869) */
 #define FIX_2_562915447  ((INT32)  20995)	/* FIX(2.562915447) */
 #define FIX_3_072711026  ((INT32)  25172)	/* FIX(3.072711026) */
 #else
 /*
 * Generic case: derive the constants from CONST_BITS.  FIX() converts a
 * positive real constant to fixed point with CONST_BITS fractional bits,
 * rounding to nearest.  It is only defined here if nothing else provides it,
 * so that changing CONST_BITS does not break the build.
 */
 #ifndef CONST_SCALE
 #define CONST_SCALE  (((INT32) 1) << CONST_BITS)
 #endif
 #ifndef FIX
 #define FIX(x)  ((INT32) ((x) * CONST_SCALE + 0.5))
 #endif
 #define FIX_0_298631336  FIX(0.298631336)
 #define FIX_0_390180644  FIX(0.390180644)
 #define FIX_0_541196100  FIX(0.541196100)
 #define FIX_0_765366865  FIX(0.765366865)
 #define FIX_0_899976223  FIX(0.899976223)
 #define FIX_1_175875602  FIX(1.175875602)
 #define FIX_1_501321110  FIX(1.501321110)
 #define FIX_1_847759065  FIX(1.847759065)
 #define FIX_1_961570560  FIX(1.961570560)
 #define FIX_2_053119869  FIX(2.053119869)
 #define FIX_2_562915447  FIX(2.562915447)
 #define FIX_3_072711026  FIX(3.072711026)
 #endif
 /* Multiply an INT32 variable by an INT32 constant to yield an INT32 result.
 * For 8-bit samples with the recommended scaling, all the variable
 * and constant values involved are no more than 16 bits wide, so a
 * 16x16->32 bit multiply can be used instead of a full 32x32 multiply.
 * For 12-bit samples, a full 32-bit multiplication will be needed.
 *
 * The 8-bit branch delegates to MULTIPLY16C16(); every other case,
 * including an undefined BITS_IN_JSAMPLE (which evaluates as 0 in #if),
 * uses a plain C multiplication.
 */
 #if BITS_IN_JSAMPLE == 8
 #define MULTIPLY(var,const)  MULTIPLY16C16(var,const)
 #else
 #define MULTIPLY(var,const)  ((var) * (const))
 #endif
 /*
 * Perform the forward DCT on one block of samples.
 *
 * data points to DCTSIZE*DCTSIZE (8x8) coefficients stored row by row
 * (element [row][col] lives at data[row*DCTSIZE + col]).  The block is
 * transformed in place: pass 1 runs a 1-D DCT over each row, pass 2 over
 * each column.  As explained in the scaling notes in this file, the final
 * outputs are left scaled up by an overall factor of 8; the caller is
 * expected to remove that factor (e.g. in its quantization step).
 *
 * Nothing is returned and no state other than *data is modified.
 */
 GLOBAL(void)
 ff_jpeg_fdct_islow (DCTELEM * data)
 {
  INT32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
  INT32 tmp10, tmp11, tmp12, tmp13;
  INT32 z1, z2, z3, z4, z5;
  DCTELEM *dataptr;
  int ctr;
  SHIFT_TEMPS

  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true DCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Butterfly stage: sums feed the even part, differences the odd part. */
    tmp0 = dataptr[0] + dataptr[7];
    tmp7 = dataptr[0] - dataptr[7];
    tmp1 = dataptr[1] + dataptr[6];
    tmp6 = dataptr[1] - dataptr[6];
    tmp2 = dataptr[2] + dataptr[5];
    tmp5 = dataptr[2] - dataptr[5];
    tmp3 = dataptr[3] + dataptr[4];
    tmp4 = dataptr[3] - dataptr[4];

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    /* Scale by 2**PASS1_BITS with a multiplication rather than "<<":
     * the sums may be negative, and left-shifting a negative value is
     * undefined behaviour in C.
     */
    dataptr[0] = (DCTELEM) ((tmp10 + tmp11) * (1 << PASS1_BITS));
    dataptr[4] = (DCTELEM) ((tmp10 - tmp11) * (1 << PASS1_BITS));

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                   CONST_BITS-PASS1_BITS);
    dataptr[6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                   CONST_BITS-PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents cos(K*pi/16).
     * i0..i3 in the paper are tmp4..tmp7 here.
     */

    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    dataptr[7] = (DCTELEM) DESCALE(tmp4 + z1 + z3, CONST_BITS-PASS1_BITS);
    dataptr[5] = (DCTELEM) DESCALE(tmp5 + z2 + z4, CONST_BITS-PASS1_BITS);
    dataptr[3] = (DCTELEM) DESCALE(tmp6 + z2 + z3, CONST_BITS-PASS1_BITS);
    dataptr[1] = (DCTELEM) DESCALE(tmp7 + z1 + z4, CONST_BITS-PASS1_BITS);

    dataptr += DCTSIZE;		/* advance pointer to next row */
  }

  /* Pass 2: process columns.
   * We remove the PASS1_BITS scaling, but leave the results scaled up
   * by an overall factor of 8.
   */

  dataptr = data;
  for (ctr = DCTSIZE-1; ctr >= 0; ctr--) {
    /* Same butterfly as pass 1, walking down a column (stride DCTSIZE). */
    tmp0 = dataptr[DCTSIZE*0] + dataptr[DCTSIZE*7];
    tmp7 = dataptr[DCTSIZE*0] - dataptr[DCTSIZE*7];
    tmp1 = dataptr[DCTSIZE*1] + dataptr[DCTSIZE*6];
    tmp6 = dataptr[DCTSIZE*1] - dataptr[DCTSIZE*6];
    tmp2 = dataptr[DCTSIZE*2] + dataptr[DCTSIZE*5];
    tmp5 = dataptr[DCTSIZE*2] - dataptr[DCTSIZE*5];
    tmp3 = dataptr[DCTSIZE*3] + dataptr[DCTSIZE*4];
    tmp4 = dataptr[DCTSIZE*3] - dataptr[DCTSIZE*4];

    /* Even part per LL&M figure 1 --- note that published figure is faulty;
     * rotator "sqrt(2)*c1" should be "sqrt(2)*c6".
     */

    tmp10 = tmp0 + tmp3;
    tmp13 = tmp0 - tmp3;
    tmp11 = tmp1 + tmp2;
    tmp12 = tmp1 - tmp2;

    dataptr[DCTSIZE*0] = (DCTELEM) DESCALE(tmp10 + tmp11, PASS1_BITS);
    dataptr[DCTSIZE*4] = (DCTELEM) DESCALE(tmp10 - tmp11, PASS1_BITS);

    z1 = MULTIPLY(tmp12 + tmp13, FIX_0_541196100);
    dataptr[DCTSIZE*2] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp13, FIX_0_765366865),
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*6] = (DCTELEM) DESCALE(z1 + MULTIPLY(tmp12, - FIX_1_847759065),
                                           CONST_BITS+PASS1_BITS);

    /* Odd part per figure 8 --- note paper omits factor of sqrt(2).
     * cK represents cos(K*pi/16).
     * i0..i3 in the paper are tmp4..tmp7 here.
     */

    z1 = tmp4 + tmp7;
    z2 = tmp5 + tmp6;
    z3 = tmp4 + tmp6;
    z4 = tmp5 + tmp7;
    z5 = MULTIPLY(z3 + z4, FIX_1_175875602); /* sqrt(2) * c3 */

    tmp4 = MULTIPLY(tmp4, FIX_0_298631336); /* sqrt(2) * (-c1+c3+c5-c7) */
    tmp5 = MULTIPLY(tmp5, FIX_2_053119869); /* sqrt(2) * ( c1+c3-c5+c7) */
    tmp6 = MULTIPLY(tmp6, FIX_3_072711026); /* sqrt(2) * ( c1+c3+c5-c7) */
    tmp7 = MULTIPLY(tmp7, FIX_1_501321110); /* sqrt(2) * ( c1+c3-c5-c7) */
    z1 = MULTIPLY(z1, - FIX_0_899976223); /* sqrt(2) * (c7-c3) */
    z2 = MULTIPLY(z2, - FIX_2_562915447); /* sqrt(2) * (-c1-c3) */
    z3 = MULTIPLY(z3, - FIX_1_961570560); /* sqrt(2) * (-c3-c5) */
    z4 = MULTIPLY(z4, - FIX_0_390180644); /* sqrt(2) * (c5-c3) */

    z3 += z5;
    z4 += z5;

    dataptr[DCTSIZE*7] = (DCTELEM) DESCALE(tmp4 + z1 + z3,
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*5] = (DCTELEM) DESCALE(tmp5 + z2 + z4,
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*3] = (DCTELEM) DESCALE(tmp6 + z2 + z3,
                                           CONST_BITS+PASS1_BITS);
    dataptr[DCTSIZE*1] = (DCTELEM) DESCALE(tmp7 + z1 + z4,
                                           CONST_BITS+PASS1_BITS);

    dataptr++;			/* advance pointer to next column */
  }
 }
--- a/libavcodec/mpegvideo.c
+++ b/libavcodec/mpegvideo.c
@@ -36,7 +36,6 @@ static void dct_unquantize_h263_c(MpegEncContext *s,
 static void draw_edges_c(UINT8 *buf, int wrap, int width, int height, int w);
 static int dct_quantize_c(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
 int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow)= dct_quantize_c;
 void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w)= draw_edges_c;
 static void emulated_edge_mc(MpegEncContext *s, UINT8 *src, int linesize, int block_w, int block_h, 
                                    int src_x, int src_y, int w, int h);
@@ -76,14 +75,25 @@ extern UINT8 zigzag_end[64];
 /* default motion estimation */
 int motion_estimation_method = ME_EPZS;
 static void convert_matrix(int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64],
 static void convert_matrix(MpegEncContext *s, int (*qmat)[64], uint16_t (*qmat16)[64], uint16_t (*qmat16_bias)[64],
                           const UINT16 *quant_matrix, int bias)
 {
    int qscale;
    for(qscale=1; qscale<32; qscale++){
        int i;
        if (av_fdct == fdct_ifast) {
        if (s->fdct == ff_jpeg_fdct_islow) {
            for(i=0;i<64;i++) {
                const int j= block_permute_op(i);
                /* 16 <= qscale * quant_matrix[i] <= 7905 */
                /* 19952         <= aanscales[i] * qscale * quant_matrix[i]           <= 249205026 */
                /* (1<<36)/19952 >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= (1<<36)/249205026 */
                /* 3444240       >= (1<<36)/(aanscales[i] * qscale * quant_matrix[i]) >= 275 */
                qmat[qscale][j] = (int)((UINT64_C(1) << (QMAT_SHIFT-3)) / 
                                (qscale * quant_matrix[j]));
            }
        } else if (s->fdct == fdct_ifast) {
            for(i=0;i<64;i++) {
                const int j= block_permute_op(i);
                /* 16 <= qscale * quant_matrix[i] <= 7905 */
@@ -130,6 +140,12 @@ int MPV_common_init(MpegEncContext *s)
    s->dct_unquantize_h263 = dct_unquantize_h263_c;
    s->dct_unquantize_mpeg1 = dct_unquantize_mpeg1_c;
    s->dct_unquantize_mpeg2 = dct_unquantize_mpeg2_c;
    s->dct_quantize= dct_quantize_c;
    if(s->avctx->dct_algo==FF_DCT_FASTINT)
        s->fdct = fdct_ifast;
    else
        s->fdct = ff_jpeg_fdct_islow;
 #ifdef HAVE_MMX
    MPV_common_init_mmx(s);
@@ -563,9 +579,9 @@ int MPV_encode_init(AVCodecContext *avctx)
    /* precompute matrix */
    /* for mjpeg, we do include qscale in the matrix */
    if (s->out_format != FMT_MJPEG) {
        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, s->q_intra_matrix16_bias, 
        convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16, s->q_intra_matrix16_bias, 
                       s->intra_matrix, s->intra_quant_bias);
        convert_matrix(s->q_inter_matrix, s->q_inter_matrix16, s->q_inter_matrix16_bias, 
        convert_matrix(s, s->q_inter_matrix, s->q_inter_matrix16, s->q_inter_matrix16_bias, 
                       s->inter_matrix, s->inter_quant_bias);
    }
@@ -1812,14 +1828,14 @@ static void encode_mb(MpegEncContext *s, int motion_x, int motion_y)
    if(s->out_format==FMT_MJPEG){
        for(i=0;i<6;i++) {
            int overflow;
            s->block_last_index[i] = dct_quantize(s, s->block[i], i, 8, &overflow);
            s->block_last_index[i] = s->dct_quantize(s, s->block[i], i, 8, &overflow);
            if (overflow) clip_coeffs(s, s->block[i], s->block_last_index[i]);
        }
    }else{
        for(i=0;i<6;i++) {
            if(!skip_dct[i]){
                int overflow;
                s->block_last_index[i] = dct_quantize(s, s->block[i], i, s->qscale, &overflow);
                s->block_last_index[i] = s->dct_quantize(s, s->block[i], i, s->qscale, &overflow);
            // FIXME we could decide to change to quantizer instead of clipping
            // JS: I don't think that would be a good idea it could lower quality instead
            //     of improve it. Just INTRADC clipping deserves changes in quantizer
@@ -2081,7 +2097,7 @@ static void encode_picture(MpegEncContext *s, int picture_number)
        s->intra_matrix[0] = ff_mpeg1_default_intra_matrix[0];
        for(i=1;i<64;i++)
            s->intra_matrix[i] = CLAMP_TO_8BIT((ff_mpeg1_default_intra_matrix[i] * s->qscale) >> 3);
        convert_matrix(s->q_intra_matrix, s->q_intra_matrix16, 
        convert_matrix(s, s->q_intra_matrix, s->q_intra_matrix16, 
                       s->q_intra_matrix16_bias, s->intra_matrix, s->intra_quant_bias);
    }
@@ -2446,7 +2462,7 @@ static int dct_quantize_c(MpegEncContext *s,
    int max=0;
    unsigned int threshold1, threshold2;
    av_fdct (block);
    s->fdct (block);
    /* we need this permutation so that we correct the IDCT
       permutation. will be moved into DCT code */
--- a/libavcodec/mpegvideo.h
+++ b/libavcodec/mpegvideo.h
@@ -465,6 +465,8 @@ typedef struct MpegEncContext {
                           DCTELEM *block, int n, int qscale);
    void (*dct_unquantize)(struct MpegEncContext *s, // unquantizer to use (mpeg4 can use both)
                           DCTELEM *block, int n, int qscale);
    int (*dct_quantize)(struct MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
    void (*fdct)(DCTELEM *block);
 } MpegEncContext;
 int MPV_common_init(MpegEncContext *s);
@@ -478,7 +480,6 @@ void MPV_common_init_mmx(MpegEncContext *s);
 #ifdef ARCH_ALPHA
 void MPV_common_init_axp(MpegEncContext *s);
 #endif
 extern int (*dct_quantize)(MpegEncContext *s, DCTELEM *block, int n, int qscale, int *overflow);
 extern void (*draw_edges)(UINT8 *buf, int wrap, int width, int height, int w);
 void ff_conceal_past_errors(MpegEncContext *s, int conceal_all);
 void ff_copy_bits(PutBitContext *pb, UINT8 *src, int length);