Merge commit '880e2aa23645ed9871c66ee1cbd00f93c72d2d73'

* commit '880e2aa23645ed9871c66ee1cbd00f93c72d2d73': Remove all Blackfin architecture optimizations Conflicts: libavcodec/bfin/dsputil.S libavcodec/bfin/dsputil_init.c libavcodec/bfin/fdct_bfin.S libavcodec/bfin/hpel_pixels_no_rnd.S libavcodec/bfin/hpeldsp_init.c libavcodec/bfin/idct_bfin.S libavcodec/bfin/mathops.h libavcodec/bfin/pixels.S libavcodec/bfin/pixels.h libavcodec/bfin/vp3dsp.S libavcodec/bfin/vp3dsp_init.c libavutil/bfin/asm.h libavutil/bfin/attributes.h libswscale/bfin/internal_bfin.S libswscale/bfin/swscale_bfin.c libswscale/bfin/yuv2rgb_bfin.c libswscale/swscale_internal.h libswscale/version.h If someone wants to maintain blackfin support in FFmpeg, please contact ffmpeg-devel@ffmpeg.org Merged-by: Michael Niedermayer <michaelni@gmx.at>
11 years ago · b55d3bbeed
--- a/libavcodec/bfin/Makefile
+++ b/libavcodec/bfin/Makefile
@@ -1,10 +0,0 @@
 # Blackfin-specific objects, appended to the build per enabled component.
 # bfin/pixels.o is listed under both CONFIG_DSPUTIL and CONFIG_HPELDSP.
 OBJS-$(CONFIG_DSPUTIL)                  += bfin/dsputil_init.o          \
                                           bfin/dsputil.o               \
                                           bfin/fdct_bfin.o             \
                                           bfin/idct_bfin.o             \
                                           bfin/pixels.o
 OBJS-$(CONFIG_HPELDSP)                  += bfin/hpeldsp_init.o          \
                                           bfin/hpel_pixels_no_rnd.o    \
                                           bfin/pixels.o
 OBJS-$(CONFIG_VP3DSP)                   += bfin/vp3dsp_init.o           \
                                           bfin/vp3dsp.o
--- a/libavcodec/bfin/dsputil.S
+++ b/libavcodec/bfin/dsputil.S
@@ -1,382 +0,0 @@
 /*
 * Blackfin Pixel Operations
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/bfin/asm.h"

 /*
 * put_pixels_clamped: clamp 16-bit values to 0..255 and store them as bytes.
 *
 * In:  R0 = block (read as 32-bit words holding two 16-bit values each),
 *      R1 = dest, R2 = line_size.
 * R4 is the lower bound (0) and R5 the per-halfword upper bound (0x00ff).
 * The hardware loop runs 8 times. Each pass packs two groups of four clamped
 * values into two 32-bit words (8 output bytes) and then moves dest on by
 * line_size: 4 bytes from the first store plus M1 = line_size - 4.
 * The loop is software pipelined: the first MAX is issued before the loop and
 * the loads for the next group are issued alongside the current arithmetic,
 * so the final pass fetches two words that are never consumed.
 */
 DEFUN(put_pixels_clamped,mL1,
        (int16_t *block, uint8_t *dest, int line_size)):
    [--SP] = (R7:4);
    R4 = 0;
    R5.l = 0x00ff;
    R5.h = 0x00ff;
    I0 = R0;         // block
    I1 = R1;         // dest
    R2 += -4;        // line_size - 4 (the first store of a row already adds 4)
    M1 = R2;
    P0 = 8;          // loop count
    R0 = [I0++];
    R1 = [I0++];
    R2 = MAX(R0, R4) (V);
    LSETUP (ppc$0,ppc$1) LC0=P0;
 ppc$0: R2 = MIN(R2, R5) (V);
       R3 = MAX(R1, R4) (V);
       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
       R2 = MAX(R0, R4) (V)      || [I1++] = R6;
       R2 = MIN(R2, R5) (V);
       R3 = MAX(R1, R4) (V);
       R3 = MIN(R3, R5) (V)      || R0 = [I0++];
       R6 = BYTEPACK (R2,R3)     || R1 = [I0++];
 ppc$1: R2 = Max(R0, R4) (V)      || [I1++M1] = R6;

    (R7:4) = [SP++];
    RTS;
 DEFUN_END(put_pixels_clamped)

 /*
 * add_pixels_clamped: combine the 16-bit values in block with the bytes
 * already at dest (BYTEOP3P) and write the result back to dest.
 *
 * In:  R0 = block, R1 = dest, R2 = line_size.
 * I3 walks block, I1 reads dest and I2 writes dest. M0 = line_size - 4, so
 * the two 4-byte stores of one pass advance I2 by line_size in total.
 * The hardware loop runs 8 times and stores 8 bytes per pass.
 * NOTE(review): I0 is set to 0 before the BYTEOP3P sequence; presumably this
 * pins the byte-alignment selection used by that instruction -- confirm
 * against the Blackfin instruction set reference.
 */
 DEFUN(add_pixels_clamped,mL1,
        (int16_t *block, uint8_t *dest, int line_size)):
    [-- SP] = (R7:4);
    R4 = 0;
    I0 = 0;
    R2 += -4;        // line_size - 4
    M0 = R2;
    I1 = R1;         // dest (read pointer)
    I3 = R0;         // block
    I2 = R1;         // dest (write pointer)
    P0 = 8;          // loop count
    M3 = 2;
    R0 = [I3++]  || R2 = [I1];
    R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
    R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
    R6 = BYTEOP3P(R1:0, R3:2) (LO)    || R1.H = W[I3++]  || R2 = [I1];

    LSETUP(apc$2,apc$3) LC1 = P0;
 apc$2: R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R3 = [I1++M0];
       R2 = R2 << 8                      || R0.H = W[I3--];
       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
       R6 = R6 + R7 (S)                  || R1.H = W[I3];
       R6 = BYTEOP3P(R1:0, R3:2) (LO)    || I3+=M3          || [I2++]=R6;
       R7 = BYTEOP3P(R1:0, R3:2) (HI, R) || R0 = [I3++]     || R2 = [I1];
       R2 = R2 << 8                      || R0.H = W[I3--]  || R3 = [I1++];
       R3 = R3 >> 8                      || R1.L = W[I3]    || I3 += 4;
       R6 = R6 + R7 (S)                  || R1.H = W[I3++];
 apc$3: R6 = BYTEOP3P(R1:0, R3:2) (LO)    || [I2++M0] = R6   || R2 = [I1];

    (R7:4) = [SP++];
    RTS;
 DEFUN_END(add_pixels_clamped)

 /*
 * diff_pixels: write the per-byte differences of s1 and s2 to block as
 * 16-bit values (BYTEOP16M).
 *
 * In:  R0 = block, R1 = s1, R2 = s2, stride = fourth argument, read from
 *      [fp+20] after "link 0".
 * The hardware loop runs 8 times. Each pass issues two BYTEOP16M operations
 * and stores four 32-bit words (R4..R7) to block, then both source pointers
 * move on to the next row (8 bytes of loads plus M0 = stride - 8).
 * All source loads are paired with DISALGNEXCPT or a BYTEOP instruction.
 */
 DEFUN(diff_pixels,mL1,
       (int16_t *block, uint8_t *s1, uint8_t *s2, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;         // loop count
        i3=r0;        // block
        i0=r1;        // s1
        i1=r2;        // s2
        r2=[fp+20];   // stride
        r2+=-8;
        m0=r2;        // stride - 8


        LSETUP(.LS0,.LE0) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

 .LS0:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || [I3++] = R4;
        DISALGNEXCPT                       || R2 = [I1++]   || [I3++] = R5;
        [i3++]=r6;
 .LE0:  [i3++]=r7;

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(diff_pixels)

 /*
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++) {
          sum += pix[j];
        }
        pix += line_size;
    }
 */
 /*
 * pix_sum: sum of the bytes of a 16-pixel-wide, 16-row area.
 *
 * In:  R0 = p, R1 = stride.  Out: R0 = sum (low 16 bits, high half cleared).
 * Two rows are handled per pass: I0 walks the even rows and I1 the odd rows
 * (I1 starts at p + stride), so the hardware loop runs 8 times and both
 * pointers skip ahead by M0 = 2*stride - 16 after their 16 bytes.
 * BYTEOP16P results are accumulated in the two 16-bit halves of R6, which are
 * added together at the end.
 */
 DEFUN(pix_sum,mL1,
        (uint8_t *p, int stride)):
        link 0;
        [--sp] = (r7:4);
        p0=8;         // loop count: two rows per pass
        i0=r0;        // even rows
        i1=r0;        // odd rows (advanced by one stride below)
        m1=r1;        // stride
        r1=r1+r1;
        r1+=-16;       // 2*stride - 16
        m0=r1;
        i1+=m1;

        r6=0;

        LSETUP(LS$PS,LE$PS) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

 LS$PS:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R1 = [I0++]   || R3 = [I1++];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0)    || R0 = [I0++m0] || R2 = [I1++m0];
        r6=r6+|+r5;
        r6=r6+|+r4;
        (R5,R4) = BYTEOP16P (R3:2,R1:0) (R)|| R0 = [I0++]   || R2 = [I1++];
        r6=r6+|+r5;
 LE$PS:  r6=r6+|+r4;
        r0.l=r6.l+r6.h;
        r0.h=0;

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(pix_sum)


 /*
 * get_pixels: expand 8 rows of 8 bytes into 16-bit values in block.
 *
 * In:  R0 = block, R1 = pixels, R2 = line_size.
 * The hardware loop runs 8 times. Each pass unpacks 8 source bytes with two
 * byteunpack operations and stores four 32-bit words (R6, R7, R4, R5) to
 * block. The source pointer advances by line_size per pass: 8 bytes of loads
 * plus M0 = line_size - 8.
 * The first two source words are fetched before the loop body; every source
 * load is paired with DISALGNEXCPT or byteunpack.
 */
 DEFUN(get_pixels,mL1,
        (int16_t *av_restrict block, const uint8_t *pixels, int line_size)):
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0
        p0=8;         // loop count
        r2+=-8;
        m0=r2;        // line_size - 8
        LSETUP(gp8$0,gp8$1) LC0=P0;

        DISALGNEXCPT                   || R0 = [I0++];
        DISALGNEXCPT                   || R1 = [I0++];

 gp8$0:  (R7,R6) = byteunpack R1:0      || R0 = [I0++M0];
        (R5,R4) = byteunpack R1:0 (R)  || R0 = [I0++]    || [I3++]=R6;
        DISALGNEXCPT                   || R1 = [I0++]    || [I3++]=R7;
        [I3++]=R4;
 gp8$1:  [I3++]=R5;


        (r7:4) = [sp++];
        RTS;
 DEFUN_END(get_pixels)


 /* sad = sad16x16 (ubyte *mb, ubyte *refwin, srcwidth, refwinwidth, h) */
 /* 91 cycles */
 /*
 * In:  R0 = blk1, R1 = blk2, R2 = dsz (row pitch of blk1); line_size (row
 *      pitch of blk2) and h are the fourth and fifth arguments, read from
 *      [sp+20] and [sp+24] after "link 0".
 * Out: R0 = A0.L + A0.H + A1.L + A1.H after all SAA operations.
 * The hardware loop runs h times. Each pass issues four SAA operations over
 * 16 bytes of each input, then blk1 skips M0 = dsz - 16 and blk2 skips
 * M1 = line_size - 16 to reach the next row.
 */
 DEFUN(z_sad16x16,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        link 0;
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        R0 = [sp+20]; // rwidth
        P2 = [sp+24]; // height
        R3 = 16;
        R0 = R0 - R3;
        R3 = R2 - R3;
        M1 = R0;
        M0 = R3;

        DISALGNEXCPT         || R0 = [I0++]    || R2 = [I1++];
        LSETUP (s$16, e$16) LC0=P2;
 s$16:   DISALGNEXCPT         || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++]    || R2 = [I1++];
        SAA (R1:0,R3:2) (R)  || R1 = [I0++]    || R3 = [I1++];
        SAA (R1:0,R3:2)      || R0 = [I0++M0]  || R2 = [I1++M1];
 e$16:   SAA (R1:0,R3:2) (R)  || R0 = [I0++]    || R2 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;
        R0 = R2 + R3 ;
        unlink;
        RTS;
 DEFUN_END(z_sad16x16)

 /* sad = sad8x8 (ubyte *mb, ubyte *refwin, int srcwidth, int refwinwidth, int h) */
 /* 36 cycles */
 /*
 * In:  R0 = blk1, R1 = blk2, R2 = dsz (row pitch of blk1); line_size (row
 *      pitch of blk2) and h are the fourth and fifth arguments. There is no
 *      "link" here, so they are read from [sp+12] and [sp+16].
 * Out: R0 = A0.L + A0.H + A1.L + A1.H after all SAA operations.
 * The hardware loop runs h times. Each pass issues two SAA operations over
 * 8 bytes of each input, then blk1 skips M0 = dsz - 8 and blk2 skips
 * M1 = line_size - 8 to reach the next row.
 */
 DEFUN(z_sad8x8,mL1,
        (uint8_t *blk1, uint8_t *blk2, int dsz, int line_size, int h)):
        I0 = R0;
        I1 = R1;

        A1 = A0 = 0;
        r0 = [sp+12]; // rwidth
        P2 = [sp+16]; //height
        R3 = 8;
        R0 = R0 - R3;
        R3 = R2 - R3;
        M0 = R3;
        M1 = R0;

        LSETUP (s$8, e$8) LC0=P2;
        DISALGNEXCPT         || R0 = [I0++]   || R2 = [I1++];
        DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];
 s$8:    SAA (R1:0,R3:2)      || R0 = [I0++M0] || R2 = [I1++M1];
        SAA (R1:0,R3:2) (R)  || R0 = [I0++]   || R2 = [I1++];
 e$8:    DISALGNEXCPT         || R1 = [I0++]   || R3 = [I1++];

        R3=A1.L+A1.H,  R2=A0.L+A0.H ;
        R0 = R2 + R3 ;
        RTS;
 DEFUN_END(z_sad8x8)

 /*
 * pix_norm1: sum of the squares of the bytes of a 16-pixel-wide, 16-row area.
 *
 * In:  R0 = pix, R1 = line_size.  Out: R0 = A0 + A1 (saturating add).
 * The hardware loop runs 16 times, one row per pass: four byteunpack
 * operations expand 16 bytes to 16-bit values, which are squared and
 * accumulated into A0 and A1. After the 16 bytes of a row the pointer skips
 * M0 = line_size - 16 to the next row.
 */
 DEFUN(pix_norm1,mL1,
        (uint8_t * pix, int line_size)):
        [--SP]=(R7:4,P5:3);

        // Fetch the input arguments.
        P1 = R0;  // pix
        P0 = R1;  // line_size
        P5 = 16;  // loop ctr.
        P0 -= P5;
        M0 = P0;  // M0 = line_size-16;
        // Now for the real work.
        A1 = A0 = 0;
        lsetup(_pix_norm1_blkfn_loopStart, _pix_norm1_blkfn_loopEnd) LC1 = P5;
        I0 = P1;
        DISALGNEXCPT || r0 = [i0++];

 _pix_norm1_blkfn_loopStart:
        // One pass unpacks and squares the 16 bytes pix[0..15] of one row.
        DISALGNEXCPT || r1 = [i0++];

        (r5, r4) = byteunpack r1:0 || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r1 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0 || r0 = [i0++M0];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);
        (r5, r4) = byteunpack r1:0(r) || r0 = [i0++];
        a1 += r5.h * r5.h, a0 += r5.l * r5.l (is);
 _pix_norm1_blkfn_loopEnd:
        a1 += r4.h * r4.h, a0 += r4.l * r4.l (is);


 // Clean up at the end:
        R2 = A0, R3 = A1;
        R0 = R2 + R3 (S);

        (R7:4,P5:3)=[SP++];

        RTS;
 DEFUN_END(pix_norm1)

 /*
 * sse4: sum of squared byte differences over a 4-pixel-wide block of h rows.
 *
 * In:  R0 = v (not read), R1 = pix1, R2 = pix2; line_size and h are the
 *      fourth and fifth arguments, read from [fp+20] and [fp+24].
 * Out: R0 = A0 + A1.
 * The hardware loop runs h times, one row per pass. Every row needs a fresh
 * register pair per source (R1:0 for pix1, R3:2 for pix2), so the low words
 * R0/R2 for the next row are reloaded in the slot of BYTEOP16M, the same
 * scheme sse8 and sse16 use. Per pass each pointer advances by
 * M0 = line_size - 4 plus the 4 bytes of that reload, i.e. by line_size.
 */
 DEFUN(sse4,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-4;
        m0=r2;        // line_size - 4

        a0=a1=0;
        LSETUP(.S40,.E40) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

 .S40:   DISALGNEXCPT                       || R1 = [I0++M0] || R3 = [I1++M0];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
 .E40:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
 DEFUN_END(sse4)

 /*
 * sse8: sum of squared byte differences over an 8-pixel-wide block of h rows.
 *
 * In:  R0 = v (not read), R1 = pix1, R2 = pix2; line_size and h are the
 *      fourth and fifth arguments, read from [fp+20] and [fp+24].
 * Out: R0 = A0 + A1.
 * The hardware loop runs h times, one row per pass: two BYTEOP16M operations
 * produce 16-bit differences that are squared and accumulated into A0/A1.
 * Per pass each pointer advances by 8 bytes of loads plus M0 = line_size - 8.
 */
 DEFUN(sse8,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-8;
        m0=r2;        // line_size - 8

        a0=a1=0;
        LSETUP(.S80,.E80) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];

 .S80:   DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
 .E80:   a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
 DEFUN_END(sse8)

 /*
 * sse16: sum of squared byte differences over a 16-pixel-wide block of h rows.
 *
 * In:  R0 = v (not read), R1 = pix1, R2 = pix2; line_size and h are the
 *      fourth and fifth arguments, read from [fp+20] and [fp+24].
 * Out: R0 = A0 + A1.
 * The hardware loop runs h times, one row per pass: four BYTEOP16M operations
 * produce 16-bit differences that are squared and accumulated into A0/A1.
 * Per pass each pointer advances by 16 bytes of loads plus M0 = line_size - 16.
 */
 DEFUN(sse16,mL1,
        (void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        p0=[fp+24];   // h
        i0=r1;        // pix1
        i1=r2;        // pix2
        r2=[fp+20];   // line_size
        r2+=-16;
        m0=r2;        // line_size - 16

        a0=a1=0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
        LSETUP(.S160,.E160) LC0=P0;

 .S160:  DISALGNEXCPT                       || R1 = [I0++]   || R3 = [I1++];
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R1 = [I0++]   || R3 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2)    || R0 = [I0++M0] || R2 = [I1++M0];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
        a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        (R7,R6) = BYTEOP16M (R1:0,R3:2) (R)|| R0 = [I0++]   || R2 = [I1++];
        a0 += r7.l * r7.l, a1 += r7.h * r7.h (is);
 .E160:  a0 += r6.l * r6.l, a1 += r6.h * r6.h (is);
        a0 += a1;
        r0 = a0;

        (r7:6) = [sp++];
        unlink;
        rts;
 DEFUN_END(sse16)
--- a/libavcodec/bfin/dsputil_init.c
+++ b/libavcodec/bfin/dsputil_init.c
@@ -1,195 +0,0 @@
 /*
 * BlackFin DSPUTILS
 *
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 * Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/bfin/attributes.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/dsputil.h"
 #include "libavcodec/mpegvideo.h"

 /* In-place 8x8 transforms. */
 void ff_bfin_idct(int16_t *block) attribute_l1_text;
 void ff_bfin_fdct(int16_t *block) attribute_l1_text;

 /* Coefficient block -> pixel row stores. */
 void ff_bfin_add_pixels_clamped(const int16_t *block,
                                 uint8_t *dest,
                                 int line_size) attribute_l1_text;
 void ff_bfin_put_pixels_clamped(const int16_t *block,
                                 uint8_t *dest,
                                 int line_size) attribute_l1_text;

 /* Pixel rows -> coefficient block loads. */
 void ff_bfin_diff_pixels(int16_t *block,
                          const uint8_t *s1,
                          const uint8_t *s2,
                          int stride) attribute_l1_text;
 void ff_bfin_get_pixels(int16_t *restrict block,
                         const uint8_t *pixels,
                         int line_size) attribute_l1_text;

 /* Block statistics. */
 int ff_bfin_pix_norm1(uint8_t *pix, int line_size) attribute_l1_text;
 int ff_bfin_pix_sum(uint8_t *p, int stride) attribute_l1_text;

 /* Sum of absolute differences; dsz and line_size are the row pitches of
  * blk1 and blk2 respectively. */
 int ff_bfin_z_sad8x8(uint8_t *blk1, uint8_t *blk2,
                      int dsz, int line_size, int h) attribute_l1_text;
 int ff_bfin_z_sad16x16(uint8_t *blk1, uint8_t *blk2,
                        int dsz, int line_size, int h) attribute_l1_text;

 /* Sum of squared differences for 4-, 8- and 16-pixel-wide blocks. */
 int ff_bfin_sse4(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h) attribute_l1_text;
 int ff_bfin_sse8(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                  int line_size, int h) attribute_l1_text;
 int ff_bfin_sse16(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
                   int line_size, int h) attribute_l1_text;

 /* Inverse-transform block in place, then add it to dest with clamping. */
 static void bfin_idct_add(uint8_t *dest,
                           int line_size,
                           int16_t *block)
 {
    ff_bfin_idct(block);

    ff_bfin_add_pixels_clamped(block, dest, line_size);
 }

 /* Inverse-transform block in place, then store it to dest with clamping. */
 static void bfin_idct_put(uint8_t *dest,
                           int line_size,
                           int16_t *block)
 {
    ff_bfin_idct(block);

    ff_bfin_put_pixels_clamped(block, dest, line_size);
 }

 /*
 * Zero the coefficient storage starting at blocks.
 *
 * A single-instruction hardware loop stores 192 zeroed 32-bit words, i.e.
 * 768 bytes (384 int16_t values).
 *
 * The asm writes memory that is not named in any operand, so it carries a
 * "memory" clobber; without it the compiler is free to keep values of
 * blocks[] cached in registers or to move accesses across the statement.
 * NOTE(review): the loop registers (LC0/LT0/LB0) are modified by LSETUP but
 * are not listed as clobbers, and the label is global, so the statement can
 * only be emitted once per object file -- confirm both with the toolchain.
 */
 static void bfin_clear_blocks(int16_t *blocks)
 {
    // This is just a simple memset.
    __asm__ volatile ("P0=192; "
             "I0=%0;  "
             "R0=0;   "
             "LSETUP(clear_blocks_blkfn_lab,clear_blocks_blkfn_lab)LC0=P0;"
             "clear_blocks_blkfn_lab:"
             "[I0++]=R0;"
             ::"a" (blocks) : "P0", "I0", "R0", "memory");
 }

 /* 16-wide SAD of blk1 against blk2; both use line_size as row pitch.
  * The context argument c is not used. */
 static int bfin_pix_abs16(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                           int line_size, int h)
 {
    const int sad = ff_bfin_z_sad16x16(blk1, blk2, line_size, line_size, h);

    return sad;
 }

 /* Scratch block (256 bytes, row pitch 16 or 8) that the pix_abs*_x2/_y2/_xy2
  * helpers fill before computing a SAD against it. It is a single static
  * buffer shared by all of them. */
 static uint8_t vtmp_blk[256] attribute_l1_data_b;

 /* 16-wide SAD of blk1 against a block built from blk2 and blk2 + 1.
  * The combined block is written to vtmp_blk with a row pitch of 16
  * (presumably a horizontal half-pel average -- helper not visible here). */
 static int bfin_pix_abs16_x2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                              int line_size, int h)
 {
    uint8_t *const right = blk2 + 1;

    ff_bfin_put_pixels16uc(vtmp_blk, blk2, right, 16, line_size, h);

    return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, 16, h);
 }

 /* 16-wide SAD of blk1 against a block built from blk2 and the row below it.
  * The combined block is written to vtmp_blk with a row pitch of 16
  * (presumably a vertical half-pel average -- helper not visible here). */
 static int bfin_pix_abs16_y2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                              int line_size, int h)
 {
    uint8_t *const below = blk2 + line_size;

    ff_bfin_put_pixels16uc(vtmp_blk, blk2, below, 16, line_size, h);

    return ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, 16, h);
 }

 /* 16-wide SAD of blk1 against the block that ff_bfin_z_put_pixels16_xy2
  * derives from blk2 and writes to vtmp_blk with a row pitch of 16. */
 static int bfin_pix_abs16_xy2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                               int line_size, int h)
 {
    int sad;

    ff_bfin_z_put_pixels16_xy2(vtmp_blk, blk2, 16, line_size, h);
    sad = ff_bfin_z_sad16x16(blk1, vtmp_blk, line_size, 16, h);

    return sad;
 }

 /* 8-wide SAD of blk1 against blk2; both use line_size as row pitch.
  * The context argument c is not used. */
 static int bfin_pix_abs8(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                          int line_size, int h)
 {
    const int sad = ff_bfin_z_sad8x8(blk1, blk2, line_size, line_size, h);

    return sad;
 }

 /* 8-wide SAD of blk1 against a block built from blk2 and blk2 + 1.
  * The combined block is written to vtmp_blk with a row pitch of 8
  * (presumably a horizontal half-pel average -- helper not visible here). */
 static int bfin_pix_abs8_x2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                             int line_size, int h)
 {
    uint8_t *const right = blk2 + 1;

    ff_bfin_put_pixels8uc(vtmp_blk, blk2, right, 8, line_size, h);

    return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, 8, h);
 }

 /* 8-wide SAD of blk1 against a block built from blk2 and the row below it.
  * The combined block is written to vtmp_blk with a row pitch of 8
  * (presumably a vertical half-pel average -- helper not visible here). */
 static int bfin_pix_abs8_y2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                             int line_size, int h)
 {
    uint8_t *const below = blk2 + line_size;

    ff_bfin_put_pixels8uc(vtmp_blk, blk2, below, 8, line_size, h);

    return ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, 8, h);
 }

 /* 8-wide SAD of blk1 against the block that ff_bfin_z_put_pixels8_xy2
  * derives from blk2 and writes to vtmp_blk with a row pitch of 8. */
 static int bfin_pix_abs8_xy2(MpegEncContext *c, uint8_t *blk1, uint8_t *blk2,
                              int line_size, int h)
 {
    int sad;

    ff_bfin_z_put_pixels8_xy2(vtmp_blk, blk2, 8, line_size, h);
    sad = ff_bfin_z_sad8x8(blk1, vtmp_blk, line_size, 8, h);

    return sad;
 }

 /*
 * decoder optimization
 * start on 2/11 100 frames of 352x240@25 compiled with no optimization -g debugging
 * 9.824s ~ 2.44x off
 * 6.360s ~ 1.58x off with -O2
 * 5.740s ~ 1.43x off with idcts
 *
 * 2.64s    2/20 same sman.mp4 decode only
 */

 /*
 * Install the Blackfin implementations into the DSP context.
 *
 * The pixel transfer, block statistics, SAD and SSE entries are always set.
 * get_pixels and the DCT/IDCT entries are set only when high_bit_depth is
 * zero, and the DCT/IDCT ones only when the codec context leaves the
 * algorithm choice on AUTO.
 */
 av_cold void ff_dsputil_init_bfin(DSPContext *c, AVCodecContext *avctx,
                                   unsigned high_bit_depth)
 {
    /* Block <-> pixel transfers. */
    c->diff_pixels        = ff_bfin_diff_pixels;
    c->put_pixels_clamped = ff_bfin_put_pixels_clamped;
    c->add_pixels_clamped = ff_bfin_add_pixels_clamped;
    c->clear_blocks       = bfin_clear_blocks;

    /* Block statistics. */
    c->pix_sum   = ff_bfin_pix_sum;
    c->pix_norm1 = ff_bfin_pix_norm1;

    /* Sum of absolute differences: index 0 is 16 wide, index 1 is 8 wide. */
    c->sad[0] = bfin_pix_abs16;
    c->sad[1] = bfin_pix_abs8;

    /* TODO [0] 16  [1] 8 */
    c->pix_abs[0][0] = bfin_pix_abs16;
    c->pix_abs[1][0] = bfin_pix_abs8;
    c->pix_abs[0][1] = bfin_pix_abs16_x2;
    c->pix_abs[1][1] = bfin_pix_abs8_x2;
    c->pix_abs[0][2] = bfin_pix_abs16_y2;
    c->pix_abs[1][2] = bfin_pix_abs8_y2;
    c->pix_abs[0][3] = bfin_pix_abs16_xy2;
    c->pix_abs[1][3] = bfin_pix_abs8_xy2;

    /* Sum of squared differences: 16, 8 and 4 pixels wide. */
    c->sse[0] = ff_bfin_sse16;
    c->sse[1] = ff_bfin_sse8;
    c->sse[2] = ff_bfin_sse4;

    /* Everything below is installed for the non-high-bit-depth case only. */
    if (high_bit_depth)
        return;

    c->get_pixels = ff_bfin_get_pixels;

    if (avctx->dct_algo == FF_DCT_AUTO)
        c->fdct = ff_bfin_fdct;

    if (avctx->idct_algo != FF_IDCT_AUTO)
        return;

    c->idct_permutation_type = FF_NO_IDCT_PERM;
    c->idct                  = ff_bfin_idct;
    c->idct_add              = bfin_idct_add;
    c->idct_put              = bfin_idct_put;
 }
--- a/libavcodec/bfin/fdct_bfin.S
+++ b/libavcodec/bfin/fdct_bfin.S
@@ -1,327 +0,0 @@
 /*
 * fdct BlackFin
 *
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
  void ff_bfin_fdct (int16_t *buf);

  This implementation works only for 8x8 input. The range of input
  must be -256 to 255 i.e. 8bit input represented in a 16bit data
  word. The original data must be sign extended into the 16bit data
  words.


   Chen factorization of

           7
   X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
          n=0

                                             C4
 0  --*-------------*0+7---*-----*0+3-------*-*-------------------> 0
       \           /        \   /            X S4,S4
 1  --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4
         \       /            \              -C4     C3
 2  --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2
           \   /            /   \                    X S3,-S3
 3  --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6
             /                                  C7   C3
 4  --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1
           /   \            -C4      X       \   /S7    C3
 5  --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5
         /       \          X S4,S4            /        X S3,-S3
 6  --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3
       /           \        C4       X       /   \-S7   C3
    --*-------------*0-7------------*-*7+6--*-----*---------------> 7
                                                C7

 Notation
        Cn = cos(n*pi/8) used throughout the code.


  Registers used:
        R0, R1, R2, R3, R4, R5, R6,R7,  P0, P1, P2, P3, P4, P5, A0, A1.
  Other registers used:
        I0, I1, I2, I3, B0, B2, B3, M0, M1, L3 registers and LC0.

  Input - r0 - pointer to start of int16_t *block

  Output - The DCT output coefficients in the int16_t *block

  Register constraint:
               This code is called from jpeg_encode.
               R6, R5, R4 if modified should be stored and restored.


  Performance: (Timer version 0.6.33)
               Code Size : 240 Bytes.
               Memory Required :
               Input Matrix : 8 * 8 * 2 Bytes.
               Coefficients : 16 Bytes
               Temporary matrix: 8 * 8 * 2 Bytes.
               Cycle Count :26+{18+8*(14+2S)}*2  where S -> Stalls
                            (7.45 c/pel)
        -----------------------------------------
        |  Size  |  Forward DCT  |  Inverse DCT |
        -----------------------------------------
        |  8x8   |   284 Cycles  |  311 Cycles  |
        -----------------------------------------

 Ck = int16(cos(k/16*pi)*32767+.5)/2
 #define C4 23170
 #define C3 13623
 #define C6 6270
 #define C7 3196

 Sk = int16(sin(k/16*pi)*32767+.5)/2
 #define S4 11585
 #define S3 9102
 #define S6 15137
 #define S7 16069

 the coefficients are ordered as follows:
 short dct_coef[]
  C4,S4,
  C6,S6,
  C7,S7,
  S3,C3,

 -----------------------------------------------------------
 FFMPEG conformance testing results
 -----------------------------------------------------------
 dct-test: modified with the following
            dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
 produces the following output:

 root:/u/ffmpeg/bhead/libavcodec> ./dct-test
 ffmpeg DCT/IDCT test

    2  -131    -6   -48   -36    33   -83    24
   34    52   -24   -15     5    92    57   143
  -67   -43    -1    74   -16     5   -71    32
  -78   106    92   -34   -38    81    20   -18
    7   -62    40     2   -15    90   -62   -83
  -83     1  -104   -13    43   -19     7    11
  -63    31    12   -29    83    72    21    10
  -17   -63   -15    73    50   -91   159   -14
 DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
 DCT BFINfdct: 92.1 kdct/s
 root:/u/ffmpeg/bhead/libavcodec>

 */

 #include "libavutil/bfin/asm.h"

 SECTION_L1_DATA_B

 // Eight 16-bit coefficients (16 bytes) in the order C4,S4, C6,S6, C7,S7,
 // S3,C3; fdct reads them through I3 as a 16-byte circular buffer.
 .align 4;
 dct_coeff:
 .short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537;

 SECTION_L1_DATA_A

 // 128-byte scratch matrix (8 * 8 * 2 bytes) that holds the result of the
 // first fdct pass.
 .align 4
 vtmp:   .space 128

 .text
 /*
 * fdct: in-place 8x8 forward DCT of the int16_t block passed in R0.
 *
 * Setup: B0 = block (input), B3 = dct_coeff (read through I3 as a circular
 * buffer with L3 = 16), B2 = vtmp (output of the first pass).
 * The input is first scaled by << 3 in place (32 words of two 16-bit values).
 * The outer hardware loop (LC0 = 2) then runs the 1-D transform twice; the
 * inner loop (LC1 = P2 >> 1 = 8) handles one line per pass and is software
 * pipelined with the first stage of the next line. B0 and B2 are swapped
 * after each outer pass, so the second pass reads vtmp and writes block.
 * L3 is reset to 0 before returning; R7:4 and P5:3 are saved and restored.
 */
 DEFUN(fdct,mL1,
        (int16_t *block)):
    [--SP] = (R7:4, P5:3);          // Push the registers onto the stack.

    b0 = r0;
    RELOC(r0, P3, dct_coeff);
    b3 = r0;
    RELOC(r0, P3, vtmp);
    b2 = r0;

    L3 = 16;                        // L3 is set to 16 to make the coefficient
                                    // array Circular.


 //----------------------------------------------------------------------------

 /*
 * I0, I1, and I2 registers are used to read the input data. I3 register is used
 * to read the coefficients. P0 and P1 registers are used for writing the output
 * data.
 */
    M0 = 12 (X);                    // All these initializations are used in the
    M1 = 16 (X);                    // modification of address offsets.

    M2 = 128 (X);

    P2 = 16;
    P3 = 32 (X);
    P4 = -110 (X);
    P5 = -62 (X);
    P0 = 2(X);


    // Prescale the input to get the correct precision.
    i0=b0;
    i1=b0;

    lsetup (.0, .1) LC0 = P3;
    r0=[i0++];
 .0:     r1=r0<<3 (v) || r0=[i0++] ;
 .1:     [i1++]=r1;

        /*
         * B0 points to the "in" buffer.
         * B2 points to "temp" buffer in the first iteration.
         */

    lsetup (.2, .3) LC0 = P0;
 .2:
        I0 = B0;                        // I0 points to Input Element (0, 0).
        I1 = B0;                        // Element 1 and 0 is read in R0.
        I1 += M0  || R0 = [I0++];       // I1 points to Input Element (0, 6).
        I2 = I1;                        // Element 6 is read into R3.H.
        I2 -= 4   || R3.H = W[I1++];    // I2 points to Input Element (0, 4).

        I3 = B3;                        // I3 points to Coefficients.
        P0 = B2;                        // P0 points to temporary array Element
                                        //   (0, 0).
        P1 = B2;                        // P1 points to temporary array.
        R7 = [P1++P2] || R2 = [I2++];   // P1 points to temporary array
                                        //   Element (1, 0).
                                        // R7 is a dummy read. X4,X5
                                        //   are read into R2.
        R3.L = W[I1--];                 // X7 is read into R3.L.
        R1.H = W[I0++];                 // X2 is read into R1.H.


        /*
         *  X0 = (X0 + X7) / 2.
         *  X1 = (X1 + X6) / 2.
         *  X6 = (X1 - X6) / 2.
         *  X7 = (X0 - X7) / 2.
         *  It reads the data 3 in R1.L.
         */

        R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP;

        /*
         *       X2 = (X2 + X5) / 2.
         *       X3 = (X3 + X4) / 2.
         *       X4 = (X3 - X4) / 2.
         *       X5 = (X2 - X5) / 2.
         *          R7 = C4 = cos(4*pi/16)
         */

        R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO) || NOP      ||  R7 = [I3++];

        /*
         * At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
         * R3 has (6,7).
         * Where the notation (x, y) represents upper/lower half pairs.
         */

        /*
         *       X0 = X0 + X3.
         *       X1 = X1 + X2.
         *       X2 = X1 - X2.
         *       X3 = X0 - X3.
         */
        R0 = R0 +|+ R1, R1 = R0 -|- R1;

        lsetup (.row0, .row1) LC1 = P2 >> 1;  // 1d dct, loops 8x
 .row0:

        /*
         *       This is part 2 computation continued.....
         *       A1 =      X6 * cos(pi/4)
         *       A0 =      X6 * cos(pi/4)
         *       A1 = A1 - X5 * cos(pi/4)
         *       A0 = A0 + X5 * cos(pi/4).
         *       The instruction W[I0] = R3.L is used for packing it to R2.L.
         */

        A1=R3.H*R7.l,         A0=R3.H*R7.l            ||  I1+=M1 || W[I0] = R3.L;
        R4.H=(A1-=R2.L*R7.l), R4.L=(A0+=R2.L*R7.l)    ||  I2+=M0 || NOP;

        /*       R0 = (X1,X0)      R1 = (X2,X3)       R4 = (X5, X6). */

        /*
         *       A1 =      X0 * cos(pi/4)
         *       A0 =      X0 * cos(pi/4)
         *       A1 = A1 - X1 * cos(pi/4)
         *       A0 = A0 + X1 * cos(pi/4)
         *       R7 = (C2,C6)
         */
        A1=R0.L*R7.h,        A0=R0.L*R7.h             || NOP       || R3.H=W[I1++];
        R5.H=(A1-=R0.H*R7.h),R5.L=(A0+=R0.H*R7.h)     || R7=[I3++] || NOP;

        /*
         *       A1 =      X2 * cos(3pi/8)
         *       A0 =      X3 * cos(3pi/8)
         *       A1 = A1 + X3 * cos(pi/8)
         *       A0 = A0 - X2 * cos(pi/8)
         *         R3 = cos(pi/4)
         *         R7 = (cos(7pi/8),cos(pi/8))
         *       X4 = X4 + X5.
         *       X5 = X4 - X5.
         *       X6 = X7 - X6.
         *       X7 = X7 + X6.
         */
        A1=R1.H*R7.L,        A0=R1.L*R7.L             || W[P0++P3]=R5.L || R2.L=W[I0];
        R2=R2+|+R4,          R4=R2-|-R4               || I0+=4          || R3.L=W[I1--];
        R6.H=(A1+=R1.L*R7.H),R6.L=(A0 -= R1.H * R7.H) || I0+=4          || R7=[I3++];

        /*         R2 = (X4, X7)        R4 = (X5,X6)      R5 = (X1, X0)       R6 = (X2,X3). */

        /*
         *       A1 =      X4 * cos(7pi/16)
         *       A0 =      X7 * cos(7pi/16)
         *       A1 = A1 + X7 * cos(pi/16)
         *       A0 = A0 - X4 * cos(pi/16)
         */

        A1=R2.H*R7.L,        A0=R2.L*R7.L             || W[P0++P3]=R6.H || R0=[I0++];
        R2.H=(A1+=R2.L*R7.H),R2.L=(A0-=R2.H*R7.H)     || W[P0++P3]=R5.H || R7=[I3++];

        /*
         *       A1 =      X5 * cos(3pi/16)
         *       A0 =      X6 * cos(3pi/16)
         *       A1 = A1 + X6 * cos(5pi/16)
         *       A0 = A0 - X5 * cos(5pi/16)
         *       The output values are written.
         */

        A1=R4.H*R7.H,        A0=R4.L*R7.H             || W[P0++P2]=R6.L || R1.H=W[I0++];
        R4.H=(A1+=R4.L*R7.L),R4.L=(A0-=R4.H*R7.L)     || W[P0++P4]=R2.L || R1.L=W[I0++];


        /* Beginning of next stage, **pipelined** + drain and store the
           rest of the column store. */

        R0=R0+|+R3,R3=R0-|-R3                         || W[P1++P3]=R2.H || R2=[I2++];
        R1=R1+|+R2,R2=R1-|-R2 (CO)                    || W[P1++P3]=R4.L || R7=[I3++];
 .row1:  R0=R0+|+R1,R1=R0-|-R1                         || W[P1++P5]=R4.H || NOP;

        // Exchange input with output.
        B1 = B0;
        B0 = B2;
 .3:     B2 = B1;

        L3=0;
        (r7:4,p5:3) = [sp++];
        RTS;
 DEFUN_END(fdct)
--- a/libavcodec/bfin/hpel_pixels_no_rnd.S
+++ b/libavcodec/bfin/hpel_pixels_no_rnd.S
@@ -1,81 +0,0 @@
 /*
 * Blackfin Pixel Operations
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/bfin/asm.h"

 /*
  * 8-byte-wide, h-row two-source byte average without rounding.
  *
  * block, s0 and s1 arrive in R0-R2; line_size and h are read from the
  * stack.  The same line_size is used as the stride for both sources and
  * for the destination.  R7:6 are saved and restored around the loop.
  *
  * The loop is a zero-overhead hardware loop (LC0 = h) and is software
  * pipelined: the first word of each source row is loaded before the loop
  * and the last two instructions of the body preload the first word of the
  * following row.  Three words are read per source row, presumably so that
  * the byte-op unit can extract 8 bytes from unaligned sources (alignment
  * exceptions are suppressed by DISALGNEXCPT) -- confirm against the
  * Blackfin programming reference.
  */
 DEFUN(put_pixels8uc_no_rnd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h
        [--sp] = (r7:6);
        r2+=-4;
        m3=r2;        // m3 = line_size - 4: dest step after the 2nd word store
        r2+=-4;
        m0=r2;        // m0 = line_size - 8: source step after the 3rd word load
        LSETUP(pp8$2,pp8$3) LC0=P0;
        // Prime the pipeline with the first word of each source row.
        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];

 pp8$2:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        // (T) selects truncation, i.e. no rounding; (R) selects the reversed
        // register pair order, which yields the second group of four bytes.
        R6 = BYTEOP1P(R1:0,R3:2)(T)  || R0 = [I0++M0]|| R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]  || [I3++] = R6 ;
 pp8$3:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
 DEFUN_END(put_pixels8uc_no_rnd)

 /*
  * 16-byte-wide, h-row two-source byte average without rounding.
  *
  * block, s0 and s1 arrive in R0-R2; line_size and h are read from the
  * stack.  The same line_size is used as the stride for both sources and
  * for the destination.  R7:6 are saved and restored around the loop.
  *
  * Each iteration of the hardware loop (LC0 = h) produces one row as four
  * 32-bit stores.  The loop is software pipelined: the first word of each
  * source row is loaded by the previous iteration (or by the instruction
  * before the loop for the first row).
  */
 DEFUN(put_pixels16uc_no_rnd,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r2=[sp+12];   // line_size
        p0=[sp+16];   // h

        [--sp] = (r7:6);
        r2+=-12;
        m3=r2;        // m3 = line_size - 12: dest step after the 4th word store
        r2+=-4;
        m0=r2;        // m0 = line_size - 16: source step after the 5th word load

        LSETUP(pp16$2,pp16$3) LC0=P0;
        // Prime the pipeline with the first word of each source row.
        DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

 pp16$2:
        DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
        // First 8 output bytes; (T) = truncate, (R) = reversed pair order.
        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++]   || R2  =[I1++];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R1 = [I0++]   || R3  =[I1++];
        [I3++] = R6;

        // Last 8 output bytes, then preload the next row's first words.
        R6 = BYTEOP1P(R1:0,R3:2)(T)    || R0 = [I0++M0] || R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(T,R) || R0 = [I0++]   || [I3++] = R7 ;
        [I3++] = R6;
 pp16$3: DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];

        RTS;
 DEFUN_END(put_pixels16uc_no_rnd)
--- a/libavcodec/bfin/hpeldsp_init.c
+++ b/libavcodec/bfin/hpeldsp_init.c
@@ -1,146 +0,0 @@
 /*
 * BlackFin halfpel functions
 *
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 * Copyright (c) 2006 Michael Benjamin <michael.benjamin@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stddef.h>
 #include <stdint.h>

 #include "libavutil/attributes.h"
 #include "libavutil/bfin/attributes.h"
 #include "libavcodec/hpeldsp.h"
 #include "pixels.h"

 /*
  * No-rounding two-source averaging primitives for 8- and 16-pixel-wide
  * blocks of h rows.  They are only declared here; presumably these are the
  * put_pixels8uc_no_rnd / put_pixels16uc_no_rnd routines from
  * hpel_pixels_no_rnd.S, with the ff_bfin_ prefix supplied by the DEFUN
  * macro -- confirm against libavutil/bfin/asm.h.
  * Unlike the rounding variants declared in pixels.h, these take a single
  * line_size argument.
  */
 void ff_bfin_put_pixels8uc_no_rnd(uint8_t *block, const uint8_t *s0,
                                   const uint8_t *s1, int line_size,
                                   int h) attribute_l1_text;
 void ff_bfin_put_pixels16uc_no_rnd(uint8_t *block, const uint8_t *s0,
                                    const uint8_t *s1, int line_size,
                                    int h) attribute_l1_text;

 /* Full-pel 8xh put: the same source is passed as both averaging inputs. */
 static void bfin_put_pixels8(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
 {
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;

    ff_bfin_put_pixels8uc(block, first, second, line_size, line_size, h);
 }

 /* Horizontal half-pel 8xh put: pairs each pixel with its right neighbour. */
 static void bfin_put_pixels8_x2(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
 {
    const uint8_t *right = &pixels[1];

    ff_bfin_put_pixels8uc(block, pixels, right, line_size, line_size, h);
 }

 /* Vertical half-pel 8xh put: pairs each row with the row below it. */
 static void bfin_put_pixels8_y2(uint8_t *block, const uint8_t *pixels,
                                ptrdiff_t line_size, int h)
 {
    const uint8_t *below = &pixels[line_size];

    ff_bfin_put_pixels8uc(block, pixels, below, line_size, line_size, h);
 }

 /*
  * Diagonal half-pel 8xh put: forwards to the xy2 assembly routine, using
  * line_size as both the destination and the source stride.
  */
 static void bfin_put_pixels8_xy2(uint8_t *block, const uint8_t *s0,
                                 ptrdiff_t line_size, int h)
 {
    const uint8_t *src = s0;

    ff_bfin_z_put_pixels8_xy2(block, src, line_size, line_size, h);
 }

 /* Full-pel 16xh put: the same source is passed as both averaging inputs. */
 static void bfin_put_pixels16(uint8_t *block, const uint8_t *pixels,
                              ptrdiff_t line_size, int h)
 {
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;

    ff_bfin_put_pixels16uc(block, first, second, line_size, line_size, h);
 }

 /* Horizontal half-pel 16xh put: pairs each pixel with its right neighbour. */
 static void bfin_put_pixels16_x2(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
 {
    const uint8_t *right = &pixels[1];

    ff_bfin_put_pixels16uc(block, pixels, right, line_size, line_size, h);
 }

 /* Vertical half-pel 16xh put: pairs each row with the row below it. */
 static void bfin_put_pixels16_y2(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
 {
    const uint8_t *below = &pixels[line_size];

    ff_bfin_put_pixels16uc(block, pixels, below, line_size, line_size, h);
 }

 /*
  * Diagonal half-pel 16xh put: forwards to the xy2 assembly routine, using
  * line_size as both the destination and the source stride.
  */
 static void bfin_put_pixels16_xy2(uint8_t *block, const uint8_t *s0,
                                  ptrdiff_t line_size, int h)
 {
    const uint8_t *src = s0;

    ff_bfin_z_put_pixels16_xy2(block, src, line_size, line_size, h);
 }

 /* Full-pel 8xh put through the no-rounding averaging primitive. */
 static void bfin_put_pixels8_no_rnd(uint8_t *block, const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
 {
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;

    ff_bfin_put_pixels8uc_no_rnd(block, first, second, line_size, h);
 }

 /* Horizontal half-pel 8xh put, no rounding: pixel paired with its right neighbour. */
 static void bfin_put_pixels8_x2_no_rnd(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
 {
    const uint8_t *right = &pixels[1];

    ff_bfin_put_pixels8uc_no_rnd(block, pixels, right, line_size, h);
 }

 /* Vertical half-pel 8xh put, no rounding: row paired with the row below it. */
 static void bfin_put_pixels8_y2_no_rnd(uint8_t *block, const uint8_t *pixels,
                                       ptrdiff_t line_size, int h)
 {
    const uint8_t *below = &pixels[line_size];

    ff_bfin_put_pixels8uc_no_rnd(block, pixels, below, line_size, h);
 }

 /* Full-pel 16xh put through the no-rounding averaging primitive. */
 static void bfin_put_pixels16_no_rnd(uint8_t *block, const uint8_t *pixels,
                                     ptrdiff_t line_size, int h)
 {
    const uint8_t *first  = pixels;
    const uint8_t *second = pixels;

    ff_bfin_put_pixels16uc_no_rnd(block, first, second, line_size, h);
 }

 /* Horizontal half-pel 16xh put, no rounding: pixel paired with its right neighbour. */
 static void bfin_put_pixels16_x2_no_rnd(uint8_t *block, const uint8_t *pixels,
                                        ptrdiff_t line_size, int h)
 {
    const uint8_t *right = &pixels[1];

    ff_bfin_put_pixels16uc_no_rnd(block, pixels, right, line_size, h);
 }

 /* Vertical half-pel 16xh put, no rounding: row paired with the row below it. */
 static void bfin_put_pixels16_y2_no_rnd(uint8_t *block, const uint8_t *pixels,
                                        ptrdiff_t line_size, int h)
 {
    const uint8_t *below = &pixels[line_size];

    ff_bfin_put_pixels16uc_no_rnd(block, pixels, below, line_size, h);
 }

 /*
  * Install the Blackfin half-pel "put" routines into the DSP context.
  *
  * The first table index is 0 for the 16-pixel-wide and 1 for the
  * 8-pixel-wide routines; the second index is 0 for full-pel, 1 for x2,
  * 2 for y2 and 3 for xy2.  The no-rounding xy2 slots are not assigned
  * here, and the flags argument is not consulted.
  */
 av_cold void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags)
 {
    /* Full-pel copies. */
    c->put_pixels_tab[0][0]        = bfin_put_pixels16;
    c->put_pixels_tab[1][0]        = bfin_put_pixels8;
    c->put_no_rnd_pixels_tab[0][0] = bfin_put_pixels16_no_rnd;
    c->put_no_rnd_pixels_tab[1][0] = bfin_put_pixels8_no_rnd;

    /* Horizontal half-pel. */
    c->put_pixels_tab[0][1]        = bfin_put_pixels16_x2;
    c->put_pixels_tab[1][1]        = bfin_put_pixels8_x2;
    c->put_no_rnd_pixels_tab[0][1] = bfin_put_pixels16_x2_no_rnd;
    c->put_no_rnd_pixels_tab[1][1] = bfin_put_pixels8_x2_no_rnd;

    /* Vertical half-pel. */
    c->put_pixels_tab[0][2]        = bfin_put_pixels16_y2;
    c->put_pixels_tab[1][2]        = bfin_put_pixels8_y2;
    c->put_no_rnd_pixels_tab[0][2] = bfin_put_pixels16_y2_no_rnd;
    c->put_no_rnd_pixels_tab[1][2] = bfin_put_pixels8_y2_no_rnd;

    /* Diagonal half-pel, rounding variants only. */
    c->put_pixels_tab[0][3]        = bfin_put_pixels16_xy2;
    c->put_pixels_tab[1][3]        = bfin_put_pixels8_xy2;
 }
--- a/libavcodec/bfin/idct_bfin.S
+++ b/libavcodec/bfin/idct_bfin.S
@@ -1,297 +0,0 @@
 /*
 * idct BlackFin
 *
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
   This blackfin DSP code implements an 8x8 inverse type II DCT.

 Prototype       : void ff_bfin_idct(int16_t *in)

 Registers Used  : A0, A1, R0-R7, I0-I3, B0, B2, B3, M0-M2, L0-L3, P0-P5, LC0.

 Performance     :
                    Code Size   : 498 Bytes.
                    Cycle Count : 417 Cycles


 -----------------------------------------------------------
 FFMPEG conformance testing results
 -----------------------------------------------------------

 dct-test: modified with the following
            dct_error("BFINidct", 1, ff_bfin_idct, idct, test);
 produces the following output

 root:/u/ffmpeg/bhead/libavcodec> ./dct-test -i
 ffmpeg DCT/IDCT test

    8    15    -2    21    24    17     0    10
    2   -10    -5    -5    -3     7   -14    -3
    2   -13   -10   -19    18    -6     6    -2
    9     4    16    -3     9    12    10    15
   15    -9    -2    10     1    16     0   -15
  -15     5     7     3    13     0    13    20
   -6   -15    24     9   -18     1     9   -22
   -8    25    23     2    -7     0    30    13
 IDCT BFINidct: err_inf=1 err2=0.01002344 syserr=0.00150000 maxout=266 blockSumErr=64
 IDCT BFINidct: 88.3 kdct/s

 */

 #include "libavutil/bfin/asm.h"

 SECTION_L1_DATA_B

 /*
  * Cosine table for idct, stored as 16-bit values equal to cos(x) * 2^15.
  * idct loads it one 32-bit word (two coefficients) at a time through I3
  * with L3 = 20, so the ten entries (20 bytes) form a circular buffer.
  * The (cos(7pi/16), cos(pi/16)) pair appears twice because the odd-part
  * computation reads it both before and after the (5pi/16, 3pi/16) pair.
  */
 .align 4;
 coefs:
 .short 0x5a82;           //  C4
 .short 0x5a82;           //  C4
 .short 0x30FC;           //cos(3pi/8)  C6
 .short 0x7642;           //cos(pi/8)   C2
 .short 0x18F9;           //cos(7pi/16)
 .short 0x7D8A;           //cos(pi/16)
 .short 0x471D;           //cos(5pi/16)
 .short 0x6A6E;           //cos(3pi/16)
 .short 0x18F9;           //cos(7pi/16)
 .short 0x7D8A;           //cos(pi/16)

 SECTION_L1_DATA_A

 // Scratch area that idct uses as the intermediate matrix between its
 // first and second pass.
 vtmp: .space 256

 // Spill slots addressed relative to FP, inside the 16-byte frame that
 // idct allocates with "link 16".
 #define TMP0 FP-8
 #define TMP1 FP-12
 #define TMP2 FP-16


 .text
 /*
  * 8x8 inverse DCT computed in place on block (prototype per the file
  * header: void ff_bfin_idct(int16_t *in)).
  *
  * The transform is done as two passes of eight 1-D IDCTs, each pass being
  * a zero-overhead hardware loop:
  *   pass 1 (.0 ... .1) reads from block (B0) and writes to vtmp (B2),
  *                      rounding the results with RND12;
  *   pass 2 (.2 ... .3) reads from vtmp (B2) and writes back to block (B0),
  *                      rounding the results with RND20.
  * Both loops are software pipelined: the two multiply instructions in front
  * of each LSETUP compute the first Y0/Y4 butterfly, and the last two
  * instructions of each loop body start the butterfly of the next iteration
  * while draining the final two stores of the current one.
  *
  * I3 walks the coefs table as a 20-byte circular buffer (L3 = 20), with
  * five 32-bit loads per loop iteration; L3 is reset to 0 before returning.
  * TMP0 and TMP1 are spill slots in the frame created by "link 16"; TMP2 is
  * written once below and is not read anywhere in this function.
  * R7:4 and P5:3 are saved on entry and restored on exit.
  */
 DEFUN(idct,mL1,
        (int16_t *block)):

 /********************** Function Prologue *********************************/
    link 16;
    [--SP] = (R7:4, P5:3);   // Push the registers onto the stack.
    B0 = R0;                 // Pointer to Input matrix
    RELOC(R1, P3, coefs);    // Pointer to Coefficients
    RELOC(R2, P3, vtmp);     // Pointer to Temporary matrix
    B3 = R1;
    B2 = R2;
    L3 = 20;                // L3 is used for making the coefficient array
                            // circular.
                            // MUST BE RESTORED TO ZERO at function exit.
    M1 = 16 (X);            // All these registers are initialized for
    M3 = 8(X);              // modifying address offsets.

    I0 = B0;                // I0 points to Input Element (0, 0).
    I2 = B0;                // I2 points to Input Element (0, 0).
    I2 += M3 || R0.H = W[I0];
                            // Element 0 is read into R0.H
    I1 = I2;                // I1 points to input Element (0, 6).
    I1 += 4  || R0.L = W[I2++];
                            // I2 points to input Element (0, 4).
                            // Element 4 is read into R0.L.
    P2 = 8 (X);
    P3 = 32 (X);
    P4 = -32 (X);
    P5 = 98 (X);
    R7 = 0x8000(Z);
    I3 = B3;                // I3 points to Coefficients
    P0 = B2;                // P0 points to array Element (0, 0) of temp
    P1 = B2;
    R7 = [I3++] || [TMP2]=R7;            // Coefficient C4 is read into R7.H and R7.L.
    MNOP;
    NOP;

    /*
     *   A1 =      Y0 * cos(pi/4)
     *   A0 =      Y0 * cos(pi/4)
     *   A1 = A1 + Y4 * cos(pi/4)
     *   A0 = A0 - Y4 * cos(pi/4)
     *   load:
     *     R1=(Y2,Y6)
     *     R7=(C2,C6)
     *   res:
     *     R3=Y0, R2=Y4
     */
    A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || I0+= 4       || R1.L=W[I1++];
    R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];

    LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts

    P2 = 112 (X);
    P1 = P1 + P2;           // P1 points to element (7, 0) of temp buffer.
    P2 = -94(X);

 .0:
       /*
        *   A1 =      Y2 * cos(3pi/8)
        *   A0 =      Y2 * cos(pi/8)
        *   A1 = A1 - Y6 * cos(pi/8)
        *   A0 = A0 + Y6 * cos(3pi/8)
        *      R5 = (Y1,Y7)
        *      R7 = (C1,C7)
        *   res:
        *      R1=Y2, R0=Y6
        */
        A1=R7.L*R1.H,       A0=R7.H*R1.H (IS)        || I0+=4        || R5.H=W[I0];
        R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS)  || R5.L=W[I1--] || R7=[I3++];
        /*
        *   Y0 = Y0 + Y6.
        *   Y4 = Y4 + Y2.
        *   Y2 = Y4 - Y2.
        *   Y6 = Y0 - Y6.
        *     R3 is saved
        *     R6.l=Y3
        * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
        */
        R3=R3+R0, R0=R3-R0;
        R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
        /*
         *  Compute the odd portion (1,3,5,7) even is done.
         *
         *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
         *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
         *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
         *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
         */
        //  R5=(Y1,Y7)  R6=(Y5,Y3)                                                   // R7=(C1,C7)
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || [TMP1]=R2 || R6.H=W[I2--];
        A1-=R7.H*R5.L,       A0+=R7.L*R5.L (IS)       || I0-=4     || R7=[I3++];
        A1+=R7.H*R6.H,       A0+=R7.L*R6.H (IS)       || I0+=M1;                     // R7=(C3,C5)
        R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || R4=[TMP0];
        A1+=R7.H*R5.L,       A0-=R7.L*R5.L (IS)       || I1+=M1    || R7=[I3++];     // R7=(C1,C7)
        A1+=R7.L*R6.H,       A0-=R7.H*R6.H (IS);
        R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
        // R3=Y1, R2=Y7, R7=Y5, R6=Y3

        /* Transpose write column. */
        R5.H=R4+R2 (RND12);                                   // Y0=Y0+Y7
        R5.L=R4-R2 (RND12) || R4 = [TMP1];                    // Y7=Y7-Y0
        R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H;                 // Y2=Y2+Y5 st Y0
        R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++];    // Y5=Y2-Y5 st Y7
        R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
        R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
        R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
        R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6

        /* pipeline loop start, + drain Y3, Y4 */
        A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || W[P0++P2]= R3.H || R1.H = W[I0--];
 .1:     R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];


    /* Second pass: the input is now the temporary matrix (B2) and the
       output is written back to the caller's block (B0). */
    I0 = B2;                // I0 points to Input Element (0, 0)
    I2 = B2;                // I2 points to Input Element (0, 0)
    I2 += M3 || R0.H = W[I0];
                            // Y0 is read in R0.H
    I1 = I2;                // I1 points to input Element (0, 6)
    I1 += 4  || R0.L = W[I2++];
                            // I2 points to input Element (0, 4)
                            // Y4 is read in R0.L
    P2 = 8 (X);
    I3 = B3;                // I3 points to Coefficients
    P0 = B0;                // P0 points to array Element (0, 0) for writing
                            // output
    P1 = B0;
    R7 = [I3++];            // R7.H = C4 and R7.L = C4
    NOP;

    /*
     *   A1 =      Y0 * cos(pi/4)
     *   A0 =      Y0 * cos(pi/4)
     *   A1 = A1 + Y4 * cos(pi/4)
     *   A0 = A0 - Y4 * cos(pi/4)
     *   load:
     *     R1=(Y2,Y6)
     *     R7=(C2,C6)
     *   res:
     *     R3=Y0, R2=Y4
     */
    A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || I0+=4        || R1.L=W[I1++];
    R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];

    LSETUP (.2, .3) LC0 = P2; // perform 8 1d idcts
    P2 = 112 (X);
    P1 = P1 + P2;
    P2 = -94(X);

 .2:
        /*
         *   A1 =      Y2 * cos(3pi/8)
         *   A0 =      Y2 * cos(pi/8)
         *   A1 = A1 - Y6 * cos(pi/8)
         *   A0 = A0 + Y6 * cos(3pi/8)
         *      R5 = (Y1,Y7)
         *      R7 = (C1,C7)
         *   res:
         *      R1=Y2, R0=Y6
         */
        A1=R7.L*R1.H,       A0=R7.H*R1.H (IS)        || I0+=4        || R5.H=W[I0];
        R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS)  || R5.L=W[I1--] || R7=[I3++];
        /*
        *   Y0 = Y0 + Y6.
        *   Y4 = Y4 + Y2.
        *   Y2 = Y4 - Y2.
        *   Y6 = Y0 - Y6.
        *     R3 is saved
        *     R6.l=Y3
        * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
        */
        R3=R3+R0, R0=R3-R0;
        R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
        /*
         *  Compute the odd portion (1,3,5,7) even is done.
         *
         *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
         *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
         *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
         *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
         */
        //  R5=(Y1,Y7)  R6=(Y5,Y3)                                                   // R7=(C1,C7)
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || [TMP1]=R2 || R6.H=W[I2--];
        A1-=R7.H*R5.L,       A0+=R7.L*R5.L (IS)       || I0-=4     || R7=[I3++];
        A1+=R7.H*R6.H,       A0+=R7.L*R6.H (IS)       || I0+=M1;                     // R7=(C3,C5)
        R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || R4=[TMP0];
        A1+=R7.H*R5.L,       A0-=R7.L*R5.L (IS)       || I1+=M1    || R7=[I3++];     // R7=(C1,C7)
        A1+=R7.L*R6.H,       A0-=R7.H*R6.H (IS);
        R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
        // R3=Y1, R2=Y7, R7=Y5, R6=Y3

        /* Transpose write column. */
        R5.H=R4+R2 (RND20);                                   // Y0=Y0+Y7
        R5.L=R4-R2 (RND20) || R4 = [TMP1];                    // Y7=Y7-Y0
        R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H;                 // Y2=Y2+Y5 st Y0
        R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++];    // Y5=Y2-Y5 st Y7
        R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
        R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
        R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
        R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6

        /* pipeline loop start, + drain Y3, Y4 */
        A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || W[P0++P2]= R3.H || R1.H = W[I0--];
 .3:     R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];

    L3 = 0;                 // Leave circular addressing on I3 disabled again.
    (R7:4,P5:3)=[SP++];
    unlink;
    RTS;
 DEFUN_END(idct)
--- a/libavcodec/bfin/mathops.h
+++ b/libavcodec/bfin/mathops.h
@@ -1,44 +0,0 @@
 /*
 * simple math operations
 *
 * Copyright (C) 2007 Marc Hoffman <mmhoffm@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 #ifndef AVCODEC_BFIN_MATHOPS_H
 #define AVCODEC_BFIN_MATHOPS_H

 #include "config.h"

 /*
  * MULH(X, Y): multiply two 32-bit values and yield an int assembled from
  * 16-bit partial products in the A0/A1 accumulators:
  *   A1 = (lo(Y) * lo(X)) >> 16, plus hi(Y) * lo(X) and hi(X) * lo(Y),
  *        then shifted right arithmetically by 16;
  *   A0 = hi(X) * hi(Y);
  *   result = A0 + A1.
  * This appears to be the upper 32 bits of the signed 64-bit product --
  * NOTE(review): confirm the (FU) and (IS,M) multiplier modes against the
  * Blackfin programming reference.
  * X and Y must be in data registers ("d" constraint); A0 and A1 are
  * declared as clobbered.  Uses a GNU statement expression.
  */
 #define MULH(X,Y) ({ int xxo;                           \
    __asm__ (                                               \
        "a1 = %2.L * %1.L (FU);\n\t"                    \
        "a1 = a1 >> 16;\n\t"                            \
        "a1 += %2.H * %1.L (IS,M);\n\t"                 \
        "a0 = %1.H * %2.H, a1+= %1.H * %2.L (IS,M);\n\t"\
        "a1 = a1 >>> 16;\n\t"                           \
        "%0 = (a0 += a1);\n\t"                          \
        : "=d" (xxo) : "d" (X), "d" (Y) : "A0","A1"); xxo; })

 /*
  * signed 16x16 -> 32 multiply
  *
  * Multiplies the low 16-bit halves of a and b in signed integer mode (is)
  * and yields the 32-bit product as an int.  Both inputs must be in data
  * registers ("d"); A1 is declared as clobbered.
  * NOTE(review): the "W" output constraint restricts which register may
  * receive the result -- see the GCC Blackfin machine constraints.
  */
 #define MUL16(a, b) ({ int xxo;                         \
    __asm__ (                                               \
       "%0 = %1.l*%2.l (is);\n\t"                       \
       : "=W" (xxo) : "d" (a), "d" (b) : "A1");         \
    xxo; })

 #endif /* AVCODEC_BFIN_MATHOPS_H */
--- a/libavcodec/bfin/pixels.S
+++ b/libavcodec/bfin/pixels.S
@@ -1,207 +0,0 @@
 /*
 * Blackfin Pixel Operations
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/bfin/asm.h"

 /*
  motion compensation
  primitives

     * Halfpel motion compensation with rounding (a+b+1)>>1.
     * This is an array[4][4] of motion compensation functions for 4
     * horizontal blocksizes (8,16) and the 4 halfpel positions.
     * *pixels_tab[ 0->16xH 1->8xH ][ xhalfpel + 2*yhalfpel ]
     * @param block destination where the result is stored
     * @param pixels source
     * @param line_size number of bytes in a horizontal line of block
     * @param h height

 */

 /*
  * 8-byte-wide, h-row byte average of two sources, written to block.
  *
  * block, s0 and s1 arrive in R0-R2; dest_size, line_size and h are read
  * from the stack.  dest_size is the destination stride and line_size the
  * stride shared by both sources.  R7:6 are saved and restored.
  *
  * The hardware loop (LC0 = h) is software pipelined: the first word of
  * each source row is loaded before the loop, and the tail of the body
  * preloads the first word of the following row.
  */
 DEFUN(put_pixels8uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[sp+12];   // dest_size
        r2=[sp+16];   // line_size
        p0=[sp+20];   // h
        [--sp] = (r7:6);
        r0+=-4;
        m3=r0;        // m3 = dest_size - 4: dest step after the 2nd word store
        r2+=-8;
        m0=r2;        // m0 = line_size - 8: source step after the 3rd word load
        LSETUP(pp8$0,pp8$1) LC0=P0;
        // Prime the pipeline with the first word of each source row.
        DISALGNEXCPT                || R0 = [I0++]  || R2  =[I1++];

 pp8$0:  DISALGNEXCPT                || R1 = [I0++]  || R3  =[I1++];
        // Byte averages of the two sources; the (R) form uses the reversed
        // register pair order and yields the second group of four bytes.
        R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0]|| R2  =[I1++M0];
        R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]  || [I3++] = R6 ;
 pp8$1:  DISALGNEXCPT                || R2 = [I1++]  || [I3++M3] = R7;

        (r7:6) = [sp++];
        RTS;
 DEFUN_END(put_pixels8uc)

 /*
  * 16-byte-wide, h-row byte average of two sources, written to block.
  *
  * block, s0 and s1 arrive in R0-R2; dest_size, line_size and h are read
  * relative to FP after "link 0".  dest_size is the destination stride and
  * line_size the stride shared by both sources.  R7:6 are saved and
  * restored.
  *
  * Each iteration of the hardware loop (LC0 = h) produces one row as four
  * 32-bit stores; the first word of each source row is loaded by the
  * previous iteration (or before the loop for the first row).
  */
 DEFUN(put_pixels16uc,mL1,
        (uint8_t *block, const uint8_t *s0, const uint8_t *s1,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:6);
        i3=r0;        // dest
        i0=r1;        // src0
        i1=r2;        // src1
        r0=[fp+20];   // dest_size
        r2=[fp+24];   // line_size
        p0=[fp+28];   // h


        r0+=-12;
        m3=r0;        // m3 = dest_size - 12: dest step after the 4th word store
        r2+=-16;
        m0=r2;        // m0 = line_size - 16: source step after the 5th word load

        LSETUP(pp16$0,pp16$1) LC0=P0;
         // Prime the pipeline with the first word of each source row.
         DISALGNEXCPT                || R0 = [I0++]   || R2  =[I1++];

 pp16$0:  DISALGNEXCPT                || R1 = [I0++]   || R3  =[I1++];
         // First 8 output bytes.
         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++]   || R2  =[I1++];
         R7 = BYTEOP1P(R1:0,R3:2)(R) || R1 = [I0++]   || R3  =[I1++];
         [I3++] = R6;
         // Last 8 output bytes, then preload the next row's first words.
         R6 = BYTEOP1P(R1:0,R3:2)    || R0 = [I0++M0] || R2  =[I1++M0];
         R7 = BYTEOP1P(R1:0,R3:2)(R) || R0 = [I0++]   || [I3++] = R7 ;
         [I3++] = R6;
 pp16$1:  DISALGNEXCPT                || R2 = [I1++]   || [I3++M3] = R7;

        (r7:6) = [sp++];
        unlink;
        RTS;
 DEFUN_END(put_pixels16uc)

 /*
  * 16-byte-wide, h-row diagonal (xy2) half-pel put.
  *
  * block, s0 and dest_size arrive in R0-R2; line_size and h are read
  * relative to FP after "link 0".  The second source pointer is derived as
  * s0 + line_size.  R7:4 are saved and restored; B0, B1 and B3 are used as
  * scratch to remember the starting pointers between the two passes.
  *
  * The work is done in two hardware loops of h iterations each:
  *   pass 1 (LS$16E..LE$16E) runs BYTEOP2P with RNDL on the two rows and
  *          stores the results to the destination;
  *   pass 2 (LS$16O..LE$16O) restarts with both source pointers advanced
  *          by one byte, runs BYTEOP2P with RNDH, and merges each result
  *          into the word already in the destination with a dual 16-bit
  *          add (+|+) before storing it back.
  * Presumably RNDL/RNDH place their results in the low/high byte of each
  * 16-bit half so that the two passes interleave into consecutive output
  * bytes -- confirm against the Blackfin programming reference.
  */
 DEFUN(z_put_pixels16_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size (added below)
        r2+=-12;
        m2=r2;        // m2 = dest_size - 12: dest step after the 4th word
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-16;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 16: source step after the 5th word load */

        // Remember the start pointers for the second pass.
        B0 = I0;
        B1 = I1;
        B3 = I3;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];

        LSETUP(LS$16E,LE$16E) LC0=P0;
 LS$16E: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R1 = [I0++] || [I3++] = R4 ;
        DISALGNEXCPT                       || R3 = [I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++] || [I3++] = R4 ;
 LE$16E: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        // Second pass: rewind all pointers, then offset both sources by one byte.
        M1 = 1;
        I3 = B3;
        I1 = B1;
        I0 = B0;

        I0 += M1;
        I1 += M1;

        DISALGNEXCPT                       || R0 = [I0++] || R2  =[I1++];
        LSETUP(LS$16O,LE$16O) LC0=P0;
 LS$16O: DISALGNEXCPT                       || R1 = [I0++] || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++] || R2  =[I1++];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R1 = [I0++] || R6  =[I3++];
        // R6/R7 hold the two destination words written by pass 1; the
        // [I3--] read steps I3 back so the merged words overwrite them.
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
        DISALGNEXCPT                       || R3  =[I1++] || [I3++] = R5;
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0]|| R2  = [I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++] || R6 = [I3++];
        R4 = R4 +|+ R6                       || R7 = [I3--];
        R5 = R5 +|+ R7                       || [I3++] = R4;
 LE$16O: DISALGNEXCPT                       || R2 = [I1++] || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(z_put_pixels16_xy2)

 /*
  * 8-byte-wide, h-row diagonal (xy2) half-pel put.
  *
  * block, s0 and dest_size arrive in R0-R2; line_size and h are read
  * relative to FP after "link 0".  The second source pointer is derived as
  * s0 + line_size.  R7:4 are saved and restored; B0, B1 and B3 are used as
  * scratch to remember the starting pointers between the two passes.
  *
  * The work is done in two hardware loops of h iterations each:
  *   pass 1 (LS$8E..LE$8E) runs BYTEOP2P with RNDL on the two rows and
  *          stores the results to the destination;
  *   pass 2 (LS$8O..LE$8O) restarts with both source pointers advanced by
  *          one byte, runs BYTEOP2P with RNDH, and merges each result into
  *          the word already in the destination with a dual 16-bit add
  *          (+|+) before storing it back.
  * Presumably RNDL/RNDH place their results in the low/high byte of each
  * 16-bit half so that the two passes interleave into consecutive output
  * bytes -- confirm against the Blackfin programming reference.
  */
 DEFUN(z_put_pixels8_xy2,mL1,
        (uint8_t *block, const uint8_t *s0,
                 int dest_size, int line_size, int h)):
        link 0;
        [--sp] = (r7:4);
        i3=r0;        // dest
        i0=r1;        // src0--> pixels
        i1=r1;        // src1--> pixels + line_size (added below)
        r2+=-4;
        m2=r2;        // m2 = dest_size - 4: dest step after the 2nd word
        r2=[fp+20];
        m3=r2;        // line_size
        p0=[fp+24];   // h
        r2+=-8;
        i1+=m3;       /* src1 + line_size */
        m0=r2;        /* line_size - 8: source step after the 3rd word load */

        // Remember the start pointers for the second pass.
        b0 = I0;
        b1 = I1;
        b3 = I3;

        LSETUP(LS$8E,LE$8E) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
 LS$8E:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDL)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDL,R) || R0 = [I0++]   || [I3++] = R4 ;
 LE$8E:  DISALGNEXCPT                       || R2 = [I1++]   || [I3++M2] = R5;

        // Second pass: rewind all pointers, then offset both sources by one byte.
        M1 = 1;
        I3 = b3;
        I1 = b1;
        I0 = b0;

        I0 += M1;
        I1 += M1;

        LSETUP(LS$8O,LE$8O) LC0=P0;
        DISALGNEXCPT                       || R0 = [I0++]   || R2  =[I1++];
 LS$8O:  DISALGNEXCPT                       || R1 = [I0++]   || R3  =[I1++];
        R4 = BYTEOP2P (R3:2,R1:0) (RNDH)   || R0 = [I0++M0] || R2  =[I1++M0];
        R5 = BYTEOP2P (R3:2,R1:0) (RNDH,R) || R0 = [I0++]   || R6  =[I3++];
        // R6/R7 hold the two destination words written by pass 1; the
        // [I3--] read steps I3 back so the merged words overwrite them.
        R4 = R4 +|+ R6                                      || R7 = [I3--];
        R5 = R5 +|+ R7                                      || [I3++] = R4;
 LE$8O:  DISALGNEXCPT                       || R2  =[I1++]   || [I3++M2] = R5;

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(z_put_pixels8_xy2)
--- a/libavcodec/bfin/pixels.h
+++ b/libavcodec/bfin/pixels.h
@@ -1,42 +0,0 @@
 /*
 * BlackFin DSPUTILS COMMON OPTIMIZATIONS HEADER
 *
 * Copyright (C) 2007 Marc Hoffman <mmh@pleasantst.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVCODEC_BFIN_PIXELS_H
 #define AVCODEC_BFIN_PIXELS_H

 #include <stdint.h>

 #include "libavutil/bfin/attributes.h"

 /*
  * Diagonal (xy2) half-pel put for 16- and 8-pixel-wide blocks of h rows.
  * block is the destination and s0 the source; dest_size and line_size are
  * passed separately, presumably as the destination and source strides --
  * confirm against the assembly implementation.
  */
 void ff_bfin_z_put_pixels16_xy2(uint8_t *block, const uint8_t *s0,
                                int dest_size, int line_size, int h) attribute_l1_text;
 void ff_bfin_z_put_pixels8_xy2(uint8_t *block, const uint8_t *s0,
                               int dest_size, int line_size, int h) attribute_l1_text;

 /*
  * Two-source put for 8- and 16-pixel-wide blocks of h rows: block receives
  * a result computed from s0 and s1.  dest_size and line_size are passed
  * separately, presumably as the destination stride and the stride shared
  * by both sources -- confirm against the assembly implementation.
  */
 void ff_bfin_put_pixels8uc(uint8_t *block, const uint8_t *s0,
                           const uint8_t *s1, int dest_size, int line_size,
                           int h) attribute_l1_text;
 void ff_bfin_put_pixels16uc(uint8_t *block, const uint8_t *s0,
                            const uint8_t *s1, int dest_size, int line_size,
                            int h) attribute_l1_text;

 #endif /* AVCODEC_BFIN_PIXELS_H */
--- a/libavcodec/bfin/vp3dsp.S
+++ b/libavcodec/bfin/vp3dsp.S
@@ -1,273 +0,0 @@
 /*
 * vp3_idct BlackFin
 *
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
 /*
   This blackfin DSP code implements an 8x8 inverse type II DCT.

 Prototype       : void ff_bfin_vp3_idct(int16_t *in)

 Registers Used  : A0, A1, R0-R7, I0-I3, B0, B2, B3, M1, M3, L3, P0-P5, LC0.

 */

 #include "libavutil/bfin/asm.h"

 SECTION_L1_DATA_B

 /*
  * Cosine table used by vp3_idct: ten 16-bit entries (20 bytes).
  * The values are cosines scaled by 2^15 (0x5a82 ~= cos(pi/4) * 32768).
  * vp3_idct reads the table through I3 with L3 = 20, i.e. as a circular
  * buffer, and loads two entries (one 32-bit word) at a time.
  */
 .align 4;
 coefs:
 .short 0x5a82;           //  C4
 .short 0x5a82;           //  C4
 .short 0x30FC;           //cos(3pi/8)  C6
 .short 0x7642;           //cos(pi/8)   C2
 .short 0x18F9;           //cos(7pi/16)
 .short 0x7D8A;           //cos(pi/16)
 .short 0x471D;           //cos(5pi/16)
 .short 0x6A6E;           //cos(3pi/16)
 .short 0x18F9;           //cos(7pi/16)
 .short 0x7D8A;           //cos(pi/16)

 SECTION_L1_DATA_A

 /* Intermediate matrix between the two passes of vp3_idct (256 bytes reserved). */
 vtmp: .space 256

 /* FP-relative scratch slots inside the frame that vp3_idct reserves with "link 16". */
 #define TMP0 FP-8
 #define TMP1 FP-12
 #define TMP2 FP-16


 .text
 /*
  * vp3_idct: in-place 8x8 inverse DCT on the int16_t block passed in R0.
  *
  * Two passes of eight 1-D transforms each (LSETUP with LC0 = 8):
  *   pass 1 reads the caller's block (B0) and writes into vtmp (B2),
  *          rounding with RND12;
  *   pass 2 reads vtmp (B2) and writes back into the caller's block (B0),
  *          rounding with RND20 followed by an arithmetic shift >>> 2.
  * Each pass stores its results transposed ("Transpose write column").
  *
  * I3 walks the coefs table as a circular buffer (L3 = 20 bytes); L3 is
  * reset to 0 before returning.  R7:4 and P5:3 are saved and restored.
  * Both loops are software pipelined: the last two instructions of an
  * iteration already start the even part of the next column.
  */
 DEFUN(vp3_idct,mL1,
        (int16_t *block)):

 /********************** Function Prologue *********************************/
    link 16;                 // 16 bytes of frame, addressed through TMP0..TMP2
    [--SP] = (R7:4, P5:3);   // Push the registers onto the stack.
    B0 = R0;                 // Pointer to Input matrix
    RELOC(R1, P3, coefs);    // Pointer to Coefficients
    RELOC(R2, P3, vtmp);     // Pointer to Temporary matrix
    B3 = R1;
    B2 = R2;
    L3 = 20;                // L3 is used for making the coefficient array
                            // circular.
                            // MUST BE RESTORED TO ZERO at function exit.
    M1 = 16 (X);            // All these registers are initialized for
    M3 = 8(X);              // modifying address offsets.

    I0 = B0;                // I0 points to Input Element (0, 0).
    I2 = B0;                // I2 points to Input Element (0, 0).
    I2 += M3 || R0.H = W[I0];
                            // Element 0 is read into R0.H
    I1 = I2;                // I1 points to input Element (0, 6).
    I1 += 4  || R0.L = W[I2++];
                            // I2 points to input Element (0, 4).
                            // Element 4 is read into R0.L.
    P2 = 8 (X);
    P3 = 32 (X);
    P4 = -32 (X);
    P5 = 98 (X);
    R7 = 0x8000(Z);
    I3 = B3;                // I3 points to Coefficients
    P0 = B2;                // P0 points to array Element (0, 0) of temp
    P1 = B2;
    R7 = [I3++] || [TMP2]=R7;            // Coefficient C4 is read into R7.H and R7.L.
    MNOP;
    NOP;

    /*
     *   A1 =      Y0 * cos(pi/4)
     *   A0 =      Y0 * cos(pi/4)
     *   A1 = A1 + Y4 * cos(pi/4)
     *   A0 = A0 - Y4 * cos(pi/4)
     *   load:
     *     R1=(Y2,Y6)
     *     R7=(C2,C6)
     *   res:
     *     R3=Y0, R2=Y4
     */
    A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || I0+= 4       || R1.L=W[I1++];
    R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];

    LSETUP (.0, .1) LC0 = P2; // perform 8 1d idcts

    P2 = 112 (X);
    P1 = P1 + P2;           // P1 points to element (7, 0) of temp buffer.
    P2 = -94(X);

 .0:
       /*
        *   A1 =      Y2 * cos(3pi/8)
        *   A0 =      Y2 * cos(pi/8)
        *   A1 = A1 - Y6 * cos(pi/8)
        *   A0 = A0 + Y6 * cos(3pi/8)
        *      R5 = (Y1,Y7)
        *      R7 = (C1,C7)
        *   res:
        *      R1=Y2, R0=Y6
        */
        A1=R7.L*R1.H,       A0=R7.H*R1.H (IS)        || I0+=4        || R5.H=W[I0];
        R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS)  || R5.L=W[I1--] || R7=[I3++];
        /*
        *   Y0 = Y0 + Y6.
        *   Y4 = Y4 + Y2.
        *   Y2 = Y4 - Y2.
        *   Y6 = Y0 - Y6.
        *     R3 is saved
        *     R6.l=Y3
        * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
        */
        R3=R3+R0, R0=R3-R0;
        R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
        /*
         *  Compute the odd portion (1,3,5,7) even is done.
         *
         *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
         *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
         *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
         *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
         */
        //  R5=(Y1,Y7)  R6=(Y5,Y3)                                                   // R7=(C1,C7)
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || [TMP1]=R2 || R6.H=W[I2--];
        A1-=R7.H*R5.L,       A0+=R7.L*R5.L (IS)       || I0-=4     || R7=[I3++];
        A1+=R7.H*R6.H,       A0+=R7.L*R6.H (IS)       || I0+=M1;                     // R7=(C3,C5)
        R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || R4=[TMP0];
        A1+=R7.H*R5.L,       A0-=R7.L*R5.L (IS)       || I1+=M1    || R7=[I3++];     // R7=(C1,C7)
        A1+=R7.L*R6.H,       A0-=R7.H*R6.H (IS);
        R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
        // R3=Y1, R2=Y7, R7=Y5, R6=Y3

        /* Transpose write column. */
        R5.H=R4+R2 (RND12);                                   // Y0=Y0+Y7
        R5.L=R4-R2 (RND12) || R4 = [TMP1];                    // Y7=Y7-Y0
        R2.H=R1+R7 (RND12) || W[P0++P3]=R5.H;                 // Y2=Y2+Y5 st Y0
        R2.L=R1-R7 (RND12) || W[P1++P4]=R5.L || R7=[I3++];    // Y5=Y2-Y5 st Y7
        R5.H=R0-R3 (RND12) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
        R5.L=R0+R3 (RND12) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
        R3.H=R4-R6 (RND12) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
        R3.L=R4+R6 (RND12) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6

        /* pipeline loop start, + drain Y3, Y4 */
        A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || W[P0++P2]= R3.H || R1.H = W[I0--];
 .1:     R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];

    /*
     * Second pass: same structure as the first, but the input is now the
     * temporary matrix (B2) and the output goes to the caller's block (B0).
     */

    I0 = B2;                // I0 points to Input Element (0, 0)
    I2 = B2;                // I2 points to Input Element (0, 0)
    I2 += M3 || R0.H = W[I0];
                            // Y0 is read in R0.H
    I1 = I2;                // I1 points to input Element (0, 6)
    I1 += 4  || R0.L = W[I2++];
                            // I2 points to input Element (0, 4)
                            // Y4 is read in R0.L
    P2 = 8 (X);
    I3 = B3;                // I3 points to Coefficients
    P0 = B0;                // P0 points to array Element (0, 0) for writing
                            // output
    P1 = B0;
    R7 = [I3++];            // R7.H = C4 and R7.L = C4
    NOP;

    /*
     *   A1 =      Y0 * cos(pi/4)
     *   A0 =      Y0 * cos(pi/4)
     *   A1 = A1 + Y4 * cos(pi/4)
     *   A0 = A0 - Y4 * cos(pi/4)
     *   load:
     *     R1=(Y2,Y6)
     *     R7=(C2,C6)
     *   res:
     *     R3=Y0, R2=Y4
     */
    A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || I0+=4        || R1.L=W[I1++];
    R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || R1.H=W[I0--] || R7=[I3++];

    LSETUP (.2, .3) LC0 = P2; // perform 8 1d idcts
    P2 = 112 (X);
    P1 = P1 + P2;
    P2 = -94(X);

 .2:
        /*
         *   A1 =      Y2 * cos(3pi/8)
         *   A0 =      Y2 * cos(pi/8)
         *   A1 = A1 - Y6 * cos(pi/8)
         *   A0 = A0 + Y6 * cos(3pi/8)
         *      R5 = (Y1,Y7)
         *      R7 = (C1,C7)
         *   res:
         *      R1=Y2, R0=Y6
         */
        A1=R7.L*R1.H,       A0=R7.H*R1.H (IS)        || I0+=4        || R5.H=W[I0];
        R1=(A1-=R7.H*R1.L), R0=(A0+=R7.L*R1.L) (IS)  || R5.L=W[I1--] || R7=[I3++];
        /*
        *   Y0 = Y0 + Y6.
        *   Y4 = Y4 + Y2.
        *   Y2 = Y4 - Y2.
        *   Y6 = Y0 - Y6.
        *     R3 is saved
        *     R6.l=Y3
        * note: R3: Y0, R2: Y4, R1: Y2, R0: Y6
        */
        R3=R3+R0, R0=R3-R0;
        R2=R2+R1, R1=R2-R1 || [TMP0]=R3 || R6.L=W[I0--];
        /*
         *  Compute the odd portion (1,3,5,7) even is done.
         *
         *  Y1 = C7 * Y1 - C1 * Y7 + C3 * Y5 - C5 * Y3.
         *  Y7 = C1 * Y1 + C7 * Y7 + C5 * Y5 + C3 * Y3.
         *  Y5 = C5 * Y1 + C3 * Y7 + C7 * Y5 - C1 * Y3.
         *  Y3 = C3 * Y1 - C5 * Y7 - C1 * Y5 - C7 * Y3.
         */
        //  R5=(Y1,Y7)  R6=(Y5,Y3)                                                   // R7=(C1,C7)
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || [TMP1]=R2 || R6.H=W[I2--];
        A1-=R7.H*R5.L,       A0+=R7.L*R5.L (IS)       || I0-=4     || R7=[I3++];
        A1+=R7.H*R6.H,       A0+=R7.L*R6.H (IS)       || I0+=M1;                     // R7=(C3,C5)
        R3 =(A1-=R7.L*R6.L), R2 =(A0+=R7.H*R6.L) (IS);
        A1 =R7.L*R5.H,       A0 =R7.H*R5.H (IS)       || R4=[TMP0];
        A1+=R7.H*R5.L,       A0-=R7.L*R5.L (IS)       || I1+=M1    || R7=[I3++];     // R7=(C1,C7)
        A1+=R7.L*R6.H,       A0-=R7.H*R6.H (IS);
        R7 =(A1-=R7.H*R6.L), R6 =(A0-=R7.L*R6.L) (IS) || I2+=M1;
        // R3=Y1, R2=Y7, R7=Y5, R6=Y3

        /* Transpose write column. */
        /* Unlike pass 1, every rounded pair is additionally shifted right
         * arithmetically by 2 (both halves) before it is stored. */
        R5.H=R4+R2 (RND20);                                   // Y0=Y0+Y7
        R5.L=R4-R2 (RND20) || R4 = [TMP1];                    // Y7=Y7-Y0
        R5=R5>>>2(v);
        R2.H=R1+R7 (RND20) || W[P0++P3]=R5.H;                 // Y2=Y2+Y5 st Y0
        R2.L=R1-R7 (RND20) || W[P1++P4]=R5.L || R7=[I3++];    // Y5=Y2-Y5 st Y7
        R2=R2>>>2(v);
        R5.H=R0-R3 (RND20) || W[P0++P3]=R2.H || R1.L=W[I1++]; // Y1=Y6-Y1 st Y2
        R5.L=R0+R3 (RND20) || W[P1++P4]=R2.L || R0.H=W[I0++]; // Y6=Y6+Y1 st Y5
        R5=R5>>>2(v);
        R3.H=R4-R6 (RND20) || W[P0++P3]=R5.H || R0.L=W[I2++]; // Y3=Y3-Y4 st Y1
        R3.L=R4+R6 (RND20) || W[P1++P4]=R5.L || R1.H=W[I0++]; // Y4=Y3+Y4 st Y6
        R3=R3>>>2(v);
        /* pipeline loop start, + drain Y3, Y4 */
        A1=R7.H*R0.H,       A0=R7.H*R0.H (IS)       || W[P0++P2]= R3.H || R1.H = W[I0--];
 .3:     R3=(A1+=R7.H*R0.L), R2=(A0-=R7.H*R0.L) (IS) || W[P1++P5]= R3.L || R7 = [I3++];

    L3 = 0;                 // disable circular addressing on I3 again
    (R7:4,P5:3)=[SP++];
    unlink;
    RTS;
 DEFUN_END(vp3_idct)
--- a/libavcodec/bfin/vp3dsp_init.c
+++ b/libavcodec/bfin/vp3dsp_init.c
@@ -1,66 +0,0 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>
 #include <string.h>

 #include "libavutil/attributes.h"
 #include "libavcodec/avcodec.h"
 #include "libavcodec/mathops.h"
 #include "libavcodec/vp3dsp.h"

 void ff_bfin_vp3_idct(int16_t *block);

 /*
  * Intra iDCT, offset 128.
  *
  * Runs ff_bfin_vp3_idct() on block in place, then writes the 8x8 result to
  * dest (row stride line_size) through the ff_crop_tab lookup biased by 128.
  * block is read transposed (block[col * 8 + row]) and is zeroed afterwards.
  */
 static void bfin_vp3_idct_put(uint8_t *dest, int line_size, int16_t *block)
 {
    const uint8_t *clamp = ff_crop_tab + MAX_NEG_CROP + 128;
    int row, col;

    ff_bfin_vp3_idct(block);

    for (row = 0; row < 8; row++) {
        uint8_t *out = dest + line_size * row;

        for (col = 0; col < 8; col++)
            out[col] = clamp[block[col * 8 + row]];
    }

    /* 64 coefficients of 2 bytes each */
    memset(block, 0, 128);
 }

 /*
  * Inter iDCT.
  *
  * Runs ff_bfin_vp3_idct() on block in place, then adds the 8x8 result to the
  * pixels already in dest (row stride line_size); the sum is mapped through
  * the ff_crop_tab lookup.  block is read transposed (block[col * 8 + row])
  * and is zeroed afterwards.
  */
 static void bfin_vp3_idct_add(uint8_t *dest, int line_size, int16_t *block)
 {
    const uint8_t *clamp = ff_crop_tab + MAX_NEG_CROP;
    int row, col;

    ff_bfin_vp3_idct(block);

    for (row = 0; row < 8; row++) {
        uint8_t *out = dest + line_size * row;

        for (col = 0; col < 8; col++)
            out[col] = clamp[out[col] + block[col * 8 + row]];
    }

    /* 64 coefficients of 2 bytes each */
    memset(block, 0, 128);
 }

 /*
  * Install the Blackfin idct_add/idct_put implementations into c.
  * Nothing is installed when CODEC_FLAG_BITEXACT is set in flags.
  */
 av_cold void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags)
 {
    if (flags & CODEC_FLAG_BITEXACT)
        return;

    c->idct_add = bfin_vp3_idct_add;
    c->idct_put = bfin_vp3_idct_put;
 }
--- a/libavcodec/dct-test.c
+++ b/libavcodec/dct-test.c
@@ -47,10 +47,6 @@
 #include "x86/idct_xvid.h"
 #include "dctref.h"

 // BFIN
 void ff_bfin_idct(int16_t *block);
 void ff_bfin_fdct(int16_t *block);

 // ALTIVEC
 void ff_fdct_altivec(int16_t *block);

@@ -92,10 +88,6 @@ static const struct algo fdct_tab[] = {
    { "altivecfdct",    ff_fdct_altivec,       NO_PERM,   AV_CPU_FLAG_ALTIVEC },
 #endif

 #if ARCH_BFIN
    { "BFINfdct",       ff_bfin_fdct,          NO_PERM  },
 #endif

    { 0 }
 };

@@ -153,10 +145,6 @@ static const struct algo idct_tab[] = {
 #endif
 #endif

 #if ARCH_BFIN
    { "BFINidct",       ff_bfin_idct,          NO_PERM  },
 #endif

 #if ARCH_ARM
    { "SIMPLE-ARM",     ff_simple_idct_arm,    NO_PERM  },
    { "INT-ARM",        ff_j_rev_dct_arm,      MMX_PERM },
--- a/libavcodec/dsputil.c
+++ b/libavcodec/dsputil.c
@@ -1727,8 +1727,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx)
        ff_dsputil_init_alpha(c, avctx);
    if (ARCH_ARM)
        ff_dsputil_init_arm(c, avctx, high_bit_depth);
    if (ARCH_BFIN)
        ff_dsputil_init_bfin(c, avctx, high_bit_depth);
    if (ARCH_PPC)
        ff_dsputil_init_ppc(c, avctx, high_bit_depth);
    if (ARCH_X86)
--- a/libavcodec/dsputil.h
+++ b/libavcodec/dsputil.h
@@ -252,8 +252,6 @@ void ff_set_cmp(DSPContext *c, me_cmp_func *cmp, int type);
 void ff_dsputil_init_alpha(DSPContext* c, AVCodecContext *avctx);
 void ff_dsputil_init_arm(DSPContext *c, AVCodecContext *avctx,
                         unsigned high_bit_depth);
 void ff_dsputil_init_bfin(DSPContext *c, AVCodecContext *avctx,
                          unsigned high_bit_depth);
 void ff_dsputil_init_ppc(DSPContext *c, AVCodecContext *avctx,
                         unsigned high_bit_depth);
 void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
--- a/libavcodec/hpeldsp.c
+++ b/libavcodec/hpeldsp.c
@@ -361,8 +361,6 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
        ff_hpeldsp_init_alpha(c, flags);
    if (ARCH_ARM)
        ff_hpeldsp_init_arm(c, flags);
    if (ARCH_BFIN)
        ff_hpeldsp_init_bfin(c, flags);
    if (ARCH_PPC)
        ff_hpeldsp_init_ppc(c, flags);
    if (ARCH_X86)
--- a/libavcodec/hpeldsp.h
+++ b/libavcodec/hpeldsp.h
@@ -97,7 +97,6 @@ void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_aarch64(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags);

--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -289,8 +289,6 @@ av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)

    if (ARCH_ARM)
        ff_vp3dsp_init_arm(c, flags);
    if (ARCH_BFIN)
        ff_vp3dsp_init_bfin(c, flags);
    if (ARCH_PPC)
        ff_vp3dsp_init_ppc(c, flags);
    if (ARCH_X86)
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -47,7 +47,6 @@ typedef struct VP3DSPContext {

 void ff_vp3dsp_init(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags);
 void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags);

--- a/libavutil/bfin/asm.h
+++ b/libavutil/bfin/asm.h
@@ -1,54 +0,0 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVUTIL_BFIN_ASM_H
 #define AVUTIL_BFIN_ASM_H

 #include "config.h"

 /* Section name for code that stays in ordinary .text. */
 #define mL3 .text

 /*
  * mL1 and the SECTION_L1_DATA_* directives select the .l1.* sections only
  * when building FDPIC with CONFIG_SRAM.  Otherwise mL1 falls back to mL3,
  * SECTION_L1_DATA_B becomes plain .data and SECTION_L1_DATA_A expands to
  * nothing (the current section is left unchanged).
  */
 #if defined(__FDPIC__) && CONFIG_SRAM
 #define mL1 .l1.text
 #define SECTION_L1_DATA_A .section .l1.data.A,"aw",@progbits
 #define SECTION_L1_DATA_B .section .l1.data.B,"aw",@progbits
 #else
 #define mL1 mL3
 #define SECTION_L1_DATA_A
 #define SECTION_L1_DATA_B .data
 #endif

 /*
  * DEFUN(fname, where, interface): open a global function _ff_bfin_<fname>
  * in section `where`, aligned with ".align 8".  The expansion ends with the
  * bare symbol name, so the use site supplies the trailing ':' that turns it
  * into a label.  `interface` does not appear in the expansion; it only
  * documents the C prototype at the use site.
  */
 #define DEFUN(fname, where, interface)          \
    .section where;                             \
    .global _ff_bfin_ ## fname;                 \
    .type _ff_bfin_ ## fname, STT_FUNC;         \
    .align 8;                                   \
    _ff_bfin_ ## fname

 /* DEFUN_END(fname): record the symbol size as the distance from the label to here. */
 #define DEFUN_END(fname) \
    .size _ff_bfin_ ## fname, . - _ff_bfin_ ## fname

 /*
  * RELOC(reg, got, obj): load the address of obj into reg.
  * FDPIC builds load it from memory at got + obj@GOT17M4; other builds set
  * the low and high halves of reg directly and do not use `got`.
  */
 #ifdef __FDPIC__
 #define RELOC(reg, got, obj) reg = [got + obj@GOT17M4]
 #else
 #define RELOC(reg, got, obj) reg.L = obj; reg.H = obj
 #endif

 #endif /* AVUTIL_BFIN_ASM_H */
--- a/libavutil/bfin/attributes.h
+++ b/libavutil/bfin/attributes.h
@@ -1,34 +0,0 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <mmh@pleasantst.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #ifndef AVUTIL_BFIN_ATTRIBUTES_H
 #define AVUTIL_BFIN_ATTRIBUTES_H

 #include "config.h"

 /*
  * Placement attributes: they expand to the l1_text / l1_data_B attributes
  * only for FDPIC builds with CONFIG_SRAM, and to nothing otherwise.
  */
 #if defined(__FDPIC__) && CONFIG_SRAM
 #define attribute_l1_text   __attribute__((l1_text))
 #define attribute_l1_data_b __attribute__((l1_data_B))
 #else
 #define attribute_l1_text
 #define attribute_l1_data_b
 #endif

 #endif /* AVUTIL_BFIN_ATTRIBUTES_H */
--- a/libswscale/bfin/Makefile
+++ b/libswscale/bfin/Makefile
@@ -1,3 +0,0 @@
 OBJS +=  bfin/internal_bfin.o                                           \
         bfin/swscale_bfin.o                                            \
         bfin/yuv2rgb_bfin.o                                            \
--- a/libswscale/bfin/internal_bfin.S
+++ b/libswscale/bfin/internal_bfin.S
@@ -1,599 +0,0 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *                    April 20, 2007
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */


 /*
 YUV420 to RGB565 conversion. This routine takes a YUV 420 planar macroblock
 and converts it to RGB565. R:5 bits, G:6 bits, B:5 bits.. packed into shorts.


 The following calculation is used for the conversion:

  r = clipz((y - oy) * cy  + crv * (v - 128))
  g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))
  b = clipz((y - oy) * cy  + cbu * (u - 128))

 y, u, v are prescaled by a factor of 4 i.e. left-shifted to gain precision.


 New factorization to eliminate the truncation error which was
 occurring due to the byteop3p.


 1) Use byteop16m to subtract quad bytes. We use this in U8, so the
 offsets need to be renormalized to 8 bits.

 2) Scale operands up by a factor of 4 not 8 because Blackfin
   multiplies include a shift.

 3) Compute into the accumulators cy * yx0, cy * yx1.

 4) Compute each of the linear equations:
     r = clipz((y - oy) * cy  + crv * (v - 128))

     g = clipz((y - oy) * cy  + cgv * (v - 128) + cgu * (u - 128))

     b = clipz((y - oy) * cy  + cbu * (u - 128))

   Reuse of the accumulators requires that we actually multiply
   twice once with addition and the second time with a subtraction.

   Because of this we need to compute the equations in the order R B
   then G saving the writes for B in the case of 24/32 bit color
   formats.

   API: yuv2rgb_kind (uint8_t *Y, uint8_t *U, uint8_t *V, int *out,
                      int dW, uint32_t *coeffs);

       A          B
       ---        ---
       i2 = cb    i3 = cr
       i1 = coeff i0 = y

 Where coeffs have the following layout in memory.

 uint32_t oy, oc, zero, cy, crv, rmask, cbu, bmask, cgu, cgv, gmask;

 coeffs is a pointer to oy.

 The {rgb} masks are only utilized by the 565 packing algorithm. Note the data
 replication is used to simplify the internal algorithms for the dual Mac
 architecture of BlackFin.

 All routines are exported with _ff_bfin_ as a symbol prefix.

 Rough performance gain compared against -O3:

 2779809/1484290 187.28%

 which translates to ~33c/pel to ~57c/pel for the reference vs 17.5
 c/pel for the optimized implementations. Not sure why there is such a
 huge variation on the reference codes on Blackfin I guess it must have
 to do with the memory system.
 */

 #include "libavutil/bfin/asm.h"

 /* Section passed to DEFUN for the routines in this file. */
 #define MEM mL1


 .text

 /*
  * COEFF_LEN: size in bytes of the coefficient table (11 32-bit words); it is
  * loaded into l1 so that i1 wraps around the table.
  * COEFF_REL_CY_OFF: byte step loaded into m0 and applied after the gmask
  * read in the middle of the loop, so that the next word read is cy.
  */
 #define COEFF_LEN        11*4
 #define COEFF_REL_CY_OFF 4*4

 /* Offsets from fp of the stack-passed arguments out, dW and coeffs. */
 #define ARG_OUT   20
 #define ARG_W     24
 #define ARG_COEFF 28

 /*
  * yuv2rgb565_line: convert one line of planar YUV to packed 16-bit pixels.
  *
  * Y, U and V arrive in r0, r1, r2 and become i0, i2, i3; out, dW and coeffs
  * are read from the stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
  *
  * The loop runs dW >> 2 times.  Each iteration consumes 4 Y bytes, 2 U bytes
  * and 2 V bytes and stores two 32-bit words through p1.  The coefficient
  * table is read through i1 as a circular buffer of COEFF_LEN bytes (b1/l1);
  * l1 is reset to 0 before returning.  r7:4 are saved and restored.
  *
  * Note: the loads for the next group of Y/U/V are issued at the end of every
  * iteration, including the last one, so the inputs are read one group past
  * the data that is actually converted.
  */
 DEFUN(yuv2rgb565_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 2Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;

        lsetup (.L0565, .L1565) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
 .L0565:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
              gggggggg    <<3
         rrrrrrrr         <<8
         rrrrrggggggbbbbb
        */
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* The gmask read post-modifies i1 by m0, so the next read is cy again. */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 3 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 8 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0   =  [i0++];        // 2Y
        r2 = r2 << 3 (v)                                   || r1.l = w[i2++];        // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h = w[i3++];        // 2v
 .L1565:                                                       r2=[i1++]; // oy

        l1 = 0;                // disable circular addressing on i1 again

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuv2rgb565_line)

 /*
  * yuv2rgb555_line: convert one line of planar YUV to packed 16-bit pixels.
  * Same structure as yuv2rgb565_line; the packing shifts are << 7 and << 2
  * here where the 565 routine uses << 8 and << 3.
  *
  * Y, U and V arrive in r0, r1, r2 and become i0, i2, i3; out, dW and coeffs
  * are read from the stack at fp+ARG_OUT, fp+ARG_W and fp+ARG_COEFF.
  *
  * The loop runs dW >> 2 times.  Each iteration consumes 4 Y bytes, 2 U bytes
  * and 2 V bytes and stores two 32-bit words through p1.  The coefficient
  * table is read through i1 as a circular buffer of COEFF_LEN bytes (b1/l1);
  * l1 is reset to 0 before returning.  r7:4 are saved and restored.
  *
  * Note: the loads for the next group of Y/U/V are issued at the end of every
  * iteration, including the last one, so the inputs are read one group past
  * the data that is actually converted.
  */
 DEFUN(yuv2rgb555_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF];
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        r0   = [i0++];         // 2Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;

        lsetup (.L0555, .L1555) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv,gmask
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
 .L0555:
        /*
        rrrrrrrr gggggggg bbbbbbbb
         5432109876543210
                    bbbbb >>3
               gggggggg   <<2
          rrrrrrrr        <<7
         xrrrrrgggggbbbbb
        */

        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);                                                // y1,y0
        r4 = r4 << 2 (v);                                                // y3,y2
        r6 = r6 << 2 (v)                                   || r0=[i1++]; // u1,u0, r0=zero
        r7 = r7 << 2 (v)                                   || r1=[i1++]; // v1,v0  r1=cy
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        /* The gmask read post-modifies i1 by m0, so the next read is cy again. */
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask
        r2 = r2 << 2 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1=[i1++]; // cy

        /* Y' = y*cy */

        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2 = r2 >> 3 (v);
        r3 = r2 & r5;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu
        r2 = r2 << 7 (v);
        r2 = r2 & r5;
        r3 = r3 | r2;

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h) || r5=[i1++]; // gmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r0=[i0++];     // 4Y
        r2 = r2 << 2 (v)                                   || r1.l=w[i2++];  // 2u
        r2 = r2 & r5;
        r3 = r3 | r2;
        [p1++]=r3                                          || r1.h=w[i3++]; // 2v

 .L1555:                                                       r2=[i1++]; // oy

        l1 = 0;                // disable circular addressing on i1 again

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuv2rgb555_line)

 /*
  * yuv2rgb24_line: convert one line of planar Y/U/V samples into packed
  * 3-byte pixels, four output pixels per hardware-loop iteration.
  *
  * Arguments as used below:
  *   r0 = Y, r1 = U, r2 = V (copied to i0, i2, i3)
  *   out, dW and coeffs are fetched from the frame (ARG_OUT, ARG_W, ARG_COEFF)
  *
  * Register roles:
  *   p1 - output pointer for the first pixel of a pair
  *   p2 - output pointer for the second pixel of a pair (p1 + 3)
  *   i1 - walks the coefficient table; b1/l1 are set to COEFF_LEN so the
  *        index wraps around the table, and l1 is cleared again on exit
  *   m0 - COEFF_REL_CY_OFF, used to step i1 back to "cy" for the second
  *        pixel pair of an iteration
  *
  * The loop count is dW >> 2.
  * NOTE(review): there is no guard for dW < 4 (loop count of 0) - confirm
  * that callers never pass such a width.
  */
 DEFUN(yuv2rgb24_line,MEM,
   (uint8_t *Y, uint8_t *U, uint8_t *V, int *out, int dW, uint32_t *coeffs)):
        link 0;
        [--sp] = (r7:4);
        p1 = [fp+ARG_OUT];
        r3 = [fp+ARG_W];
        // p2 trails p1 by one 3-byte pixel so two pixels are written in step
        p2 = p1;
        p2 += 3;

        i0 = r0;
        i2 = r1;
        i3 = r2;

        r0 = [fp+ARG_COEFF]; // coeff buffer
        i1 = r0;
        b1 = i1;
        l1 = COEFF_LEN;
        m0 = COEFF_REL_CY_OFF;
        p0 = r3;

        // pre-load the first samples; later loads happen at the loop tail
        r0   = [i0++];         // 2Y
        r1.l = w[i2++];        // 2u
        r1.h = w[i3++];        // 2v
        p0 = p0>>2;

        lsetup (.L0888, .L1888) lc0 = p0;

        /*
           uint32_t oy,oc,zero,cy,crv,rmask,cbu,bmask,cgu,cgv
           r0 -- used to load 4ys
           r1 -- used to load 2us,2vs
           r4 -- y3,y2
           r5 -- y1,y0
           r6 -- u1,u0
           r7 -- v1,v0
        */
                                                              r2=[i1++]; // oy
 .L0888:
        (r4,r5) = byteop16m (r1:0, r3:2)                   || r3=[i1++]; // oc
        (r7,r6) = byteop16m (r1:0, r3:2) (r);
        r5 = r5 << 2 (v);               // y1,y0
        r4 = r4 << 2 (v);               // y3,y2
        r6 = r6 << 2 (v) || r0=[i1++];  // u1,u0, r0=zero
        r7 = r7 << 2 (v) || r1=[i1++];  // v1,v0  r1=cy

        // ---- first pixel pair (y1,y0 with u0,v0) ----
        /* Y' = y*cy */
        a1 = r1.h*r5.h, a0 = r1.l*r5.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
                a1 -= r1.h*r7.l,          a0 -= r1.l*r7.l  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        // one byte goes to the pixel at p1, the byte from the upper half to p2
        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.l), r2.l = (a0 += r1.l*r6.l);
                a1 -= r1.h*r6.l,          a0 -= r1.l*r6.l  || r5=[i1++]; // bmask
        // result kept in r3 and stored after the G bytes below
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.l,          a0 += r1.l*r6.l  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.l), r2.l = (a0 += r1.l*r7.l);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++m0]; // gmask, oy,cy,zero

        r2=r2>>16 || B[p1++]=r2;
                     B[p2++]=r2;

        r3=r3>>16 || B[p1++]=r3;
                     B[p2++]=r3                            || r1=[i1++]; // cy

        // each pointer wrote 3 bytes; skip the 3 bytes the other one wrote
        p1+=3;
        p2+=3;
        // ---- second pixel pair (y3,y2 with u1,v1) ----
        /* Y' = y*cy */
        a1 = r1.h*r4.h, a0 = r1.l*r4.l                     || r1=[i1++]; // crv

        /* R = Y+ crv*(Cr-128) */
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
                a1 -= r1.h*r7.h,          a0 -= r1.l*r7.h  || r5=[i1++]; // rmask
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cbu
        r2=r2>>16 || B[p1++]=r2;
        B[p2++]=r2;

        /* B = Y+ cbu*(Cb-128) */
        r2.h = (a1 += r1.h*r6.h), r2.l = (a0 += r1.l*r6.h);
                a1 -= r1.h*r6.h,          a0 -= r1.l*r6.h  || r5=[i1++]; // bmask
        r3 = byteop3p(r3:2, r1:0)(LO)                      || r1=[i1++]; // cgu

        /* G = Y+ cgu*(Cb-128)+cgv*(Cr-128) */
                a1 += r1.h*r6.h,          a0 += r1.l*r6.h  || r1=[i1++]; // cgv
        r2.h = (a1 += r1.h*r7.h), r2.l = (a0 += r1.l*r7.h);
        r2 = byteop3p(r3:2, r1:0)(LO)                      || r5=[i1++]; // gmask
        // stores are paired with the loads that feed the next iteration
        r2=r2>>16 || B[p1++]=r2 || r0 = [i0++];    // 4y
                     B[p2++]=r2 || r1.l = w[i2++]; // 2u
        r3=r3>>16 || B[p1++]=r3 || r1.h = w[i3++]; // 2v
                     B[p2++]=r3 || r2=[i1++];      // oy

        p1+=3;
 .L1888: p2+=3;

        // clear l1 again (it was set to COEFF_LEN above)
        l1 = 0;

        (r7:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuv2rgb24_line)



 /*
  * Offsets from fp used by uyvytoyv12/yuyvtoyv12 below to load the arguments
  * that are not taken from r0-r2 (src, ydst and udst are read from r0, r1, r2).
  */
 #define ARG_vdst        20
 #define ARG_width       24
 #define ARG_height      28
 #define ARG_lumStride   32
 #define ARG_chromStride 36
 #define ARG_srcStride   40

 /*
  * uyvytoyv12: split packed UYVY input into separate Y, U and V planes.
  *
  * Two source rows (i0 = src, i1 = src + srcStride) are processed per outer
  * iteration; the outer loop runs height >> 1 times, the inner loop
  * width >> 2 times. Each inner iteration consumes 8 source bytes per row,
  * stores 4 luma bytes for each of the two rows and 2 bytes each to the U and
  * V planes. The chroma values come from byteop1p applied to the top and
  * bottom row data (presumably a per-byte average of the two rows - confirm
  * against the instruction reference).
  *
  * NOTE(review): p0/p1 advance by `width` inside the inner loop and then only
  * by lumStride at the end of a row pair, so they land two rows further down
  * only when width == lumStride.
  * NOTE(review): the loads at the tail of the inner loop also execute in its
  * last iteration, i.e. 8 bytes past the 2*width bytes of each row are read;
  * m0 compensates the pointers for this (the "-8" below).
  */
 DEFUN(uyvytoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         int width, int height,
                         int lumStride, int chromStride, int srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;
        i0 = r0;  // uyvy_T even
        i1 = r2;  // uyvy_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;  // outer loop count: row pairs
        p5 = p5 >> 2;  // inner loop count: groups of 4 pixels

        // m0 = 2*srcStride - 2*width - 8: moves i0/i1 from the end of the
        // current row pair to the start of the next one
        r2 = r0 << 1;
        r1 = r1 << 1;
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;  // i0,i1 is pre read need to correct
        m0 = r1;

        // m1 = chromStride - width/2: remainder of a chroma row after the
        // width/2 bytes written by the inner loop
        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
        // pre-read the first 8 bytes of both rows and combine them for chroma
 0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = byteop1p(r1:0, r3:2);
          r5 = byteop1p(r1:0, r3:2) (r);
          lsetup (2f, 3f) lc0 = p5; // W/4
          // shift the luma byte of every 16-bit half down, then pack 4 of them
 2:          r0 = r0 >> 8(v);
            r1 = r1 >> 8(v);
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r0 = bytepack(r0, r1);
            r2 = bytepack(r2, r3)         ||  [p0++] = r0;    // yyyy
            r6 = pack(r5.l, r4.l)         ||  [p1++] = r2;    // yyyy
            // the loads below fetch the data for the next inner iteration
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = byteop1p(r1:0, r3:2)     ||  w[i2++] = r6.l; // uu
 3:          r5 = byteop1p(r1:0, r3:2) (r) ||  w[i3++] = r6.h; // vv

          // step all pointers to the next row pair
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
 1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(uyvytoyv12)

 /*
  * yuyvtoyv12: split packed YUYV input into separate Y, U and V planes.
  *
  * Same structure as uyvytoyv12: two source rows per outer iteration
  * (height >> 1 iterations), 4 pixels per inner iteration (width >> 2
  * iterations). Here the luma quads are produced by bytepack on the unshifted
  * words, and the chroma bytes are shifted down (>> 8 on each 16-bit half)
  * before byteop1p combines the top and bottom rows (presumably a per-byte
  * average - confirm against the instruction reference).
  *
  * NOTE(review): p0/p1 advance by `width` inside the inner loop and then only
  * by lumStride at the end of a row pair, so they land two rows further down
  * only when width == lumStride.
  * NOTE(review): the loads at the tail of the inner loop also execute in its
  * last iteration, i.e. 8 bytes past the 2*width bytes of each row are read;
  * m0 compensates the pointers for this (the "-8" below).
  */
 DEFUN(yuyvtoyv12, mL3,  (const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                         int width, int height,
                         int lumStride, int chromStride, int srcStride)):
        link 0;
        [--sp] = (r7:4,p5:4);

        p0 = r1;       // Y top even

        i2 = r2; // *u
        r2 = [fp + ARG_vdst];
        i3 = r2; // *v

        r1 = [fp + ARG_srcStride];
        r2 = r0 + r1;

        i0 = r0;  // yuyv_T even
        i1 = r2;  // yuyv_B odd

        p2 = [fp + ARG_lumStride];
        p1 = p0 + p2;  // Y bot odd

        p5 = [fp + ARG_width];
        p4 = [fp + ARG_height];
        r0 = p5;
        p4 = p4 >> 1;  // outer loop count: row pairs
        p5 = p5 >> 2;  // inner loop count: groups of 4 pixels

        // m0 = 2*srcStride - 2*width - 8: moves i0/i1 from the end of the
        // current row pair to the start of the next one
        r2 = r0 << 1;
        r1 = r1 << 1;
        r1 = r1 - r2;  // srcStride + (srcStride - 2*width)
        r1 += -8;  // i0,i1 is pre read need to correct
        m0 = r1;

        // m1 = chromStride - width/2: remainder of a chroma row after the
        // width/2 bytes written by the inner loop
        r2 = [fp + ARG_chromStride];
        r0 = r0 >> 1;
        r2 = r2 - r0;
        m1 = r2;

        /*   I0,I1 - src input line pointers
         *   p0,p1 - luma output line pointers
         *   I2    - dstU
         *   I3    - dstV
         */

        lsetup (0f, 1f) lc1 = p4;   // H/2
        // pre-read the first 8 bytes of both rows and pack their luma bytes
 0:        r0 = [i0++] || r2 = [i1++];
          r1 = [i0++] || r3 = [i1++];
          r4 = bytepack(r0, r1);
          r5 = bytepack(r2, r3);
          lsetup (2f, 3f) lc0 = p5; // W/4
          // store the packed luma, then shift the chroma bytes down
 2:          r0 = r0 >> 8(v) || [p0++] = r4;  // yyyy-even
            r1 = r1 >> 8(v) || [p1++] = r5;  // yyyy-odd
            r2 = r2 >> 8(v);
            r3 = r3 >> 8(v);
            r4 = byteop1p(r1:0, r3:2);
            r5 = byteop1p(r1:0, r3:2) (r);
            r6 = pack(r5.l, r4.l);
            // the loads below fetch the data for the next inner iteration
            r7 = pack(r5.h, r4.h)         ||  r0 = [i0++] || r2 = [i1++];
            r6 = bytepack(r6, r7)         ||  r1 = [i0++] || r3 = [i1++];
            r4 = bytepack(r0, r1)         ||  w[i2++] = r6.l; // uu
 3:          r5 = bytepack(r2, r3)         ||  w[i3++] = r6.h; // vv

          // step all pointers to the next row pair
          i0 += m0;
          i1 += m0;
          i2 += m1;
          i3 += m1;
          p0 = p0 + p2;
 1:        p1 = p1 + p2;

        (r7:4,p5:4) = [sp++];
        unlink;
        rts;
 DEFUN_END(yuyvtoyv12)
--- a/libswscale/bfin/swscale_bfin.c
+++ b/libswscale/bfin/swscale_bfin.c
@@ -1,84 +0,0 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * Blackfin software video scaler operations
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include <stdint.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/bfin/attributes.h"
 #include "libswscale/swscale_internal.h"

 /*
  * Packed 4:2:2 to planar converters implemented outside this file.
  * Both take the packed source, the three destination planes, the picture
  * width and height, and the luma, chroma and source strides.
  * They are declared as returning int; the callers in this file ignore the
  * result.
  */
 int ff_bfin_uyvytoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
                        uint8_t *vdst, int width, int height,
                        int lumStride, int chromStride,
                        int srcStride) attribute_l1_text;

 int ff_bfin_yuyvtoyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
                        uint8_t *vdst, int width, int height,
                        int lumStride, int chromStride,
                        int srcStride) attribute_l1_text;

 static int uyvytoyv12_unscaled(SwsContext *c, const uint8_t *src[],
                                int srcStride[], int srcSliceY, int srcSliceH,
                                uint8_t *dst[], int dstStride[])
 {
    /* Byte offsets of the slice's first row within each plane. */
    const int luma_off   = dstStride[0] * srcSliceY;
    const int cb_off     = dstStride[1] * srcSliceY / 2;
    const int cr_off     = dstStride[2] * srcSliceY / 2;
    const int packed_off = srcStride[0] * srcSliceY;

    /* The luma stride is handed over both as the width and as lumStride. */
    ff_bfin_uyvytoyv12(src[0] + packed_off,
                       dst[0] + luma_off, dst[1] + cb_off, dst[2] + cr_off,
                       dstStride[0], srcSliceH,
                       dstStride[0], dstStride[1], srcStride[0]);

    /* Report the whole slice as processed. */
    return srcSliceH;
 }

 static int yuyvtoyv12_unscaled(SwsContext *c, const uint8_t *src[],
                                int srcStride[], int srcSliceY, int srcSliceH,
                                uint8_t *dst[], int dstStride[])
 {
    /* Byte offsets of the slice's first row within each plane. */
    const int luma_off   = dstStride[0] * srcSliceY;
    const int cb_off     = dstStride[1] * srcSliceY / 2;
    const int cr_off     = dstStride[2] * srcSliceY / 2;
    const int packed_off = srcStride[0] * srcSliceY;

    /* The luma stride is handed over both as the width and as lumStride. */
    ff_bfin_yuyvtoyv12(src[0] + packed_off,
                       dst[0] + luma_off, dst[1] + cb_off, dst[2] + cr_off,
                       dstStride[0], srcSliceH,
                       dstStride[0], dstStride[1], srcStride[0]);

    /* Report the whole slice as processed. */
    return srcSliceH;
 }

 /*
  * Install a Blackfin converter in c->swscale when the destination is
  * YUV420P and the source is UYVY422 or YUYV422; otherwise leave c untouched.
  */
 av_cold void ff_get_unscaled_swscale_bfin(SwsContext *c)
 {
    if (c->dstFormat != AV_PIX_FMT_YUV420P)
        return;

    switch (c->srcFormat) {
    case AV_PIX_FMT_UYVY422:
        av_log(NULL, AV_LOG_VERBOSE,
               "selecting Blackfin optimized uyvytoyv12_unscaled\n");
        c->swscale = uyvytoyv12_unscaled;
        break;
    case AV_PIX_FMT_YUYV422:
        av_log(NULL, AV_LOG_VERBOSE,
               "selecting Blackfin optimized yuyvtoyv12_unscaled\n");
        c->swscale = yuyvtoyv12_unscaled;
        break;
    default:
        break;
    }
 }
--- a/libswscale/bfin/yuv2rgb_bfin.c
+++ b/libswscale/bfin/yuv2rgb_bfin.c
@@ -1,198 +0,0 @@
 /*
 * Copyright (C) 2007 Marc Hoffman <marc.hoffman@analog.com>
 *
 * Blackfin video color space converter operations
 * convert I420 YV12 to RGB in various formats
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

 #include "libavutil/pixdesc.h"
 #include <stdint.h>

 #include "config.h"
 #include "libavutil/attributes.h"
 #include "libavutil/bfin/attributes.h"
 #include "libswscale/swscale_internal.h"

 /*
  * Per-line converters implemented outside this file. Each takes the Y, U
  * and V source pointers, the output pointer, a width and a pointer to a
  * table of 32-bit coefficients (core_yuv420_rgb passes &c->oy).
  */
 void ff_bfin_yuv2rgb555_line(const uint8_t *Y, const uint8_t *U,
                              const uint8_t *V, uint8_t *out,
                              int w, uint32_t *coeffs) attribute_l1_text;

 void ff_bfin_yuv2rgb565_line(const uint8_t *Y, const uint8_t *U,
                              const uint8_t *V, uint8_t *out,
                              int w, uint32_t *coeffs) attribute_l1_text;

 void ff_bfin_yuv2rgb24_line(const uint8_t *Y, const uint8_t *U,
                             const uint8_t *V, uint8_t *out,
                             int w, uint32_t *coeffs) attribute_l1_text;

 /* Common signature of the line converters above. */
 typedef void (*ltransform)(const uint8_t *Y, const uint8_t *U, const uint8_t *V,
                            uint8_t *out, int w, uint32_t *coeffs);

 /*
  * Fill the Blackfin coefficient fields of the context (oy, oc, zero, cy,
  * crv, cbu, cgu, cgv and, for 555/565 output, the three channel masks).
  *
  * rgb   - non-zero: crv/cbu/cgu/cgv come from vr/ub/ug/vg coefficients;
  *         zero: the U and V derived coefficients are exchanged
  * masks - 555 or 565 selects the channel masks; other values leave the
  *         mask fields untouched
  */
 static void bfin_prepare_coefficients(SwsContext *c, int rgb, int masks)
 {
    /* replicates a 16-bit mask into both halves of a 32-bit word */
    const uint32_t both_halves = 0x00010001U;
    /* keep everything U8.0 for offset calculation */
    const unsigned luma_bias = (c->yOffset & 0xffff) >> 3;

    c->oc = 0x80808080U;              /* 128 in every byte */
    c->oy = luma_bias * 0x01010101U;

    /* copy 64bit vector coeffs down to 32bit vector coeffs */
    c->cy   = c->yCoeff;
    c->zero = 0;

    c->crv = rgb ? c->vrCoeff : c->ubCoeff;
    c->cbu = rgb ? c->ubCoeff : c->vrCoeff;
    c->cgu = rgb ? c->ugCoeff : c->vgCoeff;
    c->cgv = rgb ? c->vgCoeff : c->ugCoeff;

    switch (masks) {
    case 555:
        c->rmask = 0x001f * both_halves;
        c->gmask = 0x03e0 * both_halves;
        c->bmask = 0x7c00 * both_halves;
        break;
    case 565:
        c->rmask = 0x001f * both_halves;
        c->gmask = 0x07e0 * both_halves;
        c->bmask = 0xf800 * both_halves;
        break;
    default:
        break;
    }
 }

 /*
  * Shared driver for the YUV420 converters: prepares the coefficients, then
  * runs the line converter lcscf over srcSliceH / 2 pairs of rows. Both rows
  * of a pair use the same chroma row. Returns srcSliceH.
  *
  * rgb selects which of in[1] / in[2] is passed as U and which as V.
  * The luma stride instrides[0] is passed to lcscf as the width.
  */
 static int core_yuv420_rgb(SwsContext *c, const uint8_t **in, int *instrides,
                            int srcSliceY, int srcSliceH, uint8_t **oplanes,
                            int *outstrides, ltransform lcscf,
                            int rgb, int masks)
 {
    const int width     = instrides[0];
    const int row_pairs = srcSliceH >> 1;
    const uint8_t *luma, *chroma_u, *chroma_v;
    uint8_t *out;
    int remaining;

    bfin_prepare_coefficients(c, rgb, masks);

    luma     = in[0];
    chroma_u = in[1 + (1 ^ rgb)];
    chroma_v = in[1 + rgb];
    out      = oplanes[0] + srcSliceY * outstrides[0];

    for (remaining = row_pairs; remaining > 0; remaining--) {
        /* first row of the pair */
        lcscf(luma, chroma_u, chroma_v, out, width, &c->oy);
        luma += instrides[0];
        out  += outstrides[0];

        /* second row of the pair, same chroma row */
        lcscf(luma, chroma_u, chroma_v, out, width, &c->oy);
        luma     += instrides[0];
        chroma_u += instrides[1];
        chroma_v += instrides[2];
        out      += outstrides[0];
    }

    return srcSliceH;
 }

 /* YUV420 -> RGB555: 555 line converter, rgb = 1, 555 masks. */
 static int bfin_yuv420_rgb555(SwsContext *c, const uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
    const ltransform line_fn = ff_bfin_yuv2rgb555_line;

    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH,
                           oplanes, outstrides, line_fn, 1, 555);
 }

 /* YUV420 -> BGR555: 555 line converter, rgb = 0, 555 masks. */
 static int bfin_yuv420_bgr555(SwsContext *c, const uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
    const ltransform line_fn = ff_bfin_yuv2rgb555_line;

    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH,
                           oplanes, outstrides, line_fn, 0, 555);
 }

 /* YUV420 -> RGB24: 24-bit line converter, rgb = 1, masks value 888. */
 static int bfin_yuv420_rgb24(SwsContext *c, const uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
    const ltransform line_fn = ff_bfin_yuv2rgb24_line;

    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH,
                           oplanes, outstrides, line_fn, 1, 888);
 }

 /* YUV420 -> BGR24: 24-bit line converter, rgb = 0, masks value 888. */
 static int bfin_yuv420_bgr24(SwsContext *c, const uint8_t **in, int *instrides,
                              int srcSliceY, int srcSliceH,
                              uint8_t **oplanes, int *outstrides)
 {
    const ltransform line_fn = ff_bfin_yuv2rgb24_line;

    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH,
                           oplanes, outstrides, line_fn, 0, 888);
 }

 /* YUV420 -> RGB565: 565 line converter, rgb = 1, 565 masks. */
 static int bfin_yuv420_rgb565(SwsContext *c, const uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
    const ltransform line_fn = ff_bfin_yuv2rgb565_line;

    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH,
                           oplanes, outstrides, line_fn, 1, 565);
 }

 /* YUV420 -> BGR565: 565 line converter, rgb = 0, 565 masks. */
 static int bfin_yuv420_bgr565(SwsContext *c, const uint8_t **in, int *instrides,
                               int srcSliceY, int srcSliceH,
                               uint8_t **oplanes, int *outstrides)
 {
    const ltransform line_fn = ff_bfin_yuv2rgb565_line;

    return core_yuv420_rgb(c, in, instrides, srcSliceY, srcSliceH,
                           oplanes, outstrides, line_fn, 0, 565);
 }

 /*
  * Pick the Blackfin converter matching c->dstFormat.
  * Returns 0 without logging when the destination format has no Blackfin
  * converter; otherwise logs the selection and returns the converter.
  */
 av_cold SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c)
 {
    SwsFunc converter = 0;

    switch (c->dstFormat) {
    case AV_PIX_FMT_RGB555: converter = bfin_yuv420_rgb555; break;
    case AV_PIX_FMT_BGR555: converter = bfin_yuv420_bgr555; break;
    case AV_PIX_FMT_RGB565: converter = bfin_yuv420_rgb565; break;
    case AV_PIX_FMT_BGR565: converter = bfin_yuv420_bgr565; break;
    case AV_PIX_FMT_RGB24:  converter = bfin_yuv420_rgb24;  break;
    case AV_PIX_FMT_BGR24:  converter = bfin_yuv420_bgr24;  break;
    default:
        break;
    }

    if (!converter)
        return 0;

    av_log(c, AV_LOG_INFO, "BlackFin accelerated color space converter %s\n",
           av_get_pix_fmt_name(c->dstFormat));

    return converter;
 }
--- a/libswscale/swscale.h
+++ b/libswscale/swscale.h
@@ -92,7 +92,9 @@ const char *swscale_license(void);
 #define SWS_CPU_CAPS_MMX2     0x20000000
 #define SWS_CPU_CAPS_3DNOW    0x40000000
 #define SWS_CPU_CAPS_ALTIVEC  0x10000000
 #if FF_API_ARCH_BFIN
 #define SWS_CPU_CAPS_BFIN     0x01000000
 #endif
 #define SWS_CPU_CAPS_SSE2     0x02000000
 #endif

--- a/libswscale/swscale_internal.h
+++ b/libswscale/swscale_internal.h
@@ -473,20 +473,6 @@ typedef struct SwsContext {
    vector signed short  *vYCoeffsBank, *vCCoeffsBank;
 #endif

 #if ARCH_BFIN
    DECLARE_ALIGNED(4, uint32_t, oy);
    DECLARE_ALIGNED(4, uint32_t, oc);
    DECLARE_ALIGNED(4, uint32_t, zero);
    DECLARE_ALIGNED(4, uint32_t, cy);
    DECLARE_ALIGNED(4, uint32_t, crv);
    DECLARE_ALIGNED(4, uint32_t, rmask);
    DECLARE_ALIGNED(4, uint32_t, cbu);
    DECLARE_ALIGNED(4, uint32_t, bmask);
    DECLARE_ALIGNED(4, uint32_t, cgu);
    DECLARE_ALIGNED(4, uint32_t, cgv);
    DECLARE_ALIGNED(4, uint32_t, gmask);
 #endif

    int use_mmx_vfilter;

 /* pre defined color-spaces gamma */
@@ -622,7 +608,6 @@ av_cold void ff_sws_init_range_convert(SwsContext *c);

 SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
 SwsFunc ff_yuv2rgb_init_bfin(SwsContext *c);

 #if FF_API_SWS_FORMAT_NAME
 /**
@@ -851,7 +836,6 @@ extern const AVClass sws_context_class;
 * source and destination formats, bit depths, flags, etc.
 */
 void ff_get_unscaled_swscale(SwsContext *c);
 void ff_get_unscaled_swscale_bfin(SwsContext *c);
 void ff_get_unscaled_swscale_ppc(SwsContext *c);
 void ff_get_unscaled_swscale_arm(SwsContext *c);

--- a/libswscale/swscale_unscaled.c
+++ b/libswscale/swscale_unscaled.c
@@ -1749,8 +1749,6 @@ void ff_get_unscaled_swscale(SwsContext *c)
            c->swscale = planarCopyWrapper;
    }

    if (ARCH_BFIN)
        ff_get_unscaled_swscale_bfin(c);
    if (ARCH_PPC)
        ff_get_unscaled_swscale_ppc(c);
 //     if (ARCH_ARM)
--- a/libswscale/version.h
+++ b/libswscale/version.h
@@ -55,5 +55,8 @@
 #ifndef FF_API_SWS_FORMAT_NAME
 #define FF_API_SWS_FORMAT_NAME  (LIBSWSCALE_VERSION_MAJOR < 3)
 #endif
 #ifndef FF_API_ARCH_BFIN
 #define FF_API_ARCH_BFIN       (LIBSWSCALE_VERSION_MAJOR < 3)
 #endif

 #endif /* SWSCALE_VERSION_H */
--- a/libswscale/yuv2rgb.c
+++ b/libswscale/yuv2rgb.c
@@ -601,8 +601,6 @@ SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
 {
    SwsFunc t = NULL;

    if (ARCH_BFIN)
        t = ff_yuv2rgb_init_bfin(c);
    if (ARCH_PPC)
        t = ff_yuv2rgb_init_ppc(c);
    if (ARCH_X86)