* commit 'c166148409fe8f0dbccef2fe684286a40ba1e37d': dsputil: Move pix_sum, pix_norm1, shrink function pointers to mpegvideoenc Conflicts: libavcodec/dsputil.c libavcodec/mpegvideo_enc.c libavcodec/x86/dsputilenc.asm libavcodec/x86/dsputilenc_mmx.c Merged-by: Michael Niedermayer <michaelni@gmx.at>tags/n2.3
@@ -22,6 +22,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ | |||
OBJS-$(CONFIG_LLAUDDSP) += arm/lossless_audiodsp_init_arm.o | |||
OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o | |||
OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o | |||
OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o | |||
OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o | |||
OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o | |||
OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o | |||
@@ -61,6 +62,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ | |||
arm/idctdsp_armv6.o \ | |||
arm/simple_idct_armv6.o | |||
ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o | |||
ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o | |||
ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o | |||
ARMV6-OBJS-$(CONFIG_VC1_DECODER) += arm/startcode_armv6.o | |||
@@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1 | |||
pop {r4-r9, pc} | |||
endfunc | |||
@ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
@
@ Adds up the squares of 16 bytes per row over 16 rows.
@ In:  r0 = pix, r1 = line_size (added to r0 after every row)
@ Out: r0 = accumulated sum of squares
@ NOTE(review): ldm presumably needs pix to be word aligned -- confirm callers.
function ff_pix_norm1_armv6, export=1 | |||
push {r4-r6, lr} | |||
@ r12 = rows left to process, lr = running sum.
mov r12, #16 | |||
mov lr, #0 | |||
1: | |||
@ Fetch one 16-byte row into r2-r5.
ldm r0, {r2-r5} | |||
@ Per word: uxtb16 widens bytes 0 and 2 to halfwords, the ror #8 form
@ widens bytes 1 and 3; smlad then adds both halfword squares to lr.
@ The widening of the next word is interleaved with the multiplies.
uxtb16 r6, r2 | |||
uxtb16 r2, r2, ror #8 | |||
smlad lr, r6, r6, lr | |||
uxtb16 r6, r3 | |||
smlad lr, r2, r2, lr | |||
uxtb16 r3, r3, ror #8 | |||
smlad lr, r6, r6, lr | |||
uxtb16 r6, r4 | |||
smlad lr, r3, r3, lr | |||
uxtb16 r4, r4, ror #8 | |||
smlad lr, r6, r6, lr | |||
uxtb16 r6, r5 | |||
smlad lr, r4, r4, lr | |||
uxtb16 r5, r5, ror #8 | |||
smlad lr, r6, r6, lr | |||
@ Count the row and step to the next one; the flags set by subs are
@ consumed by bgt below (add and smlad leave them untouched).
subs r12, r12, #1 | |||
add r0, r0, r1 | |||
smlad lr, r5, r5, lr | |||
bgt 1b | |||
@ Return the sum in r0.
mov r0, lr | |||
pop {r4-r6, pc} | |||
endfunc | |||
@ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
@
@ Adds up 16 bytes per row over 16 rows.
@ In:  r0 = pix, r1 = line_size
@ Out: r0 = sum of the bytes
function ff_pix_sum_armv6, export=1 | |||
push {r4-r7, lr} | |||
@ r12 = rows left, r2/r3 = two partial sums, lr = constant zero.
mov r12, #16 | |||
mov r2, #0 | |||
mov r3, #0 | |||
mov lr, #0 | |||
@ Preload the first word of the first row.
ldr r4, [r0] | |||
1: | |||
@ usada8 adds the absolute byte differences of its two sources to the
@ accumulator; with lr == 0 that is the sum of the four bytes of a word.
@ Loads are interleaved with the accumulation of the previous word.
subs r12, r12, #1 | |||
ldr r5, [r0, #4] | |||
usada8 r2, r4, lr, r2 | |||
ldr r6, [r0, #8] | |||
usada8 r3, r5, lr, r3 | |||
ldr r7, [r0, #12] | |||
usada8 r2, r6, lr, r2 | |||
@ Last row (counter reached zero): skip the load of a following row.
beq 2f | |||
@ ldr_pre is a macro from asm.S; presumably it loads r4 from r0 + r1 and
@ writes the new address back to r0 -- confirm against its definition.
ldr_pre r4, r0, r1 | |||
usada8 r3, r7, lr, r3 | |||
bgt 1b | |||
2: | |||
@ Account for the final word of the last row, then combine both sums.
usada8 r3, r7, lr, r3 | |||
add r0, r2, r3 | |||
pop {r4-r7, pc} | |||
endfunc |
@@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, | |||
int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, | |||
int line_size, int h); | |||
int ff_pix_norm1_armv6(uint8_t *pix, int line_size); | |||
int ff_pix_sum_armv6(uint8_t *pix, int line_size); | |||
av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, | |||
unsigned high_bit_depth) | |||
{ | |||
@@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, | |||
c->sad[1] = ff_pix_abs8_armv6; | |||
c->sse[0] = ff_sse16_armv6; | |||
c->pix_norm1 = ff_pix_norm1_armv6; | |||
c->pix_sum = ff_pix_sum_armv6; | |||
} |
@@ -0,0 +1,76 @@ | |||
/* | |||
* Copyright (c) 2009 Mans Rullgard <mans@mansr.com> | |||
* | |||
* This file is part of FFmpeg. | |||
* | |||
* FFmpeg is free software; you can redistribute it and/or | |||
* modify it under the terms of the GNU Lesser General Public | |||
* License as published by the Free Software Foundation; either | |||
* version 2.1 of the License, or (at your option) any later version. | |||
* | |||
* FFmpeg is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
* Lesser General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU Lesser General Public | |||
* License along with FFmpeg; if not, write to the Free Software | |||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
*/ | |||
#include "libavutil/arm/asm.S" | |||
@ int ff_pix_norm1_armv6(uint8_t *pix, int line_size)
@
@ Adds up the squares of 16 bytes per row over 16 rows.
@ In:  r0 = pix, r1 = line_size (added to r0 after every row)
@ Out: r0 = accumulated sum of squares
@ NOTE(review): ldm presumably needs pix to be word aligned -- confirm callers.
function ff_pix_norm1_armv6, export=1 | |||
push {r4-r6, lr} | |||
@ r12 = rows left to process, lr = running sum.
mov r12, #16 | |||
mov lr, #0 | |||
1: | |||
@ Fetch one 16-byte row into r2-r5.
ldm r0, {r2-r5} | |||
@ Per word: uxtb16 widens bytes 0 and 2 to halfwords, the ror #8 form
@ widens bytes 1 and 3; smlad then adds both halfword squares to lr.
@ The widening of the next word is interleaved with the multiplies.
uxtb16 r6, r2 | |||
uxtb16 r2, r2, ror #8 | |||
smlad lr, r6, r6, lr | |||
uxtb16 r6, r3 | |||
smlad lr, r2, r2, lr | |||
uxtb16 r3, r3, ror #8 | |||
smlad lr, r6, r6, lr | |||
uxtb16 r6, r4 | |||
smlad lr, r3, r3, lr | |||
uxtb16 r4, r4, ror #8 | |||
smlad lr, r6, r6, lr | |||
uxtb16 r6, r5 | |||
smlad lr, r4, r4, lr | |||
uxtb16 r5, r5, ror #8 | |||
smlad lr, r6, r6, lr | |||
@ Count the row and step to the next one; the flags set by subs are
@ consumed by bgt below (add and smlad leave them untouched).
subs r12, r12, #1 | |||
add r0, r0, r1 | |||
smlad lr, r5, r5, lr | |||
bgt 1b | |||
@ Return the sum in r0.
mov r0, lr | |||
pop {r4-r6, pc} | |||
endfunc | |||
@ int ff_pix_sum_armv6(uint8_t *pix, int line_size)
@
@ Adds up 16 bytes per row over 16 rows.
@ In:  r0 = pix, r1 = line_size
@ Out: r0 = sum of the bytes
function ff_pix_sum_armv6, export=1 | |||
push {r4-r7, lr} | |||
@ r12 = rows left, r2/r3 = two partial sums, lr = constant zero.
mov r12, #16 | |||
mov r2, #0 | |||
mov r3, #0 | |||
mov lr, #0 | |||
@ Preload the first word of the first row.
ldr r4, [r0] | |||
1: | |||
@ usada8 adds the absolute byte differences of its two sources to the
@ accumulator; with lr == 0 that is the sum of the four bytes of a word.
@ Loads are interleaved with the accumulation of the previous word.
subs r12, r12, #1 | |||
ldr r5, [r0, #4] | |||
usada8 r2, r4, lr, r2 | |||
ldr r6, [r0, #8] | |||
usada8 r3, r5, lr, r3 | |||
ldr r7, [r0, #12] | |||
usada8 r2, r6, lr, r2 | |||
@ Last row (counter reached zero): skip the load of a following row.
beq 2f | |||
@ ldr_pre is a macro from asm.S; presumably it loads r4 from r0 + r1 and
@ writes the new address back to r0 -- confirm against its definition.
ldr_pre r4, r0, r1 | |||
usada8 r3, r7, lr, r3 | |||
bgt 1b | |||
2: | |||
@ Account for the final word of the last row, then combine both sums.
usada8 r3, r7, lr, r3 | |||
add r0, r2, r3 | |||
pop {r4-r7, pc} | |||
endfunc |
@@ -0,0 +1,38 @@ | |||
/* | |||
* This file is part of FFmpeg. | |||
* | |||
* FFmpeg is free software; you can redistribute it and/or | |||
* modify it under the terms of the GNU Lesser General Public | |||
* License as published by the Free Software Foundation; either | |||
* version 2.1 of the License, or (at your option) any later version. | |||
* | |||
* FFmpeg is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
* Lesser General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU Lesser General Public | |||
* License along with FFmpeg; if not, write to the Free Software | |||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
*/ | |||
#include <stdint.h> | |||
#include "libavutil/cpu.h" | |||
#include "libavutil/arm/cpu.h" | |||
#include "libavcodec/avcodec.h" | |||
#include "libavcodec/mpegvideoencdsp.h" | |||
int ff_pix_norm1_armv6(uint8_t *pix, int line_size); | |||
int ff_pix_sum_armv6(uint8_t *pix, int line_size); | |||
av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx) | |||
{ | |||
int cpu_flags = av_get_cpu_flags(); | |||
if (have_armv6(cpu_flags)) { | |||
c->pix_norm1 = ff_pix_norm1_armv6; | |||
c->pix_sum = ff_pix_sum_armv6; | |||
} | |||
} |
@@ -323,6 +323,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx) | |||
ff_blockdsp_init(&ctx->bdsp, avctx); | |||
ff_idctdsp_init(&ctx->m.idsp, avctx); | |||
ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx); | |||
ff_dct_common_init(&ctx->m); | |||
ff_dct_encode_init(&ctx->m); | |||
@@ -733,8 +734,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, | |||
int varc; | |||
if (!partial_last_row && mb_x * 16 <= avctx->width - 16) { | |||
sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); | |||
varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize); | |||
sum = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize); | |||
varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize); | |||
} else { | |||
int bw = FFMIN(avctx->width - 16 * mb_x, 16); | |||
int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16); | |||
@@ -26,7 +26,6 @@ | |||
*/ | |||
#include "libavutil/attributes.h" | |||
#include "libavutil/imgutils.h" | |||
#include "libavutil/internal.h" | |||
#include "avcodec.h" | |||
#include "copy_block.h" | |||
@@ -34,8 +33,6 @@ | |||
#include "dsputil.h" | |||
#include "simple_idct.h" | |||
#include "faandct.h" | |||
#include "imgconvert.h" | |||
#include "mathops.h" | |||
#include "mpegvideo.h" | |||
#include "config.h" | |||
@@ -48,74 +45,6 @@ uint32_t ff_square_tab[512] = { 0, }; | |||
#define BIT_DEPTH 8 | |||
#include "dsputilenc_template.c" | |||
/**
 * Sum the pixel values of a 16x16 block.
 *
 * @param pix       first pixel of the block
 * @param line_size offset in bytes from one row to the next
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    const uint8_t *row = pix;
    int total = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            total += row[x];
        row += line_size;
    }

    return total;
}
/**
 * Sum the squared pixel values of a 16x16 block.
 *
 * The pixels are read one byte at a time and squared directly. Compared with
 * the former word-wide loads through casted pointers this is well defined
 * for any alignment of pix, and it does not depend on the ff_square_tab
 * lookup table having been filled in beforehand.
 *
 * @param pix       first pixel of the block
 * @param line_size offset in bytes from one row to the next
 * @return sum of the squares of the 256 pixel values
 */
static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        /* Each product is at most 255 * 255, so the total fits in an int. */
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }

    return s;
}
static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |||
int line_size, int h) | |||
{ | |||
@@ -1094,9 +1023,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) | |||
c->sum_abs_dctelem = sum_abs_dctelem_c; | |||
c->pix_sum = pix_sum_c; | |||
c->pix_norm1 = pix_norm1_c; | |||
/* TODO [0] 16 [1] 8 */ | |||
c->pix_abs[0][0] = pix_abs16_c; | |||
c->pix_abs[0][1] = pix_abs16_x2_c; | |||
@@ -1141,11 +1067,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) | |||
ff_dsputil_init_dwt(c); | |||
#endif | |||
c->shrink[0] = av_image_copy_plane; | |||
c->shrink[1] = ff_shrink22; | |||
c->shrink[2] = ff_shrink44; | |||
c->shrink[3] = ff_shrink88; | |||
c->draw_edges = draw_edges_8_c; | |||
switch (avctx->bits_per_raw_sample) { | |||
@@ -72,9 +72,6 @@ typedef struct DSPContext { | |||
int stride); | |||
int (*sum_abs_dctelem)(int16_t *block /* align 16 */); | |||
int (*pix_sum)(uint8_t *pix, int line_size); | |||
int (*pix_norm1)(uint8_t *pix, int line_size); | |||
me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ | |||
me_cmp_func sse[6]; | |||
me_cmp_func hadamard8_diff[6]; | |||
@@ -108,9 +105,6 @@ typedef struct DSPContext { | |||
#define EDGE_WIDTH 16 | |||
#define EDGE_TOP 1 | |||
#define EDGE_BOTTOM 2 | |||
void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, | |||
int src_wrap, int width, int height); | |||
} DSPContext; | |||
void ff_dsputil_static_init(void); | |||
@@ -903,8 +903,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, | |||
/* intra / predictive decision */ | |||
pix = c->src[0][0]; | |||
sum = s->dsp.pix_sum(pix, s->linesize); | |||
varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500; | |||
sum = s->mpvencdsp.pix_sum(pix, s->linesize); | |||
varc = s->mpvencdsp.pix_norm1(pix, s->linesize) - | |||
(((unsigned) sum * sum) >> 8) + 500; | |||
pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; | |||
pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8; | |||
@@ -1010,7 +1010,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, | |||
int offset = x + y * stride; | |||
int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride, | |||
16); | |||
int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8; | |||
int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8; | |||
int sae = get_sae(src + offset, mean, stride); | |||
acc += sae + 500 < sad; | |||
@@ -1278,15 +1278,21 @@ static int estimate_best_b_count(MpegEncContext *s) | |||
data[2] += INPLACE_OFFSET; | |||
} | |||
s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0], | |||
data[0], pre_input.f->linesize[0], | |||
c->width, c->height); | |||
s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1], | |||
data[1], pre_input.f->linesize[1], | |||
c->width >> 1, c->height >> 1); | |||
s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2], | |||
data[2], pre_input.f->linesize[2], | |||
c->width >> 1, c->height >> 1); | |||
s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0], | |||
s->tmp_frames[i]->linesize[0], | |||
data[0], | |||
pre_input.f->linesize[0], | |||
c->width, c->height); | |||
s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1], | |||
s->tmp_frames[i]->linesize[1], | |||
data[1], | |||
pre_input.f->linesize[1], | |||
c->width >> 1, c->height >> 1); | |||
s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2], | |||
s->tmp_frames[i]->linesize[2], | |||
data[2], | |||
pre_input.f->linesize[2], | |||
c->width >> 1, c->height >> 1); | |||
} | |||
} | |||
@@ -2585,9 +2591,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){ | |||
int yy = mb_y * 16; | |||
uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx; | |||
int varc; | |||
int sum = s->dsp.pix_sum(pix, s->linesize); | |||
int sum = s->mpvencdsp.pix_sum(pix, s->linesize); | |||
varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8; | |||
varc = (s->mpvencdsp.pix_norm1(pix, s->linesize) - | |||
(((unsigned) sum * sum) >> 8) + 500 + 128) >> 8; | |||
s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc; | |||
s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; | |||
@@ -22,7 +22,10 @@ | |||
#include "config.h" | |||
#include "libavutil/avassert.h" | |||
#include "libavutil/attributes.h" | |||
#include "libavutil/imgutils.h" | |||
#include "avcodec.h" | |||
#include "dsputil.h" | |||
#include "imgconvert.h" | |||
#include "mpegvideoencdsp.h" | |||
static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], | |||
@@ -54,12 +57,92 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale) | |||
(BASIS_SHIFT - RECON_SHIFT); | |||
} | |||
/**
 * Sum the pixel values of a 16x16 block.
 *
 * @param pix       first pixel of the block
 * @param line_size offset in bytes from one row to the next
 * @return sum of the 256 pixel values
 */
static int pix_sum_c(uint8_t *pix, int line_size)
{
    const uint8_t *row = pix;
    int total = 0;
    int x, y;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            total += row[x];
        row += line_size;
    }

    return total;
}
/**
 * Sum the squared pixel values of a 16x16 block.
 *
 * The pixels are read one byte at a time and squared directly. Compared with
 * the former word-wide loads through casted pointers this is well defined
 * for any alignment of pix, and it does not depend on the ff_square_tab
 * lookup table having been filled in beforehand.
 *
 * @param pix       first pixel of the block
 * @param line_size offset in bytes from one row to the next
 * @return sum of the squares of the 256 pixel values
 */
static int pix_norm1_c(uint8_t *pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        /* Each product is at most 255 * 255, so the total fits in an int. */
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }

    return s;
}
av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx) | |||
{ | |||
c->try_8x8basis = try_8x8basis_c; | |||
c->add_8x8basis = add_8x8basis_c; | |||
c->shrink[0] = av_image_copy_plane; | |||
c->shrink[1] = ff_shrink22; | |||
c->shrink[2] = ff_shrink44; | |||
c->shrink[3] = ff_shrink88; | |||
c->pix_sum = pix_sum_c; | |||
c->pix_norm1 = pix_norm1_c; | |||
if (ARCH_ARM) | |||
ff_mpegvideoencdsp_init_arm(c, avctx); | |||
if (ARCH_PPC) | |||
ff_mpegvideoencdsp_init_ppc(c, avctx); | |||
if (ARCH_X86) | |||
ff_mpegvideoencdsp_init_x86(c, avctx); | |||
} |
@@ -31,10 +31,19 @@ typedef struct MpegvideoEncDSPContext { | |||
int16_t basis[64], int scale); | |||
void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale); | |||
int (*pix_sum)(uint8_t *pix, int line_size); | |||
int (*pix_norm1)(uint8_t *pix, int line_size); | |||
void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, | |||
int src_wrap, int width, int height); | |||
} MpegvideoEncDSPContext; | |||
void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx); | |||
void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx); | |||
void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx); | |||
void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx); | |||
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o | |||
OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o | |||
OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ | |||
ppc/mpegvideodsp.o | |||
OBJS-$(CONFIG_MPEGVIDEOENC) += ppc/mpegvideoencdsp.o | |||
OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o | |||
OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o | |||
@@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |||
return s; | |||
} | |||
static int pix_norm1_altivec(uint8_t *pix, int line_size) | |||
{ | |||
int i, s = 0; | |||
const vector unsigned int zero = | |||
(const vector unsigned int) vec_splat_u32(0); | |||
vector unsigned char perm = vec_lvsl(0, pix); | |||
vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); | |||
vector signed int sum; | |||
for (i = 0; i < 16; i++) { | |||
/* Read the potentially unaligned pixels. */ | |||
vector unsigned char pixl = vec_ld(0, pix); | |||
vector unsigned char pixr = vec_ld(15, pix); | |||
vector unsigned char pixv = vec_perm(pixl, pixr, perm); | |||
/* Square the values, and add them to our sum. */ | |||
sv = vec_msum(pixv, pixv, sv); | |||
pix += line_size; | |||
} | |||
/* Sum up the four partial sums, and put the result into s. */ | |||
sum = vec_sums((vector signed int) sv, (vector signed int) zero); | |||
sum = vec_splat(sum, 3); | |||
vec_ste(sum, 0, &s); | |||
return s; | |||
} | |||
/* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. | |||
* It's the sad8_altivec code above w/ squaring added. */ | |||
static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |||
@@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |||
return s; | |||
} | |||
static int pix_sum_altivec(uint8_t *pix, int line_size) | |||
{ | |||
int i, s; | |||
const vector unsigned int zero = | |||
(const vector unsigned int) vec_splat_u32(0); | |||
vector unsigned char perm = vec_lvsl(0, pix); | |||
vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); | |||
vector signed int sumdiffs; | |||
for (i = 0; i < 16; i++) { | |||
/* Read the potentially unaligned 16 pixels into t1. */ | |||
vector unsigned char pixl = vec_ld(0, pix); | |||
vector unsigned char pixr = vec_ld(15, pix); | |||
vector unsigned char t1 = vec_perm(pixl, pixr, perm); | |||
/* Add each 4 pixel group together and put 4 results into sad. */ | |||
sad = vec_sum4s(t1, sad); | |||
pix += line_size; | |||
} | |||
/* Sum up the four partial sums, and put the result into s. */ | |||
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |||
sumdiffs = vec_splat(sumdiffs, 3); | |||
vec_ste(sumdiffs, 0, &s); | |||
return s; | |||
} | |||
static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, | |||
int line_size) | |||
{ | |||
@@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx, | |||
c->sse[0] = sse16_altivec; | |||
c->sse[1] = sse8_altivec; | |||
c->pix_norm1 = pix_norm1_altivec; | |||
c->pix_sum = pix_sum_altivec; | |||
c->diff_pixels = diff_pixels_altivec; | |||
if (!high_bit_depth) { | |||
@@ -0,0 +1,103 @@ | |||
/* | |||
* This file is part of FFmpeg. | |||
* | |||
* FFmpeg is free software; you can redistribute it and/or | |||
* modify it under the terms of the GNU Lesser General Public | |||
* License as published by the Free Software Foundation; either | |||
* version 2.1 of the License, or (at your option) any later version. | |||
* | |||
* FFmpeg is distributed in the hope that it will be useful, | |||
* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
* Lesser General Public License for more details. | |||
* | |||
* You should have received a copy of the GNU Lesser General Public | |||
* License along with FFmpeg; if not, write to the Free Software | |||
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
*/ | |||
#include "config.h" | |||
#include <stdint.h> | |||
#if HAVE_ALTIVEC_H | |||
#include <altivec.h> | |||
#endif | |||
#include "libavutil/attributes.h" | |||
#include "libavutil/cpu.h" | |||
#include "libavutil/ppc/cpu.h" | |||
#include "libavutil/ppc/types_altivec.h" | |||
#include "libavutil/ppc/util_altivec.h" | |||
#include "libavcodec/mpegvideoencdsp.h" | |||
#if HAVE_ALTIVEC | |||
static int pix_norm1_altivec(uint8_t *pix, int line_size) | |||
{ | |||
int i, s = 0; | |||
const vector unsigned int zero = | |||
(const vector unsigned int) vec_splat_u32(0); | |||
vector unsigned char perm = vec_lvsl(0, pix); | |||
vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); | |||
vector signed int sum; | |||
for (i = 0; i < 16; i++) { | |||
/* Read the potentially unaligned pixels. */ | |||
vector unsigned char pixl = vec_ld(0, pix); | |||
vector unsigned char pixr = vec_ld(15, pix); | |||
vector unsigned char pixv = vec_perm(pixl, pixr, perm); | |||
/* Square the values, and add them to our sum. */ | |||
sv = vec_msum(pixv, pixv, sv); | |||
pix += line_size; | |||
} | |||
/* Sum up the four partial sums, and put the result into s. */ | |||
sum = vec_sums((vector signed int) sv, (vector signed int) zero); | |||
sum = vec_splat(sum, 3); | |||
vec_ste(sum, 0, &s); | |||
return s; | |||
} | |||
static int pix_sum_altivec(uint8_t *pix, int line_size) | |||
{ | |||
int i, s; | |||
const vector unsigned int zero = | |||
(const vector unsigned int) vec_splat_u32(0); | |||
vector unsigned char perm = vec_lvsl(0, pix); | |||
vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); | |||
vector signed int sumdiffs; | |||
for (i = 0; i < 16; i++) { | |||
/* Read the potentially unaligned 16 pixels into t1. */ | |||
vector unsigned char pixl = vec_ld(0, pix); | |||
vector unsigned char pixr = vec_ld(15, pix); | |||
vector unsigned char t1 = vec_perm(pixl, pixr, perm); | |||
/* Add each 4 pixel group together and put 4 results into sad. */ | |||
sad = vec_sum4s(t1, sad); | |||
pix += line_size; | |||
} | |||
/* Sum up the four partial sums, and put the result into s. */ | |||
sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |||
sumdiffs = vec_splat(sumdiffs, 3); | |||
vec_ste(sumdiffs, 0, &s); | |||
return s; | |||
} | |||
#endif /* HAVE_ALTIVEC */ | |||
av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx) | |||
{ | |||
#if HAVE_ALTIVEC | |||
if (!PPC_ALTIVEC(av_get_cpu_flags())) | |||
return; | |||
c->pix_norm1 = pix_norm1_altivec; | |||
c->pix_sum = pix_sum_altivec; | |||
#endif /* HAVE_ALTIVEC */ | |||
} |
@@ -517,6 +517,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx) | |||
ff_dsputil_init(&s->dsp, avctx); | |||
ff_hpeldsp_init(&s->hdsp, avctx->flags); | |||
ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx); | |||
avctx->coded_frame = av_frame_alloc(); | |||
s->current_picture = av_frame_alloc(); | |||
@@ -109,6 +109,7 @@ YASM-OBJS-$(CONFIG_IDCTDSP) += x86/idctdsp.o | |||
YASM-OBJS-$(CONFIG_LLAUDDSP) += x86/lossless_audiodsp.o | |||
YASM-OBJS-$(CONFIG_LLVIDDSP) += x86/lossless_videodsp.o | |||
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o | |||
YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o | |||
YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ | |||
x86/fpel.o \ | |||
x86/qpel.o | |||
@@ -23,10 +23,6 @@ | |||
%include "libavutil/x86/x86util.asm" | |||
SECTION_RODATA | |||
cextern pw_1 | |||
SECTION .text | |||
%macro DIFF_PIXELS_1 4 | |||
@@ -465,113 +461,6 @@ cglobal diff_pixels, 4, 5, 5 | |||
jne .loop | |||
RET | |||
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) | |||
; %1 = number of xmm registers used | |||
; %2 = number of loops | |||
; %3 = number of GPRs used | |||
; %4 = rows consumed per loop iteration (scale applied to line_size when
;      advancing the pointer; ignored when mmsize == 8)
; Returns in eax the sum of the bytes visited: 16 bytes per row, with
; loops * rows-per-iteration = 16 rows for every instantiation below.
%macro PIX_SUM16 4 | |||
cglobal pix_sum16, 2, %3, %1 | |||
; r0 = pix, r1 = line_size (widened to pointer size if needed), r2 = counter.
movsxdifnidn r1, r1d | |||
mov r2, %2 | |||
%if cpuflag(xop) | |||
; r3 = 3 * line_size, used to address the fourth row of each group.
lea r3, [r1*3] | |||
%else | |||
; m5 = zero, the partner operand for widening bytes to words.
pxor m5, m5 | |||
%endif | |||
; m4 = running sum.
pxor m4, m4 | |||
.loop: | |||
%if cpuflag(xop) | |||
; XOP: vphaddubq horizontally adds the bytes of four consecutive rows.
vphaddubq m0, [r0] | |||
vphaddubq m1, [r0+r1] | |||
vphaddubq m2, [r0+r1*2] | |||
vphaddubq m3, [r0+r3] | |||
%else | |||
; Two loads: both 8-byte halves of one row when mmsize == 8, otherwise two
; consecutive 16-byte rows.
; NOTE(review): mova is presumably the aligned-move alias from x86inc, which
; would require suitably aligned rows -- confirm.
mova m0, [r0] | |||
%if mmsize == 8 | |||
mova m1, [r0+8] | |||
%else | |||
mova m1, [r0+r1] | |||
%endif | |||
; Widen the bytes to words against the zero register.
punpckhbw m2, m0, m5 | |||
punpcklbw m0, m5 | |||
punpckhbw m3, m1, m5 | |||
punpcklbw m1, m5 | |||
%endif ; cpuflag(xop) | |||
; Fold the four partial vectors into the accumulator.
paddw m1, m0 | |||
paddw m3, m2 | |||
paddw m3, m1 | |||
paddw m4, m3 | |||
; Advance to the next row or group of rows.
%if mmsize == 8 | |||
add r0, r1 | |||
%else | |||
lea r0, [r0+r1*%4] | |||
%endif | |||
dec r2 | |||
jne .loop | |||
; Reduce the accumulator to one value and return it in eax.
%if cpuflag(xop) | |||
pshufd m0, m4, q0032 | |||
paddd m4, m0 | |||
%else | |||
HADDW m4, m5 | |||
%endif | |||
movd eax, m4 | |||
RET | |||
%endmacro | |||
; MMX: 16 iterations of one row each.
INIT_MMX mmx | |||
PIX_SUM16 0, 16, 3, 0 | |||
; SSE2: 8 iterations of two rows each.
INIT_XMM sse2 | |||
PIX_SUM16 6, 8, 3, 2 | |||
; XOP: 4 iterations of four rows each.
%if HAVE_XOP_EXTERNAL | |||
INIT_XMM xop | |||
PIX_SUM16 5, 4, 4, 4 | |||
%endif | |||
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) | |||
; %1 = number of xmm registers used | |||
; %2 = number of loops | |||
; Returns in eax the sum of the squares of the bytes visited: 16 bytes per
; row, one row per iteration when mmsize == 8 and two rows otherwise.
%macro PIX_NORM1 2 | |||
cglobal pix_norm1, 2, 3, %1 | |||
; r0 = pix, r1 = line_size (widened to pointer size if needed), r2 = counter.
movsxdifnidn r1, r1d | |||
mov r2, %2 | |||
; m0 = zero for widening bytes to words, m5 = running sum.
pxor m0, m0 | |||
pxor m5, m5 | |||
.loop: | |||
; Two loads: both 8-byte halves of one row when mmsize == 8, otherwise two
; consecutive 16-byte rows.
; NOTE(review): mova is presumably the aligned-move alias from x86inc, which
; would require suitably aligned rows -- confirm.
mova m2, [r0+0] | |||
%if mmsize == 8 | |||
mova m3, [r0+8] | |||
%else | |||
mova m3, [r0+r1] | |||
%endif | |||
; Widen the bytes to words against the zero register.
punpckhbw m1, m2, m0 | |||
punpcklbw m2, m0 | |||
punpckhbw m4, m3, m0 | |||
punpcklbw m3, m0 | |||
; pmaddwd of a register with itself squares every word and adds the squares
; of adjacent words into dwords.
pmaddwd m1, m1 | |||
pmaddwd m2, m2 | |||
pmaddwd m3, m3 | |||
pmaddwd m4, m4 | |||
; Fold the four partial vectors into the accumulator.
paddd m2, m1 | |||
paddd m4, m3 | |||
paddd m5, m2 | |||
paddd m5, m4 | |||
; Advance by one row (mmsize == 8) or by two rows.
%if mmsize == 8 | |||
add r0, r1 | |||
%else | |||
lea r0, [r0+r1*2] | |||
%endif | |||
dec r2 | |||
jne .loop | |||
; Reduce the dword accumulator to one value and return it in eax.
HADDD m5, m1 | |||
movd eax, m5 | |||
RET | |||
%endmacro | |||
; MMX: 16 iterations of one row each.
INIT_MMX mmx | |||
PIX_NORM1 0, 16 | |||
; SSE2: 8 iterations of two rows each.
INIT_XMM sse2 | |||
PIX_NORM1 6, 8 | |||
;----------------------------------------------- | |||
;int ff_sum_abs_dctelem(int16_t *block) | |||
;----------------------------------------------- | |||
@@ -37,11 +37,6 @@ void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, | |||
int stride); | |||
void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2, | |||
int stride); | |||
int ff_pix_sum16_mmx(uint8_t *pix, int line_size); | |||
int ff_pix_sum16_sse2(uint8_t *pix, int line_size); | |||
int ff_pix_sum16_xop(uint8_t *pix, int line_size); | |||
int ff_pix_norm1_mmx(uint8_t *pix, int line_size); | |||
int ff_pix_norm1_sse2(uint8_t *pix, int line_size); | |||
int ff_sum_abs_dctelem_mmx(int16_t *block); | |||
int ff_sum_abs_dctelem_mmxext(int16_t *block); | |||
int ff_sum_abs_dctelem_sse2(int16_t *block); | |||
@@ -364,8 +359,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||
if (!high_bit_depth) | |||
c->get_pixels = ff_get_pixels_mmx; | |||
c->diff_pixels = ff_diff_pixels_mmx; | |||
c->pix_sum = ff_pix_sum16_mmx; | |||
c->pix_norm1 = ff_pix_norm1_mmx; | |||
} | |||
if (EXTERNAL_SSE2(cpu_flags)) | |||
@@ -431,8 +424,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||
c->sse[0] = ff_sse16_sse2; | |||
c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2; | |||
c->diff_pixels = ff_diff_pixels_sse2; | |||
c->pix_sum = ff_pix_sum16_sse2; | |||
c->pix_norm1 = ff_pix_norm1_sse2; | |||
#if HAVE_ALIGNED_STACK | |||
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; | |||
@@ -448,9 +439,5 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||
#endif | |||
} | |||
if (EXTERNAL_XOP(cpu_flags)) { | |||
c->pix_sum = ff_pix_sum16_xop; | |||
} | |||
ff_dsputil_init_pix_mmx(c, avctx); | |||
} |
@@ -0,0 +1,137 @@ | |||
;***************************************************************************** | |||
;* SIMD-optimized MPEG encoding functions | |||
;***************************************************************************** | |||
;* Copyright (c) 2000, 2001 Fabrice Bellard | |||
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||
;* | |||
;* This file is part of FFmpeg. | |||
;* | |||
;* FFmpeg is free software; you can redistribute it and/or | |||
;* modify it under the terms of the GNU Lesser General Public | |||
;* License as published by the Free Software Foundation; either | |||
;* version 2.1 of the License, or (at your option) any later version. | |||
;* | |||
;* FFmpeg is distributed in the hope that it will be useful, | |||
;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||
;* Lesser General Public License for more details. | |||
;* | |||
;* You should have received a copy of the GNU Lesser General Public | |||
;* License along with FFmpeg; if not, write to the Free Software | |||
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||
;***************************************************************************** | |||
%include "libavutil/x86/x86util.asm" | |||
SECTION_RODATA | |||
cextern pw_1 | |||
SECTION .text | |||
; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) | |||
; %1 = number of xmm registers used | |||
; %2 = number of loops | |||
; %3 = number of GPRs used | |||
; %4 = rows consumed per loop iteration (scale applied to line_size when
;      advancing the pointer; ignored when mmsize == 8)
; Returns in eax the sum of the bytes visited: 16 bytes per row, with
; loops * rows-per-iteration = 16 rows for every instantiation below.
%macro PIX_SUM16 4 | |||
cglobal pix_sum16, 2, %3, %1 | |||
; r0 = pix, r1 = line_size (widened to pointer size if needed), r2 = counter.
movsxdifnidn r1, r1d | |||
mov r2, %2 | |||
%if cpuflag(xop) | |||
; r3 = 3 * line_size, used to address the fourth row of each group.
lea r3, [r1*3] | |||
%else | |||
; m5 = zero, the partner operand for widening bytes to words.
pxor m5, m5 | |||
%endif | |||
; m4 = running sum.
pxor m4, m4 | |||
.loop: | |||
%if cpuflag(xop) | |||
; XOP: vphaddubq horizontally adds the bytes of four consecutive rows.
vphaddubq m0, [r0] | |||
vphaddubq m1, [r0+r1] | |||
vphaddubq m2, [r0+r1*2] | |||
vphaddubq m3, [r0+r3] | |||
%else | |||
; Two loads: both 8-byte halves of one row when mmsize == 8, otherwise two
; consecutive 16-byte rows.
; NOTE(review): mova is presumably the aligned-move alias from x86inc, which
; would require suitably aligned rows -- confirm.
mova m0, [r0] | |||
%if mmsize == 8 | |||
mova m1, [r0+8] | |||
%else | |||
mova m1, [r0+r1] | |||
%endif | |||
; Widen the bytes to words against the zero register.
punpckhbw m2, m0, m5 | |||
punpcklbw m0, m5 | |||
punpckhbw m3, m1, m5 | |||
punpcklbw m1, m5 | |||
%endif ; cpuflag(xop) | |||
; Fold the four partial vectors into the accumulator.
paddw m1, m0 | |||
paddw m3, m2 | |||
paddw m3, m1 | |||
paddw m4, m3 | |||
; Advance to the next row or group of rows.
%if mmsize == 8 | |||
add r0, r1 | |||
%else | |||
lea r0, [r0+r1*%4] | |||
%endif | |||
dec r2 | |||
jne .loop | |||
; Reduce the accumulator to one value and return it in eax.
%if cpuflag(xop) | |||
pshufd m0, m4, q0032 | |||
paddd m4, m0 | |||
%else | |||
HADDW m4, m5 | |||
%endif | |||
movd eax, m4 | |||
RET | |||
%endmacro | |||
; MMX: 16 iterations of one row each.
INIT_MMX mmx | |||
PIX_SUM16 0, 16, 3, 0 | |||
; SSE2: 8 iterations of two rows each.
INIT_XMM sse2 | |||
PIX_SUM16 6, 8, 3, 2 | |||
; XOP: 4 iterations of four rows each.
%if HAVE_XOP_EXTERNAL | |||
INIT_XMM xop | |||
PIX_SUM16 5, 4, 4, 4 | |||
%endif | |||
; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) | |||
; %1 = number of xmm registers used | |||
; %2 = number of loops | |||
; Returns in eax the sum of the squares of the bytes visited: 16 bytes per
; row, one row per iteration when mmsize == 8 and two rows otherwise.
%macro PIX_NORM1 2 | |||
cglobal pix_norm1, 2, 3, %1 | |||
; r0 = pix, r1 = line_size (widened to pointer size if needed), r2 = counter.
movsxdifnidn r1, r1d | |||
mov r2, %2 | |||
; m0 = zero for widening bytes to words, m5 = running sum.
pxor m0, m0 | |||
pxor m5, m5 | |||
.loop: | |||
; Two loads: both 8-byte halves of one row when mmsize == 8, otherwise two
; consecutive 16-byte rows.
; NOTE(review): mova is presumably the aligned-move alias from x86inc, which
; would require suitably aligned rows -- confirm.
mova m2, [r0+0] | |||
%if mmsize == 8 | |||
mova m3, [r0+8] | |||
%else | |||
mova m3, [r0+r1] | |||
%endif | |||
; Widen the bytes to words against the zero register.
punpckhbw m1, m2, m0 | |||
punpcklbw m2, m0 | |||
punpckhbw m4, m3, m0 | |||
punpcklbw m3, m0 | |||
; pmaddwd of a register with itself squares every word and adds the squares
; of adjacent words into dwords.
pmaddwd m1, m1 | |||
pmaddwd m2, m2 | |||
pmaddwd m3, m3 | |||
pmaddwd m4, m4 | |||
; Fold the four partial vectors into the accumulator.
paddd m2, m1 | |||
paddd m4, m3 | |||
paddd m5, m2 | |||
paddd m5, m4 | |||
; Advance by one row (mmsize == 8) or by two rows.
%if mmsize == 8 | |||
add r0, r1 | |||
%else | |||
lea r0, [r0+r1*2] | |||
%endif | |||
dec r2 | |||
jne .loop | |||
; Reduce the dword accumulator to one value and return it in eax.
HADDD m5, m1 | |||
movd eax, m5 | |||
RET | |||
%endmacro | |||
; MMX: 16 iterations of one row each.
INIT_MMX mmx | |||
PIX_NORM1 0, 16 | |||
; SSE2: 8 iterations of two rows each.
INIT_XMM sse2 | |||
PIX_NORM1 6, 8 | |||
@@ -22,6 +22,12 @@ | |||
#include "libavcodec/avcodec.h" | |||
#include "libavcodec/mpegvideoencdsp.h" | |||
int ff_pix_sum16_mmx(uint8_t *pix, int line_size); | |||
int ff_pix_sum16_sse2(uint8_t *pix, int line_size); | |||
int ff_pix_sum16_xop(uint8_t *pix, int line_size); | |||
int ff_pix_norm1_mmx(uint8_t *pix, int line_size); | |||
int ff_pix_norm1_sse2(uint8_t *pix, int line_size); | |||
#if HAVE_INLINE_ASM | |||
#define PHADDD(a, t) \ | |||
@@ -95,9 +101,24 @@ | |||
av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | |||
AVCodecContext *avctx) | |||
{ | |||
#if HAVE_INLINE_ASM | |||
int cpu_flags = av_get_cpu_flags(); | |||
if (EXTERNAL_MMX(cpu_flags)) { | |||
c->pix_sum = ff_pix_sum16_mmx; | |||
c->pix_norm1 = ff_pix_norm1_mmx; | |||
} | |||
if (EXTERNAL_SSE2(cpu_flags)) { | |||
c->pix_sum = ff_pix_sum16_sse2; | |||
c->pix_norm1 = ff_pix_norm1_sse2; | |||
} | |||
if (EXTERNAL_XOP(cpu_flags)) { | |||
c->pix_sum = ff_pix_sum16_xop; | |||
} | |||
#if HAVE_INLINE_ASM | |||
if (INLINE_MMX(cpu_flags)) { | |||
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | |||
c->try_8x8basis = try_8x8basis_mmx; | |||