| @@ -21,6 +21,7 @@ OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_arm.o \ | |||||
| arm/simple_idct_arm.o | arm/simple_idct_arm.o | ||||
| OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o | OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_init_arm.o | ||||
| OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o | OBJS-$(CONFIG_MPEGVIDEO) += arm/mpegvideo_arm.o | ||||
| OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_init_arm.o | |||||
| OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o | OBJS-$(CONFIG_NEON_CLOBBER_TEST) += arm/neontest.o | ||||
| OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o | OBJS-$(CONFIG_VIDEODSP) += arm/videodsp_init_arm.o | ||||
| OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o | OBJS-$(CONFIG_VP3DSP) += arm/vp3dsp_init_arm.o | ||||
| @@ -60,6 +61,7 @@ ARMV6-OBJS-$(CONFIG_IDCTDSP) += arm/idctdsp_init_armv6.o \ | |||||
| arm/idctdsp_armv6.o \ | arm/idctdsp_armv6.o \ | ||||
| arm/simple_idct_armv6.o | arm/simple_idct_armv6.o | ||||
| ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o | ARMV6-OBJS-$(CONFIG_MPEGAUDIODSP) += arm/mpegaudiodsp_fixed_armv6.o | ||||
| ARMV6-OBJS-$(CONFIG_MPEGVIDEOENC) += arm/mpegvideoencdsp_armv6.o | |||||
| ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o | ARMV6-OBJS-$(CONFIG_MLP_DECODER) += arm/mlpdsp_armv6.o | ||||
| ARMV6-OBJS-$(CONFIG_VP7_DECODER) += arm/vp8_armv6.o \ | ARMV6-OBJS-$(CONFIG_VP7_DECODER) += arm/vp8_armv6.o \ | ||||
| @@ -297,58 +297,3 @@ function ff_sse16_armv6, export=1 | |||||
| pop {r4-r9, pc} | pop {r4-r9, pc} | ||||
| endfunc | endfunc | ||||
| function ff_pix_norm1_armv6, export=1 | |||||
| push {r4-r6, lr} | |||||
| mov r12, #16 | |||||
| mov lr, #0 | |||||
| 1: | |||||
| ldm r0, {r2-r5} | |||||
| uxtb16 r6, r2 | |||||
| uxtb16 r2, r2, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| uxtb16 r6, r3 | |||||
| smlad lr, r2, r2, lr | |||||
| uxtb16 r3, r3, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| uxtb16 r6, r4 | |||||
| smlad lr, r3, r3, lr | |||||
| uxtb16 r4, r4, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| uxtb16 r6, r5 | |||||
| smlad lr, r4, r4, lr | |||||
| uxtb16 r5, r5, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| subs r12, r12, #1 | |||||
| add r0, r0, r1 | |||||
| smlad lr, r5, r5, lr | |||||
| bgt 1b | |||||
| mov r0, lr | |||||
| pop {r4-r6, pc} | |||||
| endfunc | |||||
| function ff_pix_sum_armv6, export=1 | |||||
| push {r4-r7, lr} | |||||
| mov r12, #16 | |||||
| mov r2, #0 | |||||
| mov r3, #0 | |||||
| mov lr, #0 | |||||
| ldr r4, [r0] | |||||
| 1: | |||||
| subs r12, r12, #1 | |||||
| ldr r5, [r0, #4] | |||||
| usada8 r2, r4, lr, r2 | |||||
| ldr r6, [r0, #8] | |||||
| usada8 r3, r5, lr, r3 | |||||
| ldr r7, [r0, #12] | |||||
| usada8 r2, r6, lr, r2 | |||||
| beq 2f | |||||
| ldr_pre r4, r0, r1 | |||||
| usada8 r3, r7, lr, r3 | |||||
| bgt 1b | |||||
| 2: | |||||
| usada8 r3, r7, lr, r3 | |||||
| add r0, r2, r3 | |||||
| pop {r4-r7, pc} | |||||
| endfunc | |||||
| @@ -43,9 +43,6 @@ int ff_pix_abs8_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, | |||||
| int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, | int ff_sse16_armv6(MpegEncContext *s, uint8_t *blk1, uint8_t *blk2, | ||||
| int line_size, int h); | int line_size, int h); | ||||
| int ff_pix_norm1_armv6(uint8_t *pix, int line_size); | |||||
| int ff_pix_sum_armv6(uint8_t *pix, int line_size); | |||||
| av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, | av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, | ||||
| unsigned high_bit_depth) | unsigned high_bit_depth) | ||||
| { | { | ||||
| @@ -63,7 +60,4 @@ av_cold void ff_dsputil_init_armv6(DSPContext *c, AVCodecContext *avctx, | |||||
| c->sad[1] = ff_pix_abs8_armv6; | c->sad[1] = ff_pix_abs8_armv6; | ||||
| c->sse[0] = ff_sse16_armv6; | c->sse[0] = ff_sse16_armv6; | ||||
| c->pix_norm1 = ff_pix_norm1_armv6; | |||||
| c->pix_sum = ff_pix_sum_armv6; | |||||
| } | } | ||||
| @@ -0,0 +1,76 @@ | |||||
| /* | |||||
| * Copyright (c) 2009 Mans Rullgard <mans@mansr.com> | |||||
| * | |||||
| * This file is part of Libav. | |||||
| * | |||||
| * Libav is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * Libav is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with Libav; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "libavutil/arm/asm.S" | |||||
| function ff_pix_norm1_armv6, export=1 | |||||
| push {r4-r6, lr} | |||||
| mov r12, #16 | |||||
| mov lr, #0 | |||||
| 1: | |||||
| ldm r0, {r2-r5} | |||||
| uxtb16 r6, r2 | |||||
| uxtb16 r2, r2, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| uxtb16 r6, r3 | |||||
| smlad lr, r2, r2, lr | |||||
| uxtb16 r3, r3, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| uxtb16 r6, r4 | |||||
| smlad lr, r3, r3, lr | |||||
| uxtb16 r4, r4, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| uxtb16 r6, r5 | |||||
| smlad lr, r4, r4, lr | |||||
| uxtb16 r5, r5, ror #8 | |||||
| smlad lr, r6, r6, lr | |||||
| subs r12, r12, #1 | |||||
| add r0, r0, r1 | |||||
| smlad lr, r5, r5, lr | |||||
| bgt 1b | |||||
| mov r0, lr | |||||
| pop {r4-r6, pc} | |||||
| endfunc | |||||
| function ff_pix_sum_armv6, export=1 | |||||
| push {r4-r7, lr} | |||||
| mov r12, #16 | |||||
| mov r2, #0 | |||||
| mov r3, #0 | |||||
| mov lr, #0 | |||||
| ldr r4, [r0] | |||||
| 1: | |||||
| subs r12, r12, #1 | |||||
| ldr r5, [r0, #4] | |||||
| usada8 r2, r4, lr, r2 | |||||
| ldr r6, [r0, #8] | |||||
| usada8 r3, r5, lr, r3 | |||||
| ldr r7, [r0, #12] | |||||
| usada8 r2, r6, lr, r2 | |||||
| beq 2f | |||||
| ldr_pre r4, r0, r1 | |||||
| usada8 r3, r7, lr, r3 | |||||
| bgt 1b | |||||
| 2: | |||||
| usada8 r3, r7, lr, r3 | |||||
| add r0, r2, r3 | |||||
| pop {r4-r7, pc} | |||||
| endfunc | |||||
| @@ -0,0 +1,38 @@ | |||||
| /* | |||||
| * This file is part of Libav. | |||||
| * | |||||
| * Libav is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * Libav is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with Libav; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include <stdint.h> | |||||
| #include "libavutil/cpu.h" | |||||
| #include "libavutil/arm/cpu.h" | |||||
| #include "libavcodec/avcodec.h" | |||||
| #include "libavcodec/mpegvideoencdsp.h" | |||||
| int ff_pix_norm1_armv6(uint8_t *pix, int line_size); | |||||
| int ff_pix_sum_armv6(uint8_t *pix, int line_size); | |||||
| av_cold void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, | |||||
| AVCodecContext *avctx) | |||||
| { | |||||
| int cpu_flags = av_get_cpu_flags(); | |||||
| if (have_armv6(cpu_flags)) { | |||||
| c->pix_norm1 = ff_pix_norm1_armv6; | |||||
| c->pix_sum = ff_pix_sum_armv6; | |||||
| } | |||||
| } | |||||
| @@ -309,6 +309,7 @@ static av_cold int dnxhd_encode_init(AVCodecContext *avctx) | |||||
| ff_blockdsp_init(&ctx->bdsp, avctx); | ff_blockdsp_init(&ctx->bdsp, avctx); | ||||
| ff_dsputil_init(&ctx->m.dsp, avctx); | ff_dsputil_init(&ctx->m.dsp, avctx); | ||||
| ff_idctdsp_init(&ctx->m.idsp, avctx); | ff_idctdsp_init(&ctx->m.idsp, avctx); | ||||
| ff_mpegvideoencdsp_init(&ctx->m.mpvencdsp, avctx); | |||||
| ff_dct_common_init(&ctx->m); | ff_dct_common_init(&ctx->m); | ||||
| if (!ctx->m.dct_quantize) | if (!ctx->m.dct_quantize) | ||||
| ctx->m.dct_quantize = ff_dct_quantize_c; | ctx->m.dct_quantize = ff_dct_quantize_c; | ||||
| @@ -719,8 +720,8 @@ static int dnxhd_mb_var_thread(AVCodecContext *avctx, void *arg, | |||||
| int varc; | int varc; | ||||
| if (!partial_last_row && mb_x * 16 <= avctx->width - 16) { | if (!partial_last_row && mb_x * 16 <= avctx->width - 16) { | ||||
| sum = ctx->m.dsp.pix_sum(pix, ctx->m.linesize); | |||||
| varc = ctx->m.dsp.pix_norm1(pix, ctx->m.linesize); | |||||
| sum = ctx->m.mpvencdsp.pix_sum(pix, ctx->m.linesize); | |||||
| varc = ctx->m.mpvencdsp.pix_norm1(pix, ctx->m.linesize); | |||||
| } else { | } else { | ||||
| int bw = FFMIN(avctx->width - 16 * mb_x, 16); | int bw = FFMIN(avctx->width - 16 * mb_x, 16); | ||||
| int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16); | int bh = FFMIN((avctx->height >> ctx->interlaced) - 16 * mb_y, 16); | ||||
| @@ -26,15 +26,12 @@ | |||||
| */ | */ | ||||
| #include "libavutil/attributes.h" | #include "libavutil/attributes.h" | ||||
| #include "libavutil/imgutils.h" | |||||
| #include "avcodec.h" | #include "avcodec.h" | ||||
| #include "copy_block.h" | #include "copy_block.h" | ||||
| #include "dct.h" | #include "dct.h" | ||||
| #include "dsputil.h" | #include "dsputil.h" | ||||
| #include "simple_idct.h" | #include "simple_idct.h" | ||||
| #include "faandct.h" | #include "faandct.h" | ||||
| #include "imgconvert.h" | |||||
| #include "mathops.h" | |||||
| #include "mpegvideo.h" | #include "mpegvideo.h" | ||||
| #include "config.h" | #include "config.h" | ||||
| @@ -47,74 +44,6 @@ uint32_t ff_square_tab[512] = { 0, }; | |||||
| #define BIT_DEPTH 8 | #define BIT_DEPTH 8 | ||||
| #include "dsputilenc_template.c" | #include "dsputilenc_template.c" | ||||
| static int pix_sum_c(uint8_t *pix, int line_size) | |||||
| { | |||||
| int s = 0, i, j; | |||||
| for (i = 0; i < 16; i++) { | |||||
| for (j = 0; j < 16; j += 8) { | |||||
| s += pix[0]; | |||||
| s += pix[1]; | |||||
| s += pix[2]; | |||||
| s += pix[3]; | |||||
| s += pix[4]; | |||||
| s += pix[5]; | |||||
| s += pix[6]; | |||||
| s += pix[7]; | |||||
| pix += 8; | |||||
| } | |||||
| pix += line_size - 16; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| static int pix_norm1_c(uint8_t *pix, int line_size) | |||||
| { | |||||
| int s = 0, i, j; | |||||
| uint32_t *sq = ff_square_tab + 256; | |||||
| for (i = 0; i < 16; i++) { | |||||
| for (j = 0; j < 16; j += 8) { | |||||
| #if 0 | |||||
| s += sq[pix[0]]; | |||||
| s += sq[pix[1]]; | |||||
| s += sq[pix[2]]; | |||||
| s += sq[pix[3]]; | |||||
| s += sq[pix[4]]; | |||||
| s += sq[pix[5]]; | |||||
| s += sq[pix[6]]; | |||||
| s += sq[pix[7]]; | |||||
| #else | |||||
| #if HAVE_FAST_64BIT | |||||
| register uint64_t x = *(uint64_t *) pix; | |||||
| s += sq[x & 0xff]; | |||||
| s += sq[(x >> 8) & 0xff]; | |||||
| s += sq[(x >> 16) & 0xff]; | |||||
| s += sq[(x >> 24) & 0xff]; | |||||
| s += sq[(x >> 32) & 0xff]; | |||||
| s += sq[(x >> 40) & 0xff]; | |||||
| s += sq[(x >> 48) & 0xff]; | |||||
| s += sq[(x >> 56) & 0xff]; | |||||
| #else | |||||
| register uint32_t x = *(uint32_t *) pix; | |||||
| s += sq[x & 0xff]; | |||||
| s += sq[(x >> 8) & 0xff]; | |||||
| s += sq[(x >> 16) & 0xff]; | |||||
| s += sq[(x >> 24) & 0xff]; | |||||
| x = *(uint32_t *) (pix + 4); | |||||
| s += sq[x & 0xff]; | |||||
| s += sq[(x >> 8) & 0xff]; | |||||
| s += sq[(x >> 16) & 0xff]; | |||||
| s += sq[(x >> 24) & 0xff]; | |||||
| #endif | |||||
| #endif | |||||
| pix += 8; | |||||
| } | |||||
| pix += line_size - 16; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | static int sse4_c(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | ||||
| int line_size, int h) | int line_size, int h) | ||||
| { | { | ||||
| @@ -1055,9 +984,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) | |||||
| c->sum_abs_dctelem = sum_abs_dctelem_c; | c->sum_abs_dctelem = sum_abs_dctelem_c; | ||||
| c->pix_sum = pix_sum_c; | |||||
| c->pix_norm1 = pix_norm1_c; | |||||
| /* TODO [0] 16 [1] 8 */ | /* TODO [0] 16 [1] 8 */ | ||||
| c->pix_abs[0][0] = pix_abs16_c; | c->pix_abs[0][0] = pix_abs16_c; | ||||
| c->pix_abs[0][1] = pix_abs16_x2_c; | c->pix_abs[0][1] = pix_abs16_x2_c; | ||||
| @@ -1097,11 +1023,6 @@ av_cold void ff_dsputil_init(DSPContext *c, AVCodecContext *avctx) | |||||
| c->nsse[0] = nsse16_c; | c->nsse[0] = nsse16_c; | ||||
| c->nsse[1] = nsse8_c; | c->nsse[1] = nsse8_c; | ||||
| c->shrink[0] = av_image_copy_plane; | |||||
| c->shrink[1] = ff_shrink22; | |||||
| c->shrink[2] = ff_shrink44; | |||||
| c->shrink[3] = ff_shrink88; | |||||
| c->draw_edges = draw_edges_8_c; | c->draw_edges = draw_edges_8_c; | ||||
| switch (avctx->bits_per_raw_sample) { | switch (avctx->bits_per_raw_sample) { | ||||
| @@ -58,9 +58,6 @@ typedef struct DSPContext { | |||||
| int stride); | int stride); | ||||
| int (*sum_abs_dctelem)(int16_t *block /* align 16 */); | int (*sum_abs_dctelem)(int16_t *block /* align 16 */); | ||||
| int (*pix_sum)(uint8_t *pix, int line_size); | |||||
| int (*pix_norm1)(uint8_t *pix, int line_size); | |||||
| me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ | me_cmp_func sad[6]; /* identical to pix_absAxA except additional void * */ | ||||
| me_cmp_func sse[6]; | me_cmp_func sse[6]; | ||||
| me_cmp_func hadamard8_diff[6]; | me_cmp_func hadamard8_diff[6]; | ||||
| @@ -92,9 +89,6 @@ typedef struct DSPContext { | |||||
| #define EDGE_WIDTH 16 | #define EDGE_WIDTH 16 | ||||
| #define EDGE_TOP 1 | #define EDGE_TOP 1 | ||||
| #define EDGE_BOTTOM 2 | #define EDGE_BOTTOM 2 | ||||
| void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, | |||||
| int src_wrap, int width, int height); | |||||
| } DSPContext; | } DSPContext; | ||||
| void ff_dsputil_static_init(void); | void ff_dsputil_static_init(void); | ||||
| @@ -881,8 +881,9 @@ void ff_estimate_p_frame_motion(MpegEncContext * s, | |||||
| /* intra / predictive decision */ | /* intra / predictive decision */ | ||||
| pix = c->src[0][0]; | pix = c->src[0][0]; | ||||
| sum = s->dsp.pix_sum(pix, s->linesize); | |||||
| varc = s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500; | |||||
| sum = s->mpvencdsp.pix_sum(pix, s->linesize); | |||||
| varc = s->mpvencdsp.pix_norm1(pix, s->linesize) - | |||||
| (((unsigned) sum * sum) >> 8) + 500; | |||||
| pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; | pic->mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; | ||||
| pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8; | pic->mb_var [s->mb_stride * mb_y + mb_x] = (varc+128)>>8; | ||||
| @@ -895,7 +895,7 @@ static int get_intra_count(MpegEncContext *s, uint8_t *src, | |||||
| int offset = x + y * stride; | int offset = x + y * stride; | ||||
| int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride, | int sad = s->dsp.sad[0](NULL, src + offset, ref + offset, stride, | ||||
| 16); | 16); | ||||
| int mean = (s->dsp.pix_sum(src + offset, stride) + 128) >> 8; | |||||
| int mean = (s->mpvencdsp.pix_sum(src + offset, stride) + 128) >> 8; | |||||
| int sae = get_sae(src + offset, mean, stride); | int sae = get_sae(src + offset, mean, stride); | ||||
| acc += sae + 500 < sad; | acc += sae + 500 < sad; | ||||
| @@ -1138,15 +1138,21 @@ static int estimate_best_b_count(MpegEncContext *s) | |||||
| pre_input.f->data[2] += INPLACE_OFFSET; | pre_input.f->data[2] += INPLACE_OFFSET; | ||||
| } | } | ||||
| s->dsp.shrink[scale](s->tmp_frames[i]->data[0], s->tmp_frames[i]->linesize[0], | |||||
| pre_input.f->data[0], pre_input.f->linesize[0], | |||||
| c->width, c->height); | |||||
| s->dsp.shrink[scale](s->tmp_frames[i]->data[1], s->tmp_frames[i]->linesize[1], | |||||
| pre_input.f->data[1], pre_input.f->linesize[1], | |||||
| c->width >> 1, c->height >> 1); | |||||
| s->dsp.shrink[scale](s->tmp_frames[i]->data[2], s->tmp_frames[i]->linesize[2], | |||||
| pre_input.f->data[2], pre_input.f->linesize[2], | |||||
| c->width >> 1, c->height >> 1); | |||||
| s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[0], | |||||
| s->tmp_frames[i]->linesize[0], | |||||
| pre_input.f->data[0], | |||||
| pre_input.f->linesize[0], | |||||
| c->width, c->height); | |||||
| s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[1], | |||||
| s->tmp_frames[i]->linesize[1], | |||||
| pre_input.f->data[1], | |||||
| pre_input.f->linesize[1], | |||||
| c->width >> 1, c->height >> 1); | |||||
| s->mpvencdsp.shrink[scale](s->tmp_frames[i]->data[2], | |||||
| s->tmp_frames[i]->linesize[2], | |||||
| pre_input.f->data[2], | |||||
| pre_input.f->linesize[2], | |||||
| c->width >> 1, c->height >> 1); | |||||
| } | } | ||||
| } | } | ||||
| @@ -2420,9 +2426,10 @@ static int mb_var_thread(AVCodecContext *c, void *arg){ | |||||
| int yy = mb_y * 16; | int yy = mb_y * 16; | ||||
| uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx; | uint8_t *pix = s->new_picture.f->data[0] + (yy * s->linesize) + xx; | ||||
| int varc; | int varc; | ||||
| int sum = s->dsp.pix_sum(pix, s->linesize); | |||||
| int sum = s->mpvencdsp.pix_sum(pix, s->linesize); | |||||
| varc = (s->dsp.pix_norm1(pix, s->linesize) - (((unsigned)sum*sum)>>8) + 500 + 128)>>8; | |||||
| varc = (s->mpvencdsp.pix_norm1(pix, s->linesize) - | |||||
| (((unsigned) sum * sum) >> 8) + 500 + 128) >> 8; | |||||
| s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc; | s->current_picture.mb_var [s->mb_stride * mb_y + mb_x] = varc; | ||||
| s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; | s->current_picture.mb_mean[s->mb_stride * mb_y + mb_x] = (sum+128)>>8; | ||||
| @@ -21,7 +21,10 @@ | |||||
| #include "config.h" | #include "config.h" | ||||
| #include "libavutil/attributes.h" | #include "libavutil/attributes.h" | ||||
| #include "libavutil/imgutils.h" | |||||
| #include "avcodec.h" | #include "avcodec.h" | ||||
| #include "dsputil.h" | |||||
| #include "imgconvert.h" | |||||
| #include "mpegvideoencdsp.h" | #include "mpegvideoencdsp.h" | ||||
| static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], | static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], | ||||
| @@ -53,12 +56,92 @@ static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale) | |||||
| (BASIS_SHIFT - RECON_SHIFT); | (BASIS_SHIFT - RECON_SHIFT); | ||||
| } | } | ||||
| static int pix_sum_c(uint8_t *pix, int line_size) | |||||
| { | |||||
| int s = 0, i, j; | |||||
| for (i = 0; i < 16; i++) { | |||||
| for (j = 0; j < 16; j += 8) { | |||||
| s += pix[0]; | |||||
| s += pix[1]; | |||||
| s += pix[2]; | |||||
| s += pix[3]; | |||||
| s += pix[4]; | |||||
| s += pix[5]; | |||||
| s += pix[6]; | |||||
| s += pix[7]; | |||||
| pix += 8; | |||||
| } | |||||
| pix += line_size - 16; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| static int pix_norm1_c(uint8_t *pix, int line_size) | |||||
| { | |||||
| int s = 0, i, j; | |||||
| uint32_t *sq = ff_square_tab + 256; | |||||
| for (i = 0; i < 16; i++) { | |||||
| for (j = 0; j < 16; j += 8) { | |||||
| #if 0 | |||||
| s += sq[pix[0]]; | |||||
| s += sq[pix[1]]; | |||||
| s += sq[pix[2]]; | |||||
| s += sq[pix[3]]; | |||||
| s += sq[pix[4]]; | |||||
| s += sq[pix[5]]; | |||||
| s += sq[pix[6]]; | |||||
| s += sq[pix[7]]; | |||||
| #else | |||||
| #if HAVE_FAST_64BIT | |||||
| register uint64_t x = *(uint64_t *) pix; | |||||
| s += sq[x & 0xff]; | |||||
| s += sq[(x >> 8) & 0xff]; | |||||
| s += sq[(x >> 16) & 0xff]; | |||||
| s += sq[(x >> 24) & 0xff]; | |||||
| s += sq[(x >> 32) & 0xff]; | |||||
| s += sq[(x >> 40) & 0xff]; | |||||
| s += sq[(x >> 48) & 0xff]; | |||||
| s += sq[(x >> 56) & 0xff]; | |||||
| #else | |||||
| register uint32_t x = *(uint32_t *) pix; | |||||
| s += sq[x & 0xff]; | |||||
| s += sq[(x >> 8) & 0xff]; | |||||
| s += sq[(x >> 16) & 0xff]; | |||||
| s += sq[(x >> 24) & 0xff]; | |||||
| x = *(uint32_t *) (pix + 4); | |||||
| s += sq[x & 0xff]; | |||||
| s += sq[(x >> 8) & 0xff]; | |||||
| s += sq[(x >> 16) & 0xff]; | |||||
| s += sq[(x >> 24) & 0xff]; | |||||
| #endif | |||||
| #endif | |||||
| pix += 8; | |||||
| } | |||||
| pix += line_size - 16; | |||||
| } | |||||
| return s; | |||||
| } | |||||
| av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, | av_cold void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, | ||||
| AVCodecContext *avctx) | AVCodecContext *avctx) | ||||
| { | { | ||||
| c->try_8x8basis = try_8x8basis_c; | c->try_8x8basis = try_8x8basis_c; | ||||
| c->add_8x8basis = add_8x8basis_c; | c->add_8x8basis = add_8x8basis_c; | ||||
| c->shrink[0] = av_image_copy_plane; | |||||
| c->shrink[1] = ff_shrink22; | |||||
| c->shrink[2] = ff_shrink44; | |||||
| c->shrink[3] = ff_shrink88; | |||||
| c->pix_sum = pix_sum_c; | |||||
| c->pix_norm1 = pix_norm1_c; | |||||
| if (ARCH_ARM) | |||||
| ff_mpegvideoencdsp_init_arm(c, avctx); | |||||
| if (ARCH_PPC) | |||||
| ff_mpegvideoencdsp_init_ppc(c, avctx); | |||||
| if (ARCH_X86) | if (ARCH_X86) | ||||
| ff_mpegvideoencdsp_init_x86(c, avctx); | ff_mpegvideoencdsp_init_x86(c, avctx); | ||||
| } | } | ||||
| @@ -31,10 +31,19 @@ typedef struct MpegvideoEncDSPContext { | |||||
| int16_t basis[64], int scale); | int16_t basis[64], int scale); | ||||
| void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale); | void (*add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale); | ||||
| int (*pix_sum)(uint8_t *pix, int line_size); | |||||
| int (*pix_norm1)(uint8_t *pix, int line_size); | |||||
| void (*shrink[4])(uint8_t *dst, int dst_wrap, const uint8_t *src, | |||||
| int src_wrap, int width, int height); | |||||
| } MpegvideoEncDSPContext; | } MpegvideoEncDSPContext; | ||||
| void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, | void ff_mpegvideoencdsp_init(MpegvideoEncDSPContext *c, | ||||
| AVCodecContext *avctx); | AVCodecContext *avctx); | ||||
| void ff_mpegvideoencdsp_init_arm(MpegvideoEncDSPContext *c, | |||||
| AVCodecContext *avctx); | |||||
| void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, | |||||
| AVCodecContext *avctx); | |||||
| void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | ||||
| AVCodecContext *avctx); | AVCodecContext *avctx); | ||||
| @@ -13,6 +13,7 @@ OBJS-$(CONFIG_IDCTDSP) += ppc/idctdsp.o | |||||
| OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o | OBJS-$(CONFIG_MPEGAUDIODSP) += ppc/mpegaudiodsp_altivec.o | ||||
| OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ | OBJS-$(CONFIG_MPEGVIDEO) += ppc/mpegvideo_altivec.o \ | ||||
| ppc/mpegvideodsp.o | ppc/mpegvideodsp.o | ||||
| OBJS-$(CONFIG_MPEGVIDEOENC) += ppc/mpegvideoencdsp.o | |||||
| OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o | OBJS-$(CONFIG_VIDEODSP) += ppc/videodsp_ppc.o | ||||
| OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o | OBJS-$(CONFIG_VP3DSP) += ppc/vp3dsp_altivec.o | ||||
| @@ -308,34 +308,6 @@ static int sad8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |||||
| return s; | return s; | ||||
| } | } | ||||
| static int pix_norm1_altivec(uint8_t *pix, int line_size) | |||||
| { | |||||
| int i, s = 0; | |||||
| const vector unsigned int zero = | |||||
| (const vector unsigned int) vec_splat_u32(0); | |||||
| vector unsigned char perm = vec_lvsl(0, pix); | |||||
| vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); | |||||
| vector signed int sum; | |||||
| for (i = 0; i < 16; i++) { | |||||
| /* Read the potentially unaligned pixels. */ | |||||
| vector unsigned char pixl = vec_ld(0, pix); | |||||
| vector unsigned char pixr = vec_ld(15, pix); | |||||
| vector unsigned char pixv = vec_perm(pixl, pixr, perm); | |||||
| /* Square the values, and add them to our sum. */ | |||||
| sv = vec_msum(pixv, pixv, sv); | |||||
| pix += line_size; | |||||
| } | |||||
| /* Sum up the four partial sums, and put the result into s. */ | |||||
| sum = vec_sums((vector signed int) sv, (vector signed int) zero); | |||||
| sum = vec_splat(sum, 3); | |||||
| vec_ste(sum, 0, &s); | |||||
| return s; | |||||
| } | |||||
| /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. | /* Sum of Squared Errors for an 8x8 block, AltiVec-enhanced. | ||||
| * It's the sad8_altivec code above w/ squaring added. */ | * It's the sad8_altivec code above w/ squaring added. */ | ||||
| static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | static int sse8_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | ||||
| @@ -430,35 +402,6 @@ static int sse16_altivec(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2, | |||||
| return s; | return s; | ||||
| } | } | ||||
| static int pix_sum_altivec(uint8_t *pix, int line_size) | |||||
| { | |||||
| int i, s; | |||||
| const vector unsigned int zero = | |||||
| (const vector unsigned int) vec_splat_u32(0); | |||||
| vector unsigned char perm = vec_lvsl(0, pix); | |||||
| vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); | |||||
| vector signed int sumdiffs; | |||||
| for (i = 0; i < 16; i++) { | |||||
| /* Read the potentially unaligned 16 pixels into t1. */ | |||||
| vector unsigned char pixl = vec_ld(0, pix); | |||||
| vector unsigned char pixr = vec_ld(15, pix); | |||||
| vector unsigned char t1 = vec_perm(pixl, pixr, perm); | |||||
| /* Add each 4 pixel group together and put 4 results into sad. */ | |||||
| sad = vec_sum4s(t1, sad); | |||||
| pix += line_size; | |||||
| } | |||||
| /* Sum up the four partial sums, and put the result into s. */ | |||||
| sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |||||
| sumdiffs = vec_splat(sumdiffs, 3); | |||||
| vec_ste(sumdiffs, 0, &s); | |||||
| return s; | |||||
| } | |||||
| static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, | static void get_pixels_altivec(int16_t *restrict block, const uint8_t *pixels, | ||||
| int line_size) | int line_size) | ||||
| { | { | ||||
| @@ -911,9 +854,6 @@ av_cold void ff_dsputil_init_altivec(DSPContext *c, AVCodecContext *avctx, | |||||
| c->sse[0] = sse16_altivec; | c->sse[0] = sse16_altivec; | ||||
| c->sse[1] = sse8_altivec; | c->sse[1] = sse8_altivec; | ||||
| c->pix_norm1 = pix_norm1_altivec; | |||||
| c->pix_sum = pix_sum_altivec; | |||||
| c->diff_pixels = diff_pixels_altivec; | c->diff_pixels = diff_pixels_altivec; | ||||
| if (!high_bit_depth) { | if (!high_bit_depth) { | ||||
| @@ -0,0 +1,103 @@ | |||||
| /* | |||||
| * This file is part of Libav. | |||||
| * | |||||
| * Libav is free software; you can redistribute it and/or | |||||
| * modify it under the terms of the GNU Lesser General Public | |||||
| * License as published by the Free Software Foundation; either | |||||
| * version 2.1 of the License, or (at your option) any later version. | |||||
| * | |||||
| * Libav is distributed in the hope that it will be useful, | |||||
| * but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| * Lesser General Public License for more details. | |||||
| * | |||||
| * You should have received a copy of the GNU Lesser General Public | |||||
| * License along with Libav; if not, write to the Free Software | |||||
| * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| */ | |||||
| #include "config.h" | |||||
| #include <stdint.h> | |||||
| #if HAVE_ALTIVEC_H | |||||
| #include <altivec.h> | |||||
| #endif | |||||
| #include "libavutil/attributes.h" | |||||
| #include "libavutil/cpu.h" | |||||
| #include "libavutil/ppc/cpu.h" | |||||
| #include "libavutil/ppc/types_altivec.h" | |||||
| #include "libavutil/ppc/util_altivec.h" | |||||
| #include "libavcodec/mpegvideoencdsp.h" | |||||
| #if HAVE_ALTIVEC | |||||
| static int pix_norm1_altivec(uint8_t *pix, int line_size) | |||||
| { | |||||
| int i, s = 0; | |||||
| const vector unsigned int zero = | |||||
| (const vector unsigned int) vec_splat_u32(0); | |||||
| vector unsigned char perm = vec_lvsl(0, pix); | |||||
| vector unsigned int sv = (vector unsigned int) vec_splat_u32(0); | |||||
| vector signed int sum; | |||||
| for (i = 0; i < 16; i++) { | |||||
| /* Read the potentially unaligned pixels. */ | |||||
| vector unsigned char pixl = vec_ld(0, pix); | |||||
| vector unsigned char pixr = vec_ld(15, pix); | |||||
| vector unsigned char pixv = vec_perm(pixl, pixr, perm); | |||||
| /* Square the values, and add them to our sum. */ | |||||
| sv = vec_msum(pixv, pixv, sv); | |||||
| pix += line_size; | |||||
| } | |||||
| /* Sum up the four partial sums, and put the result into s. */ | |||||
| sum = vec_sums((vector signed int) sv, (vector signed int) zero); | |||||
| sum = vec_splat(sum, 3); | |||||
| vec_ste(sum, 0, &s); | |||||
| return s; | |||||
| } | |||||
| static int pix_sum_altivec(uint8_t *pix, int line_size) | |||||
| { | |||||
| int i, s; | |||||
| const vector unsigned int zero = | |||||
| (const vector unsigned int) vec_splat_u32(0); | |||||
| vector unsigned char perm = vec_lvsl(0, pix); | |||||
| vector unsigned int sad = (vector unsigned int) vec_splat_u32(0); | |||||
| vector signed int sumdiffs; | |||||
| for (i = 0; i < 16; i++) { | |||||
| /* Read the potentially unaligned 16 pixels into t1. */ | |||||
| vector unsigned char pixl = vec_ld(0, pix); | |||||
| vector unsigned char pixr = vec_ld(15, pix); | |||||
| vector unsigned char t1 = vec_perm(pixl, pixr, perm); | |||||
| /* Add each 4 pixel group together and put 4 results into sad. */ | |||||
| sad = vec_sum4s(t1, sad); | |||||
| pix += line_size; | |||||
| } | |||||
| /* Sum up the four partial sums, and put the result into s. */ | |||||
| sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero); | |||||
| sumdiffs = vec_splat(sumdiffs, 3); | |||||
| vec_ste(sumdiffs, 0, &s); | |||||
| return s; | |||||
| } | |||||
| #endif /* HAVE_ALTIVEC */ | |||||
| av_cold void ff_mpegvideoencdsp_init_ppc(MpegvideoEncDSPContext *c, | |||||
| AVCodecContext *avctx) | |||||
| { | |||||
| #if HAVE_ALTIVEC | |||||
| if (!PPC_ALTIVEC(av_get_cpu_flags())) | |||||
| return; | |||||
| c->pix_norm1 = pix_norm1_altivec; | |||||
| c->pix_sum = pix_sum_altivec; | |||||
| #endif /* HAVE_ALTIVEC */ | |||||
| } | |||||
| @@ -511,6 +511,7 @@ static av_cold int svq1_encode_init(AVCodecContext *avctx) | |||||
| ff_dsputil_init(&s->dsp, avctx); | ff_dsputil_init(&s->dsp, avctx); | ||||
| ff_hpeldsp_init(&s->hdsp, avctx->flags); | ff_hpeldsp_init(&s->hdsp, avctx->flags); | ||||
| ff_mpegvideoencdsp_init(&s->m.mpvencdsp, avctx); | |||||
| avctx->coded_frame = av_frame_alloc(); | avctx->coded_frame = av_frame_alloc(); | ||||
| s->current_picture = av_frame_alloc(); | s->current_picture = av_frame_alloc(); | ||||
| @@ -92,6 +92,7 @@ YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \ | |||||
| x86/hpeldsp.o | x86/hpeldsp.o | ||||
| YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o | YASM-OBJS-$(CONFIG_HUFFYUVDSP) += x86/huffyuvdsp.o | ||||
| YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o | YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o | ||||
| YASM-OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoencdsp.o | |||||
| YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ | YASM-OBJS-$(CONFIG_QPELDSP) += x86/qpeldsp.o \ | ||||
| x86/fpel.o \ | x86/fpel.o \ | ||||
| x86/qpel.o | x86/qpel.o | ||||
| @@ -418,72 +418,3 @@ cglobal diff_pixels, 4,5 | |||||
| add r4, 16 | add r4, 16 | ||||
| jne .loop | jne .loop | ||||
| REP_RET | REP_RET | ||||
| INIT_MMX mmx | |||||
| ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) | |||||
| ; Adds up 16 bytes from each row it visits and returns the total masked | |||||
| ; to 16 bits. With a positive line_size the loop visits 16 rows, i.e. | |||||
| ; 256 bytes, whose maximum sum (256 * 255 = 65280) fits in 16 bits. | |||||
| ; r0 / r1 are used as pix / line_size (argument order of the prototype | |||||
| ; above); r2 is a negative byte offset that counts up towards zero. | |||||
| cglobal pix_sum16, 2, 3 | |||||
| ; NOTE(review): presumably sign-extends the 32-bit line_size to pointer | |||||
| ; width where needed - confirm against the macro definition. | |||||
| movsxdifnidn r1, r1d | |||||
| ; r2 = -16 * line_size; r0 is moved forward by the same amount, so that | |||||
| ; r0 + r2 initially addresses the original pix. | |||||
| mov r2, r1 | |||||
| neg r2 | |||||
| shl r2, 4 | |||||
| sub r0, r2 | |||||
| ; m7 = 0 (used to zero-extend bytes), m6 = 0 (four 16-bit partial sums). | |||||
| pxor m7, m7 | |||||
| pxor m6, m6 | |||||
| .loop: | |||||
| ; Load bytes 0-7 of the row twice and bytes 8-15 twice, so that the low | |||||
| ; and high halves of each load can be unpacked separately. | |||||
| mova m0, [r0+r2+0] | |||||
| mova m1, [r0+r2+0] | |||||
| mova m2, [r0+r2+8] | |||||
| mova m3, [r0+r2+8] | |||||
| ; Interleave with the zero register: bytes become 16-bit words. | |||||
| punpcklbw m0, m7 | |||||
| punpckhbw m1, m7 | |||||
| punpcklbw m2, m7 | |||||
| punpckhbw m3, m7 | |||||
| ; Word-wise reduction of the four registers into the accumulator m6. | |||||
| paddw m1, m0 | |||||
| paddw m3, m2 | |||||
| paddw m3, m1 | |||||
| paddw m6, m3 | |||||
| ; Step the offset by one row; keep looping while it is still negative. | |||||
| add r2, r1 | |||||
| js .loop | |||||
| ; Horizontal add: fold the upper two words onto the lower two ... | |||||
| mova m5, m6 | |||||
| psrlq m6, 32 | |||||
| paddw m6, m5 | |||||
| ; ... then fold word 1 onto word 0, leaving the total in the low word. | |||||
| mova m5, m6 | |||||
| psrlq m6, 16 | |||||
| paddw m6, m5 | |||||
| ; Return the low word only; bits 16-31 hold a leftover partial sum. | |||||
| movd eax, m6 | |||||
| and eax, 0xffff | |||||
| RET | |||||
| INIT_MMX mmx | |||||
| ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) | |||||
| ; Returns the sum of the squares of 16 bytes from each of 16 rows, | |||||
| ; advancing by line_size bytes between rows (low 32 bits of the total). | |||||
| ; r0 / r1 are used as pix / line_size (argument order of the prototype | |||||
| ; above); r2 is the row counter. | |||||
| ; NOTE(review): 4 registers are requested but only r0-r2 are referenced | |||||
| ; in this body. | |||||
| cglobal pix_norm1, 2, 4 | |||||
| ; NOTE(review): presumably sign-extends the 32-bit line_size to pointer | |||||
| ; width where needed - confirm against the macro definition. | |||||
| movsxdifnidn r1, r1d | |||||
| ; r2 = 16 rows to process; m0 = 0 (for unpacking); m7 = 0 (accumulator | |||||
| ; holding two 32-bit partial sums). | |||||
| mov r2, 16 | |||||
| pxor m0, m0 | |||||
| pxor m7, m7 | |||||
| .loop: | |||||
| ; Load bytes 0-7 and 8-15 of the current row. | |||||
| mova m2, [r0+0] | |||||
| mova m3, [r0+8] | |||||
| ; Copy each load, then zero-extend its high and low halves to words. | |||||
| mova m1, m2 | |||||
| punpckhbw m1, m0 | |||||
| punpcklbw m2, m0 | |||||
| mova m4, m3 | |||||
| punpckhbw m3, m0 | |||||
| punpcklbw m4, m0 | |||||
| ; Multiplying each register by itself squares every word; pmaddwd also | |||||
| ; adds adjacent products, giving two 32-bit sums per register. | |||||
| pmaddwd m1, m1 | |||||
| pmaddwd m2, m2 | |||||
| pmaddwd m3, m3 | |||||
| pmaddwd m4, m4 | |||||
| ; Accumulate the four results into m7; the pointer advance to the next | |||||
| ; row is interleaved between the two accumulator adds. | |||||
| paddd m2, m1 | |||||
| paddd m4, m3 | |||||
| paddd m7, m2 | |||||
| add r0, r1 | |||||
| paddd m7, m4 | |||||
| ; Loop until the row counter reaches zero. | |||||
| dec r2 | |||||
| jne .loop | |||||
| ; Horizontal add: upper dword + lower dword, result in the low dword. | |||||
| mova m1, m7 | |||||
| psrlq m7, 32 | |||||
| paddd m1, m7 | |||||
| movd eax, m1 | |||||
| RET | |||||
| @@ -35,8 +35,6 @@ void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size); | |||||
| void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size); | void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size); | ||||
| void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, | void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, | ||||
| int stride); | int stride); | ||||
| int ff_pix_sum16_mmx(uint8_t *pix, int line_size); | |||||
| int ff_pix_norm1_mmx(uint8_t *pix, int line_size); | |||||
| #if HAVE_INLINE_ASM | #if HAVE_INLINE_ASM | ||||
| @@ -831,8 +829,6 @@ av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx, | |||||
| if (!high_bit_depth) | if (!high_bit_depth) | ||||
| c->get_pixels = ff_get_pixels_mmx; | c->get_pixels = ff_get_pixels_mmx; | ||||
| c->diff_pixels = ff_diff_pixels_mmx; | c->diff_pixels = ff_diff_pixels_mmx; | ||||
| c->pix_sum = ff_pix_sum16_mmx; | |||||
| c->pix_norm1 = ff_pix_norm1_mmx; | |||||
| } | } | ||||
| if (EXTERNAL_SSE2(cpu_flags)) | if (EXTERNAL_SSE2(cpu_flags)) | ||||
| @@ -0,0 +1,95 @@ | |||||
| ;***************************************************************************** | |||||
| ;* SIMD-optimized MPEG encoding functions | |||||
| ;***************************************************************************** | |||||
| ;* Copyright (c) 2000, 2001 Fabrice Bellard | |||||
| ;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> | |||||
| ;* | |||||
| ;* This file is part of Libav. | |||||
| ;* | |||||
| ;* Libav is free software; you can redistribute it and/or | |||||
| ;* modify it under the terms of the GNU Lesser General Public | |||||
| ;* License as published by the Free Software Foundation; either | |||||
| ;* version 2.1 of the License, or (at your option) any later version. | |||||
| ;* | |||||
| ;* Libav is distributed in the hope that it will be useful, | |||||
| ;* but WITHOUT ANY WARRANTY; without even the implied warranty of | |||||
| ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |||||
| ;* Lesser General Public License for more details. | |||||
| ;* | |||||
| ;* You should have received a copy of the GNU Lesser General Public | |||||
| ;* License along with Libav; if not, write to the Free Software | |||||
| ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA | |||||
| ;***************************************************************************** | |||||
| %include "libavutil/x86/x86util.asm" | |||||
| SECTION .text | |||||
| INIT_MMX mmx | |||||
| ; int ff_pix_sum16_mmx(uint8_t *pix, int line_size) | |||||
| ; Adds up 16 bytes from each row it visits and returns the total masked | |||||
| ; to 16 bits. With a positive line_size the loop visits 16 rows, i.e. | |||||
| ; 256 bytes, whose maximum sum (256 * 255 = 65280) fits in 16 bits. | |||||
| ; r0 / r1 are used as pix / line_size (argument order of the prototype | |||||
| ; above); r2 is a negative byte offset that counts up towards zero. | |||||
| cglobal pix_sum16, 2, 3 | |||||
| ; NOTE(review): presumably sign-extends the 32-bit line_size to pointer | |||||
| ; width where needed - confirm against the macro definition. | |||||
| movsxdifnidn r1, r1d | |||||
| ; r2 = -16 * line_size; r0 is moved forward by the same amount, so that | |||||
| ; r0 + r2 initially addresses the original pix. | |||||
| mov r2, r1 | |||||
| neg r2 | |||||
| shl r2, 4 | |||||
| sub r0, r2 | |||||
| ; m7 = 0 (used to zero-extend bytes), m6 = 0 (four 16-bit partial sums). | |||||
| pxor m7, m7 | |||||
| pxor m6, m6 | |||||
| .loop: | |||||
| ; Load bytes 0-7 of the row twice and bytes 8-15 twice, so that the low | |||||
| ; and high halves of each load can be unpacked separately. | |||||
| mova m0, [r0+r2+0] | |||||
| mova m1, [r0+r2+0] | |||||
| mova m2, [r0+r2+8] | |||||
| mova m3, [r0+r2+8] | |||||
| ; Interleave with the zero register: bytes become 16-bit words. | |||||
| punpcklbw m0, m7 | |||||
| punpckhbw m1, m7 | |||||
| punpcklbw m2, m7 | |||||
| punpckhbw m3, m7 | |||||
| ; Word-wise reduction of the four registers into the accumulator m6. | |||||
| paddw m1, m0 | |||||
| paddw m3, m2 | |||||
| paddw m3, m1 | |||||
| paddw m6, m3 | |||||
| ; Step the offset by one row; keep looping while it is still negative. | |||||
| add r2, r1 | |||||
| js .loop | |||||
| ; Horizontal add: fold the upper two words onto the lower two ... | |||||
| mova m5, m6 | |||||
| psrlq m6, 32 | |||||
| paddw m6, m5 | |||||
| ; ... then fold word 1 onto word 0, leaving the total in the low word. | |||||
| mova m5, m6 | |||||
| psrlq m6, 16 | |||||
| paddw m6, m5 | |||||
| ; Return the low word only; bits 16-31 hold a leftover partial sum. | |||||
| movd eax, m6 | |||||
| and eax, 0xffff | |||||
| RET | |||||
| INIT_MMX mmx | |||||
| ; int ff_pix_norm1_mmx(uint8_t *pix, int line_size) | |||||
| ; Returns the sum of the squares of 16 bytes from each of 16 rows, | |||||
| ; advancing by line_size bytes between rows (low 32 bits of the total). | |||||
| ; r0 / r1 are used as pix / line_size (argument order of the prototype | |||||
| ; above); r2 is the row counter. | |||||
| ; NOTE(review): 4 registers are requested but only r0-r2 are referenced | |||||
| ; in this body. | |||||
| cglobal pix_norm1, 2, 4 | |||||
| ; NOTE(review): presumably sign-extends the 32-bit line_size to pointer | |||||
| ; width where needed - confirm against the macro definition. | |||||
| movsxdifnidn r1, r1d | |||||
| ; r2 = 16 rows to process; m0 = 0 (for unpacking); m7 = 0 (accumulator | |||||
| ; holding two 32-bit partial sums). | |||||
| mov r2, 16 | |||||
| pxor m0, m0 | |||||
| pxor m7, m7 | |||||
| .loop: | |||||
| ; Load bytes 0-7 and 8-15 of the current row. | |||||
| mova m2, [r0+0] | |||||
| mova m3, [r0+8] | |||||
| ; Copy each load, then zero-extend its high and low halves to words. | |||||
| mova m1, m2 | |||||
| punpckhbw m1, m0 | |||||
| punpcklbw m2, m0 | |||||
| mova m4, m3 | |||||
| punpckhbw m3, m0 | |||||
| punpcklbw m4, m0 | |||||
| ; Multiplying each register by itself squares every word; pmaddwd also | |||||
| ; adds adjacent products, giving two 32-bit sums per register. | |||||
| pmaddwd m1, m1 | |||||
| pmaddwd m2, m2 | |||||
| pmaddwd m3, m3 | |||||
| pmaddwd m4, m4 | |||||
| ; Accumulate the four results into m7; the pointer advance to the next | |||||
| ; row is interleaved between the two accumulator adds. | |||||
| paddd m2, m1 | |||||
| paddd m4, m3 | |||||
| paddd m7, m2 | |||||
| add r0, r1 | |||||
| paddd m7, m4 | |||||
| ; Loop until the row counter reaches zero. | |||||
| dec r2 | |||||
| jne .loop | |||||
| ; Horizontal add: upper dword + lower dword, result in the low dword. | |||||
| mova m1, m7 | |||||
| psrlq m7, 32 | |||||
| paddd m1, m7 | |||||
| movd eax, m1 | |||||
| RET | |||||
| @@ -22,6 +22,9 @@ | |||||
| #include "libavcodec/avcodec.h" | #include "libavcodec/avcodec.h" | ||||
| #include "libavcodec/mpegvideoencdsp.h" | #include "libavcodec/mpegvideoencdsp.h" | ||||
| int ff_pix_sum16_mmx(uint8_t *pix, int line_size); | |||||
| int ff_pix_norm1_mmx(uint8_t *pix, int line_size); | |||||
| #if HAVE_INLINE_ASM | #if HAVE_INLINE_ASM | ||||
| #define PHADDD(a, t) \ | #define PHADDD(a, t) \ | ||||
| @@ -95,9 +98,15 @@ | |||||
| av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | av_cold void ff_mpegvideoencdsp_init_x86(MpegvideoEncDSPContext *c, | ||||
| AVCodecContext *avctx) | AVCodecContext *avctx) | ||||
| { | { | ||||
| #if HAVE_INLINE_ASM | |||||
| int cpu_flags = av_get_cpu_flags(); | int cpu_flags = av_get_cpu_flags(); | ||||
| if (EXTERNAL_MMX(cpu_flags)) { | |||||
| c->pix_sum = ff_pix_sum16_mmx; | |||||
| c->pix_norm1 = ff_pix_norm1_mmx; | |||||
| } | |||||
| #if HAVE_INLINE_ASM | |||||
| if (INLINE_MMX(cpu_flags)) { | if (INLINE_MMX(cpu_flags)) { | ||||
| if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { | ||||
| c->try_8x8basis = try_8x8basis_mmx; | c->try_8x8basis = try_8x8basis_mmx; | ||||