Signed-off-by: Martin Storsjö <martin@martin.st>
@@ -4,4 +4,6 @@ OBJS += alpha/dsputil_alpha.o \
        alpha/motion_est_mvi_asm.o \
        alpha/simple_idct_alpha.o \
OBJS-$(CONFIG_HPELDSP) += alpha/hpeldsp_alpha.o \
                          alpha/hpeldsp_alpha_asm.o
OBJS-$(CONFIG_MPEGVIDEO) += alpha/mpegvideo_alpha.o
@@ -119,196 +119,11 @@ static void clear_blocks_axp(int16_t *blocks) {
    } while (n);
}
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
#define OP(LOAD, STORE) \
    do { \
        STORE(LOAD(pixels), block); \
        pixels += line_size; \
        block += line_size; \
    } while (--h)
#define OP_X2(LOAD, STORE) \
    do { \
        uint64_t pix1, pix2; \
\
        pix1 = LOAD(pixels); \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
        STORE(AVG2(pix1, pix2), block); \
        pixels += line_size; \
        block += line_size; \
    } while (--h)
#define OP_Y2(LOAD, STORE) \
    do { \
        uint64_t pix = LOAD(pixels); \
        do { \
            uint64_t next_pix; \
\
            pixels += line_size; \
            next_pix = LOAD(pixels); \
            STORE(AVG2(pix, next_pix), block); \
            block += line_size; \
            pix = next_pix; \
        } while (--h); \
    } while (0)
#define OP_XY2(LOAD, STORE) \
    do { \
        uint64_t pix1 = LOAD(pixels); \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
                       + (pix2 & BYTE_VEC(0x03)); \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
\
        do { \
            uint64_t npix1, npix2; \
            uint64_t npix_l, npix_h; \
            uint64_t avg; \
\
            pixels += line_size; \
            npix1 = LOAD(pixels); \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
            npix_l = (npix1 & BYTE_VEC(0x03)) \
                   + (npix2 & BYTE_VEC(0x03)); \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h; \
            STORE(avg, block); \
\
            block += line_size; \
            pix_l = npix_l; \
            pix_h = npix_h; \
        } while (--h); \
    } while (0)
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
static void OPNAME ## _pixels ## SUFF ## _axp \
    (uint8_t *restrict block, const uint8_t *restrict pixels, \
     ptrdiff_t line_size, int h) \
{ \
    if ((size_t) pixels & 0x7) { \
        OPKIND(uldq, STORE); \
    } else { \
        OPKIND(ldq, STORE); \
    } \
} \
\
static void OPNAME ## _pixels16 ## SUFF ## _axp \
    (uint8_t *restrict block, const uint8_t *restrict pixels, \
     ptrdiff_t line_size, int h) \
{ \
    OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}
#define PIXOP(OPNAME, STORE) \
    MAKE_OP(OPNAME, , OP, STORE) \
    MAKE_OP(OPNAME, _x2, OP_X2, STORE) \
    MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);
#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);
/* Not rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);
#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    put_pixels_axp_asm(block, pixels, line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}
av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
        c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
        c->avg_pixels_tab[0][0] = avg_pixels16_axp;
        c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
        c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
        c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
        c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
        c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
        c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
        c->put_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_pixels_tab[1][1] = put_pixels_x2_axp;
        c->put_pixels_tab[1][2] = put_pixels_y2_axp;
        c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
        c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
        c->avg_pixels_tab[1][0] = avg_pixels_axp;
        c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
        c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
        c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
        c->clear_blocks = clear_blocks_axp;
    }
@@ -26,8 +26,6 @@ void ff_simple_idct_axp(int16_t *block);
void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
                                int line_size);
void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
@@ -26,114 +26,11 @@
#include "regdef.h"
/* Some nicer register names. */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument list and the return value */
#define te a5
#define tf a4
#define tg a3
#define th v0
        .set noat
        .set noreorder
        .arch pca56
        .text
/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0
        and a1, 7, t0
        beq t0, $aligned
        .align 4
$unaligned:
        ldq_u t0, 0(a1)
        ldq_u t1, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t2, 0(a1)
        ldq_u t3, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t4, 0(a1)
        ldq_u t5, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t6, 0(a1)
        ldq_u t7, 8(a1)
        extql t0, a1, t0
        addq a1, a2, a1
        extqh t1, a1, t1
        addq a0, a2, t8
        extql t2, a1, t2
        addq t8, a2, t9
        extqh t3, a1, t3
        addq t9, a2, ta
        extql t4, a1, t4
        or t0, t1, t0
        extqh t5, a1, t5
        or t2, t3, t2
        extql t6, a1, t6
        or t4, t5, t4
        extqh t7, a1, t7
        or t6, t7, t6
        stq t0, 0(a0)
        stq t2, 0(t8)
        stq t4, 0(t9)
        subq a3, 4, a3
        stq t6, 0(ta)
        addq ta, a2, a0
        bne a3, $unaligned
        ret
        .align 4
$aligned:
        ldq t0, 0(a1)
        addq a1, a2, a1
        ldq t1, 0(a1)
        addq a1, a2, a1
        ldq t2, 0(a1)
        addq a1, a2, a1
        ldq t3, 0(a1)
        addq a0, a2, t4
        addq a1, a2, a1
        addq t4, a2, t5
        subq a3, 4, a3
        stq t0, 0(a0)
        addq t5, a2, t6
        stq t1, 0(t4)
        addq t6, a2, a0
        stq t2, 0(t5)
        stq t3, 0(t6)
        bne a3, $aligned
        ret
        .end put_pixels_axp_asm
/************************************************************************
 * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
 *                                 int line_size)
@@ -0,0 +1,213 @@
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/attributes.h"
#include "libavcodec/hpeldsp.h"
#include "hpeldsp_alpha.h"
#include "asm.h"
static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
#if 0
/* The XY2 routines basically utilize this scheme, but reuse parts in
   each iteration. */
static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
#endif
#define OP(LOAD, STORE) \
    do { \
        STORE(LOAD(pixels), block); \
        pixels += line_size; \
        block += line_size; \
    } while (--h)
#define OP_X2(LOAD, STORE) \
    do { \
        uint64_t pix1, pix2; \
\
        pix1 = LOAD(pixels); \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
        STORE(AVG2(pix1, pix2), block); \
        pixels += line_size; \
        block += line_size; \
    } while (--h)
#define OP_Y2(LOAD, STORE) \
    do { \
        uint64_t pix = LOAD(pixels); \
        do { \
            uint64_t next_pix; \
\
            pixels += line_size; \
            next_pix = LOAD(pixels); \
            STORE(AVG2(pix, next_pix), block); \
            block += line_size; \
            pix = next_pix; \
        } while (--h); \
    } while (0)
#define OP_XY2(LOAD, STORE) \
    do { \
        uint64_t pix1 = LOAD(pixels); \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
        uint64_t pix_l = (pix1 & BYTE_VEC(0x03)) \
                       + (pix2 & BYTE_VEC(0x03)); \
        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2) \
                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2); \
\
        do { \
            uint64_t npix1, npix2; \
            uint64_t npix_l, npix_h; \
            uint64_t avg; \
\
            pixels += line_size; \
            npix1 = LOAD(pixels); \
            npix2 = npix1 >> 8 | ((uint64_t) pixels[8] << 56); \
            npix_l = (npix1 & BYTE_VEC(0x03)) \
                   + (npix2 & BYTE_VEC(0x03)); \
            npix_h = ((npix1 & ~BYTE_VEC(0x03)) >> 2) \
                   + ((npix2 & ~BYTE_VEC(0x03)) >> 2); \
            avg = (((pix_l + npix_l + AVG4_ROUNDER) >> 2) & BYTE_VEC(0x03)) \
                + pix_h + npix_h; \
            STORE(avg, block); \
\
            block += line_size; \
            pix_l = npix_l; \
            pix_h = npix_h; \
        } while (--h); \
    } while (0)
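/* Note on OP_XY2: each byte is split into its top six bits (already shifted
 * right by two and accumulated in pix_h/npix_h) and its low two bits
 * (accumulated in pix_l/npix_l), so four pixels can be summed per byte lane
 * without carries spilling into the neighbouring byte. The partial sums for
 * the current row are carried over to the next iteration, which is the reuse
 * that the avg4() sketch above alludes to. */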
#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE) \
static void OPNAME ## _pixels ## SUFF ## _axp \
    (uint8_t *restrict block, const uint8_t *restrict pixels, \
     ptrdiff_t line_size, int h) \
{ \
    if ((size_t) pixels & 0x7) { \
        OPKIND(uldq, STORE); \
    } else { \
        OPKIND(ldq, STORE); \
    } \
} \
\
static void OPNAME ## _pixels16 ## SUFF ## _axp \
    (uint8_t *restrict block, const uint8_t *restrict pixels, \
     ptrdiff_t line_size, int h) \
{ \
    OPNAME ## _pixels ## SUFF ## _axp(block, pixels, line_size, h); \
    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
}
#define PIXOP(OPNAME, STORE) \
    MAKE_OP(OPNAME, , OP, STORE) \
    MAKE_OP(OPNAME, _x2, OP_X2, STORE) \
    MAKE_OP(OPNAME, _y2, OP_Y2, STORE) \
    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
/* Rounding primitives. */
#define AVG2 avg2
#define AVG4 avg4
#define AVG4_ROUNDER BYTE_VEC(0x02)
#define STORE(l, b) stq(l, b)
PIXOP(put, STORE);
#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg, STORE);
/* Not rounding primitives. */
#undef AVG2
#undef AVG4
#undef AVG4_ROUNDER
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define AVG4_ROUNDER BYTE_VEC(0x01)
#define STORE(l, b) stq(l, b)
PIXOP(put_no_rnd, STORE);
#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(avg_no_rnd, STORE);
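/* The PIXOP/STORE combinations above expand into the full set of C halfpel
 * functions: put and avg, rounded and non-rounded, each in plain, _x2, _y2
 * and _xy2 variants for 8- and 16-pixel-wide blocks. STORE is a plain stq()
 * for the "put" family and an stq() of AVG2() with the existing destination
 * for the "avg" family. */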
static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
                                 ptrdiff_t line_size, int h)
{
    put_pixels_axp_asm(block, pixels, line_size, h);
    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
}
av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
{
    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
    c->avg_pixels_tab[1][0] = avg_pixels_axp;
    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
} |
@@ -0,0 +1,28 @@
/*
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
#include <stdint.h>
#include <stddef.h>
void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h);
#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */ |
@@ -0,0 +1,124 @@
/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
/*
 * These functions are scheduled for pca56. They should work
 * reasonably on ev6, though.
 */
#include "regdef.h"
        .set noat
        .set noreorder
        .arch pca56
        .text
/************************************************************************
 * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
 *                         int line_size, int h)
 */
        .align 6
        .globl put_pixels_axp_asm
        .ent put_pixels_axp_asm
put_pixels_axp_asm:
        .frame sp, 0, ra
        .prologue 0
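        /* a0 = block, a1 = pixels, a2 = line_size, a3 = h.
         * Dispatch on the source alignment; both loops below copy an
         * 8-byte-wide block four rows per iteration, so h is expected
         * to be a multiple of four. */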
        and a1, 7, t0
        beq t0, $aligned
        .align 4
$unaligned:
        ldq_u t0, 0(a1)
        ldq_u t1, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t2, 0(a1)
        ldq_u t3, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t4, 0(a1)
        ldq_u t5, 8(a1)
        addq a1, a2, a1
        nop
        ldq_u t6, 0(a1)
        ldq_u t7, 8(a1)
        extql t0, a1, t0
        addq a1, a2, a1
        extqh t1, a1, t1
        addq a0, a2, t8
        extql t2, a1, t2
        addq t8, a2, t9
        extqh t3, a1, t3
        addq t9, a2, ta
        extql t4, a1, t4
        or t0, t1, t0
        extqh t5, a1, t5
        or t2, t3, t2
        extql t6, a1, t6
        or t4, t5, t4
        extqh t7, a1, t7
        or t6, t7, t6
        stq t0, 0(a0)
        stq t2, 0(t8)
        stq t4, 0(t9)
        subq a3, 4, a3
        stq t6, 0(ta)
        addq ta, a2, a0
        bne a3, $unaligned
        ret
        .align 4
$aligned:
        ldq t0, 0(a1)
        addq a1, a2, a1
        ldq t1, 0(a1)
        addq a1, a2, a1
        ldq t2, 0(a1)
        addq a1, a2, a1
        ldq t3, 0(a1)
        addq a0, a2, t4
        addq a1, a2, a1
        addq t4, a2, t5
        subq a3, 4, a3
        stq t0, 0(a0)
        addq t5, a2, t6
        stq t1, 0(t4)
        addq t6, a2, a0
        stq t2, 0(t5)
        stq t3, 0(t6)
        bne a3, $aligned
        ret
        .end put_pixels_axp_asm
@@ -63,4 +63,15 @@
#define sp $30 /* stack pointer */
#define zero $31 /* reads as zero, writes are noops */
/* Some nicer register names. */
#define ta t10
#define tb t11
#define tc t12
#define td AT
/* Danger: these overlap with the argument list and the return value */
#define te a5
#define tf a4
#define tg a3
#define th v0
#endif /* AVCODEC_ALPHA_REGDEF_H */ |
@@ -54,6 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
    hpel_funcs(avg, [3], 2);
    hpel_funcs(avg_no_rnd,, 16);
    if (ARCH_ALPHA)
        ff_hpeldsp_init_alpha(c, flags);
    if (ARCH_ARM)
        ff_hpeldsp_init_arm(c, flags);
    if (ARCH_BFIN)
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
void ff_hpeldsp_init(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags); | |||