Signed-off-by: Martin Storsjö <martin@martin.st>
diff --git a/libavcodec/alpha/Makefile b/libavcodec/alpha/Makefile
@@ -4,4 +4,6 @@ OBJS += alpha/dsputil_alpha.o \
                                           alpha/motion_est_mvi_asm.o \
                                           alpha/simple_idct_alpha.o \
+OBJS-$(CONFIG_HPELDSP)                 += alpha/hpeldsp_alpha.o \
+                                          alpha/hpeldsp_alpha_asm.o
 OBJS-$(CONFIG_MPEGVIDEO)               += alpha/mpegvideo_alpha.o
diff --git a/libavcodec/alpha/dsputil_alpha.c b/libavcodec/alpha/dsputil_alpha.c
@@ -119,196 +119,11 @@ static void clear_blocks_axp(int16_t *blocks) {
     } while (n);
 }
 
-static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
-{
-    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
-}
-
-static inline uint64_t avg2(uint64_t a, uint64_t b)
-{
-    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
-}
-
-#if 0
-/* The XY2 routines basically utilize this scheme, but reuse parts in
-   each iteration. */
-static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
-{
-    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
-                    + (l2 & BYTE_VEC(0x03))
-                    + (l3 & BYTE_VEC(0x03))
-                    + (l4 & BYTE_VEC(0x03))
-                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
-    return r1 + r2;
-}
-#endif
-
-#define OP(LOAD, STORE)                         \
-    do {                                        \
-        STORE(LOAD(pixels), block);             \
-        pixels += line_size;                    \
-        block  += line_size;                    \
-    } while (--h)
-
-#define OP_X2(LOAD, STORE)                                  \
-    do {                                                    \
-        uint64_t pix1, pix2;                                \
-                                                            \
-        pix1    = LOAD(pixels);                             \
-        pix2    = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
-        STORE(AVG2(pix1, pix2), block);                     \
-        pixels += line_size;                                \
-        block  += line_size;                                \
-    } while (--h)
-
-#define OP_Y2(LOAD, STORE)                      \
-    do {                                        \
-        uint64_t pix = LOAD(pixels);            \
-        do {                                    \
-            uint64_t next_pix;                  \
-                                                \
-            pixels  += line_size;               \
-            next_pix = LOAD(pixels);            \
-            STORE(AVG2(pix, next_pix), block);  \
-            block += line_size;                 \
-            pix    = next_pix;                  \
-        } while (--h);                          \
-    } while (0)
-
-#define OP_XY2(LOAD, STORE)                                                 \
-    do {                                                                    \
-        uint64_t pix1  = LOAD(pixels);                                      \
-        uint64_t pix2  = pix1 >> 8 | ((uint64_t) pixels[8] << 56);          \
-        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
-                       + (pix2 & BYTE_VEC(0x03));                           \
-        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
-                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
-                                                                            \
-        do {                                                                \
-            uint64_t npix1, npix2;                                          \
-            uint64_t npix_l, npix_h;                                        \
-            uint64_t avg;                                                   \
-                                                                            \
-            pixels += line_size;                                            \
-            npix1   = LOAD(pixels);                                         \
-            npix2   = npix1 >> 8 | ((uint64_t) pixels[8] << 56);            \
-            npix_l  = (npix1 & BYTE_VEC(0x03))                              \
-                    + (npix2 & BYTE_VEC(0x03));                             \
-            npix_h  = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                      \
-                    + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                     \
-            avg     = (((pix_l + npix_l + AVG4_ROUNDER) >> 2)               \
-                       & BYTE_VEC(0x03))                                    \
-                    + pix_h + npix_h;                                       \
-            STORE(avg, block);                                              \
-                                                                            \
-            block += line_size;                                             \
-            pix_l  = npix_l;                                                \
-            pix_h  = npix_h;                                                \
-        } while (--h);                                                      \
-    } while (0)
-
-#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
-static void OPNAME ## _pixels ## SUFF ## _axp                               \
-        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
-         ptrdiff_t line_size, int h)                                        \
-{                                                                           \
-    if ((size_t) pixels & 0x7) {                                            \
-        OPKIND(uldq, STORE);                                                \
-    } else {                                                                \
-        OPKIND(ldq, STORE);                                                 \
-    }                                                                       \
-}                                                                           \
-                                                                            \
-static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
-        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
-         ptrdiff_t line_size, int h)                                        \
-{                                                                           \
-    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
-    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
-}
-
-#define PIXOP(OPNAME, STORE)                    \
-    MAKE_OP(OPNAME, ,     OP,     STORE)        \
-    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
-    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
-    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
-
-/* Rounding primitives. */
-#define AVG2 avg2
-#define AVG4 avg4
-#define AVG4_ROUNDER BYTE_VEC(0x02)
-#define STORE(l, b) stq(l, b)
-PIXOP(put, STORE);
-
-#undef STORE
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-PIXOP(avg, STORE);
-
-/* Not rounding primitives. */
-#undef AVG2
-#undef AVG4
-#undef AVG4_ROUNDER
-#undef STORE
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define AVG4_ROUNDER BYTE_VEC(0x01)
-#define STORE(l, b) stq(l, b)
-PIXOP(put_no_rnd, STORE);
-
-#undef STORE
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-PIXOP(avg_no_rnd, STORE);
-
-static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
-                                 ptrdiff_t line_size, int h)
-{
-    put_pixels_axp_asm(block,     pixels,     line_size, h);
-    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
-}
-
 av_cold void ff_dsputil_init_alpha(DSPContext *c, AVCodecContext *avctx)
 {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
     if (!high_bit_depth) {
-        c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
-        c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
-        c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
-        c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
-
-        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
-        c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
-        c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
-        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
-
-        c->avg_pixels_tab[0][0] = avg_pixels16_axp;
-        c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
-        c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
-        c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
-
-        c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
-        c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
-        c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
-        c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
-
-        c->put_pixels_tab[1][0] = put_pixels_axp_asm;
-        c->put_pixels_tab[1][1] = put_pixels_x2_axp;
-        c->put_pixels_tab[1][2] = put_pixels_y2_axp;
-        c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
-
-        c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
-        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
-        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
-        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
-
-        c->avg_pixels_tab[1][0] = avg_pixels_axp;
-        c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
-        c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
-        c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
-
         c->clear_blocks = clear_blocks_axp;
     }
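
Note on the averaging helpers being moved out of this file (they reappear in
hpeldsp_alpha.c below): they average eight packed pixels in one 64-bit word,
using the identity a + b == ((a & b) << 1) + (a ^ b). avg2_no_rnd() truncates
the per-byte mean, avg2() rounds it up, and masking a ^ b with BYTE_VEC(0xfe)
before the shift keeps each byte's low bit from bleeding into its neighbour.
A minimal standalone check of those identities, hypothetical test code and
not part of this patch:

    #include <assert.h>
    #include <stdint.h>

    #define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

    static uint64_t avg2(uint64_t a, uint64_t b)         /* rounds up  */
    {
        return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
    }

    static uint64_t avg2_no_rnd(uint64_t a, uint64_t b)  /* truncates  */
    {
        return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
    }

    int main(void)
    {
        uint64_t a = 0x00ff7f80dead8001ULL, b = 0xff01807f0123ff00ULL;
        for (int i = 0; i < 8; i++) {
            /* Compare each byte lane against the scalar definition. */
            unsigned pa = (a >> (8 * i)) & 0xff, pb = (b >> (8 * i)) & 0xff;
            assert(((avg2(a, b)        >> (8 * i)) & 0xff) == (pa + pb + 1) / 2);
            assert(((avg2_no_rnd(a, b) >> (8 * i)) & 0xff) == (pa + pb)     / 2);
        }
        return 0;
    }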
diff --git a/libavcodec/alpha/dsputil_alpha.h b/libavcodec/alpha/dsputil_alpha.h
@@ -26,8 +26,6 @@ void ff_simple_idct_axp(int16_t *block);
 void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block);
 void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block);
 
-void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
-                        ptrdiff_t line_size, int h);
 void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
                                 int line_size);
 void add_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
diff --git a/libavcodec/alpha/dsputil_alpha_asm.S b/libavcodec/alpha/dsputil_alpha_asm.S
@@ -26,114 +26,11 @@
 #include "regdef.h"
 
-/* Some nicer register names. */
-#define ta t10
-#define tb t11
-#define tc t12
-#define td AT
-/* Danger: these overlap with the argument list and the return value */
-#define te a5
-#define tf a4
-#define tg a3
-#define th v0
-
 .set noat
 .set noreorder
 .arch pca56
 .text
 
-/************************************************************************
- * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
- *                         int line_size, int h)
- */
-    .align 6
-    .globl put_pixels_axp_asm
-    .ent put_pixels_axp_asm
-put_pixels_axp_asm:
-    .frame sp, 0, ra
-    .prologue 0
-
-    and     a1, 7, t0
-    beq     t0, $aligned
-
-    .align 4
-$unaligned:
-    ldq_u   t0, 0(a1)
-    ldq_u   t1, 8(a1)
-    addq    a1, a2, a1
-    nop
-
-    ldq_u   t2, 0(a1)
-    ldq_u   t3, 8(a1)
-    addq    a1, a2, a1
-    nop
-
-    ldq_u   t4, 0(a1)
-    ldq_u   t5, 8(a1)
-    addq    a1, a2, a1
-    nop
-
-    ldq_u   t6, 0(a1)
-    ldq_u   t7, 8(a1)
-    extql   t0, a1, t0
-    addq    a1, a2, a1
-
-    extqh   t1, a1, t1
-    addq    a0, a2, t8
-    extql   t2, a1, t2
-    addq    t8, a2, t9
-
-    extqh   t3, a1, t3
-    addq    t9, a2, ta
-    extql   t4, a1, t4
-    or      t0, t1, t0
-
-    extqh   t5, a1, t5
-    or      t2, t3, t2
-    extql   t6, a1, t6
-    or      t4, t5, t4
-
-    extqh   t7, a1, t7
-    or      t6, t7, t6
-    stq     t0, 0(a0)
-    stq     t2, 0(t8)
-
-    stq     t4, 0(t9)
-    subq    a3, 4, a3
-    stq     t6, 0(ta)
-    addq    ta, a2, a0
-
-    bne     a3, $unaligned
-    ret
-
-    .align 4
-$aligned:
-    ldq     t0, 0(a1)
-    addq    a1, a2, a1
-    ldq     t1, 0(a1)
-    addq    a1, a2, a1
-
-    ldq     t2, 0(a1)
-    addq    a1, a2, a1
-    ldq     t3, 0(a1)
-
-    addq    a0, a2, t4
-    addq    a1, a2, a1
-    addq    t4, a2, t5
-    subq    a3, 4, a3
-
-    stq     t0, 0(a0)
-    addq    t5, a2, t6
-    stq     t1, 0(t4)
-    addq    t6, a2, a0
-
-    stq     t2, 0(t5)
-    stq     t3, 0(t6)
-    bne     a3, $aligned
-    ret
-    .end put_pixels_axp_asm
-
 /************************************************************************
  * void put_pixels_clamped_mvi_asm(const int16_t *block, uint8_t *pixels,
  *                                 int line_size)
diff --git a/libavcodec/alpha/hpeldsp_alpha.c b/libavcodec/alpha/hpeldsp_alpha.c
@@ -0,0 +1,213 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavcodec/hpeldsp.h"
+#include "hpeldsp_alpha.h"
+#include "asm.h"
+
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
+{
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+static inline uint64_t avg2(uint64_t a, uint64_t b)
+{
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
+}
+
+#if 0
+/* The XY2 routines basically utilize this scheme, but reuse parts in
+   each iteration. */
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
+{
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
+                    + (l2 & BYTE_VEC(0x03))
+                    + (l3 & BYTE_VEC(0x03))
+                    + (l4 & BYTE_VEC(0x03))
+                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    return r1 + r2;
+}
+#endif
+
+#define OP(LOAD, STORE)                         \
+    do {                                        \
+        STORE(LOAD(pixels), block);             \
+        pixels += line_size;                    \
+        block  += line_size;                    \
+    } while (--h)
+
+#define OP_X2(LOAD, STORE)                                  \
+    do {                                                    \
+        uint64_t pix1, pix2;                                \
+                                                            \
+        pix1    = LOAD(pixels);                             \
+        pix2    = pix1 >> 8 | ((uint64_t) pixels[8] << 56); \
+        STORE(AVG2(pix1, pix2), block);                     \
+        pixels += line_size;                                \
+        block  += line_size;                                \
+    } while (--h)
+
+#define OP_Y2(LOAD, STORE)                      \
+    do {                                        \
+        uint64_t pix = LOAD(pixels);            \
+        do {                                    \
+            uint64_t next_pix;                  \
+                                                \
+            pixels  += line_size;               \
+            next_pix = LOAD(pixels);            \
+            STORE(AVG2(pix, next_pix), block);  \
+            block += line_size;                 \
+            pix    = next_pix;                  \
+        } while (--h);                          \
+    } while (0)
+
+#define OP_XY2(LOAD, STORE)                                                 \
+    do {                                                                    \
+        uint64_t pix1  = LOAD(pixels);                                      \
+        uint64_t pix2  = pix1 >> 8 | ((uint64_t) pixels[8] << 56);          \
+        uint64_t pix_l = (pix1 & BYTE_VEC(0x03))                            \
+                       + (pix2 & BYTE_VEC(0x03));                           \
+        uint64_t pix_h = ((pix1 & ~BYTE_VEC(0x03)) >> 2)                    \
+                       + ((pix2 & ~BYTE_VEC(0x03)) >> 2);                   \
+                                                                            \
+        do {                                                                \
+            uint64_t npix1, npix2;                                          \
+            uint64_t npix_l, npix_h;                                        \
+            uint64_t avg;                                                   \
+                                                                            \
+            pixels += line_size;                                            \
+            npix1   = LOAD(pixels);                                         \
+            npix2   = npix1 >> 8 | ((uint64_t) pixels[8] << 56);            \
+            npix_l  = (npix1 & BYTE_VEC(0x03))                              \
+                    + (npix2 & BYTE_VEC(0x03));                             \
+            npix_h  = ((npix1 & ~BYTE_VEC(0x03)) >> 2)                      \
+                    + ((npix2 & ~BYTE_VEC(0x03)) >> 2);                     \
+            avg     = (((pix_l + npix_l + AVG4_ROUNDER) >> 2)               \
+                       & BYTE_VEC(0x03))                                    \
+                    + pix_h + npix_h;                                       \
+            STORE(avg, block);                                              \
+                                                                            \
+            block += line_size;                                             \
+            pix_l  = npix_l;                                                \
+            pix_h  = npix_h;                                                \
+        } while (--h);                                                      \
+    } while (0)
+
+#define MAKE_OP(OPNAME, SUFF, OPKIND, STORE)                                \
+static void OPNAME ## _pixels ## SUFF ## _axp                               \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    if ((size_t) pixels & 0x7) {                                            \
+        OPKIND(uldq, STORE);                                                \
+    } else {                                                                \
+        OPKIND(ldq, STORE);                                                 \
+    }                                                                       \
+}                                                                           \
+                                                                            \
+static void OPNAME ## _pixels16 ## SUFF ## _axp                             \
+        (uint8_t *restrict block, const uint8_t *restrict pixels,           \
+         ptrdiff_t line_size, int h)                                        \
+{                                                                           \
+    OPNAME ## _pixels ## SUFF ## _axp(block,     pixels,     line_size, h); \
+    OPNAME ## _pixels ## SUFF ## _axp(block + 8, pixels + 8, line_size, h); \
+}
+
+#define PIXOP(OPNAME, STORE)                    \
+    MAKE_OP(OPNAME, ,     OP,     STORE)        \
+    MAKE_OP(OPNAME, _x2,  OP_X2,  STORE)        \
+    MAKE_OP(OPNAME, _y2,  OP_Y2,  STORE)        \
+    MAKE_OP(OPNAME, _xy2, OP_XY2, STORE)
+
+/* Rounding primitives. */
+#define AVG2 avg2
+#define AVG4 avg4
+#define AVG4_ROUNDER BYTE_VEC(0x02)
+#define STORE(l, b) stq(l, b)
+PIXOP(put, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg, STORE);
+
+/* Not rounding primitives. */
+#undef AVG2
+#undef AVG4
+#undef AVG4_ROUNDER
+#undef STORE
+#define AVG2 avg2_no_rnd
+#define AVG4 avg4_no_rnd
+#define AVG4_ROUNDER BYTE_VEC(0x01)
+#define STORE(l, b) stq(l, b)
+PIXOP(put_no_rnd, STORE);
+
+#undef STORE
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(avg_no_rnd, STORE);
+
+static void put_pixels16_axp_asm(uint8_t *block, const uint8_t *pixels,
+                                 ptrdiff_t line_size, int h)
+{
+    put_pixels_axp_asm(block,     pixels,     line_size, h);
+    put_pixels_axp_asm(block + 8, pixels + 8, line_size, h);
+}
+
+av_cold void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags)
+{
+    c->put_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_pixels_tab[0][1] = put_pixels16_x2_axp;
+    c->put_pixels_tab[0][2] = put_pixels16_y2_axp;
+    c->put_pixels_tab[0][3] = put_pixels16_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[0][0] = put_pixels16_axp_asm;
+    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_axp;
+    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_axp;
+    c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_axp;
+
+    c->avg_pixels_tab[0][0] = avg_pixels16_axp;
+    c->avg_pixels_tab[0][1] = avg_pixels16_x2_axp;
+    c->avg_pixels_tab[0][2] = avg_pixels16_y2_axp;
+    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_axp;
+
+    c->avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels16_axp;
+    c->avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels16_x2_axp;
+    c->avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels16_y2_axp;
+    c->avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels16_xy2_axp;
+
+    c->put_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_pixels_tab[1][1] = put_pixels_x2_axp;
+    c->put_pixels_tab[1][2] = put_pixels_y2_axp;
+    c->put_pixels_tab[1][3] = put_pixels_xy2_axp;
+
+    c->put_no_rnd_pixels_tab[1][0] = put_pixels_axp_asm;
+    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels_x2_axp;
+    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels_y2_axp;
+    c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels_xy2_axp;
+
+    c->avg_pixels_tab[1][0] = avg_pixels_axp;
+    c->avg_pixels_tab[1][1] = avg_pixels_x2_axp;
+    c->avg_pixels_tab[1][2] = avg_pixels_y2_axp;
+    c->avg_pixels_tab[1][3] = avg_pixels_xy2_axp;
+}
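
Note on the new file's OP_XY2 macro: it is the disabled avg4() above with the
per-row work hoisted out of the loop. Each byte of the four-way average
(l1 + l2 + l3 + l4 + rounder) >> 2 is split into a carry-free high part (the
l >> 2 terms) and a 2-bit low part whose sum, at most 12 plus the rounder,
is folded back with one shift and mask; only the previous row's low/high
sums (pix_l/pix_h) are carried between iterations. A scalar, single-lane
spot check of that split, hypothetical test code and not part of the patch:

    #include <assert.h>

    int main(void)
    {
        /* Spot-check one 8-bit lane of the avg4() decomposition using
         * the rounding variant's AVG4_ROUNDER value of 2. */
        for (unsigned l1 = 0; l1 < 256; l1 += 3)
        for (unsigned l2 = 0; l2 < 256; l2 += 5)
        for (unsigned l3 = 0; l3 < 256; l3 += 7)
        for (unsigned l4 = 0; l4 < 256; l4 += 11) {
            unsigned hi = (l1 >> 2) + (l2 >> 2) + (l3 >> 2) + (l4 >> 2);
            unsigned lo = (((l1 & 3) + (l2 & 3) + (l3 & 3) +
                            (l4 & 3) + 2) >> 2) & 3;   /* sum <= 14, fits */
            assert(hi + lo == (l1 + l2 + l3 + l4 + 2) >> 2);
        }
        return 0;
    }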
diff --git a/libavcodec/alpha/hpeldsp_alpha.h b/libavcodec/alpha/hpeldsp_alpha.h
@@ -0,0 +1,28 @@
+/*
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVCODEC_ALPHA_HPELDSP_ALPHA_H
+#define AVCODEC_ALPHA_HPELDSP_ALPHA_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+                        ptrdiff_t line_size, int h);
+
+#endif /* AVCODEC_ALPHA_HPELDSP_ALPHA_H */
diff --git a/libavcodec/alpha/hpeldsp_alpha_asm.S b/libavcodec/alpha/hpeldsp_alpha_asm.S
@@ -0,0 +1,124 @@
+/*
+ * Alpha optimized DSP utils
+ * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+/*
+ * These functions are scheduled for pca56. They should work
+ * reasonably on ev6, though.
+ */
+
+#include "regdef.h"
+
+.set noat
+.set noreorder
+.arch pca56
+.text
+
+/************************************************************************
+ * void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
+ *                         int line_size, int h)
+ */
+    .align 6
+    .globl put_pixels_axp_asm
+    .ent put_pixels_axp_asm
+put_pixels_axp_asm:
+    .frame sp, 0, ra
+    .prologue 0
+
+    and     a1, 7, t0
+    beq     t0, $aligned
+
+    .align 4
+$unaligned:
+    ldq_u   t0, 0(a1)
+    ldq_u   t1, 8(a1)
+    addq    a1, a2, a1
+    nop
+
+    ldq_u   t2, 0(a1)
+    ldq_u   t3, 8(a1)
+    addq    a1, a2, a1
+    nop
+
+    ldq_u   t4, 0(a1)
+    ldq_u   t5, 8(a1)
+    addq    a1, a2, a1
+    nop
+
+    ldq_u   t6, 0(a1)
+    ldq_u   t7, 8(a1)
+    extql   t0, a1, t0
+    addq    a1, a2, a1
+
+    extqh   t1, a1, t1
+    addq    a0, a2, t8
+    extql   t2, a1, t2
+    addq    t8, a2, t9
+
+    extqh   t3, a1, t3
+    addq    t9, a2, ta
+    extql   t4, a1, t4
+    or      t0, t1, t0
+
+    extqh   t5, a1, t5
+    or      t2, t3, t2
+    extql   t6, a1, t6
+    or      t4, t5, t4
+
+    extqh   t7, a1, t7
+    or      t6, t7, t6
+    stq     t0, 0(a0)
+    stq     t2, 0(t8)
+
+    stq     t4, 0(t9)
+    subq    a3, 4, a3
+    stq     t6, 0(ta)
+    addq    ta, a2, a0
+
+    bne     a3, $unaligned
+    ret
+
+    .align 4
+$aligned:
+    ldq     t0, 0(a1)
+    addq    a1, a2, a1
+    ldq     t1, 0(a1)
+    addq    a1, a2, a1
+
+    ldq     t2, 0(a1)
+    addq    a1, a2, a1
+    ldq     t3, 0(a1)
+
+    addq    a0, a2, t4
+    addq    a1, a2, a1
+    addq    t4, a2, t5
+    subq    a3, 4, a3
+
+    stq     t0, 0(a0)
+    addq    t5, a2, t6
+    stq     t1, 0(t4)
+    addq    t6, a2, a0
+
+    stq     t2, 0(t5)
+    stq     t3, 0(t6)
+    bne     a3, $aligned
+    ret
+    .end put_pixels_axp_asm
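
Note on the $unaligned path: it never performs a misaligned access. Each row
is fetched with two aligned ldq_u loads, and extql/extqh shift the two
straddling quadwords by the low bits of the address so that or-ing them
reconstitutes the unaligned 64-bit value; the same primitive is what the
uldq macro stands for on the C side. The extracts here reuse a1 after it has
been advanced, which stays correct as long as line_size is a multiple of 8
so the byte offset never changes between rows. A little-endian C model of
the primitive, assuming both covering quadwords are readable as ldq_u
guarantees on Alpha (hypothetical sketch, not the patch's code):

    #include <stdint.h>
    #include <string.h>

    static uint64_t uldq_model(const uint8_t *p)
    {
        uintptr_t base  = (uintptr_t) p & ~(uintptr_t) 7;
        unsigned  shift = ((uintptr_t) p & 7) * 8;   /* bits, not bytes */
        uint64_t  lo, hi;

        memcpy(&lo, (const void *) base, 8);         /* ldq_u  0(p) */
        if (!shift)                                  /* aligned: one load */
            return lo;
        memcpy(&hi, (const void *) (base + 8), 8);   /* ldq_u  8(p) */
        return (lo >> shift) | (hi << (64 - shift)); /* extql | extqh */
    }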
diff --git a/libavcodec/alpha/regdef.h b/libavcodec/alpha/regdef.h
@@ -63,4 +63,15 @@
 #define sp $30 /* stack pointer */
 #define zero $31 /* reads as zero, writes are noops */
 
+/* Some nicer register names. */
+#define ta t10
+#define tb t11
+#define tc t12
+#define td AT
+/* Danger: these overlap with the argument list and the return value */
+#define te a5
+#define tf a4
+#define tg a3
+#define th v0
+
 #endif /* AVCODEC_ALPHA_REGDEF_H */
diff --git a/libavcodec/hpeldsp.c b/libavcodec/hpeldsp.c
@@ -54,6 +54,8 @@ av_cold void ff_hpeldsp_init(HpelDSPContext *c, int flags)
     hpel_funcs(avg, [3], 2);
     hpel_funcs(avg_no_rnd,, 16);
 
+    if (ARCH_ALPHA)
+        ff_hpeldsp_init_alpha(c, flags);
     if (ARCH_ARM)
         ff_hpeldsp_init_arm(c, flags);
     if (ARCH_BFIN)
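
Note on the dispatch style: the new call follows the file's existing
convention of plain if (ARCH_*) tests instead of #ifdef. The ARCH_* macros
come from the build configuration as literal 0 or 1, so the branch is a
compile-time constant: the untaken call is discarded, and the per-arch init
function need not be linked into builds for other architectures even though
the call site is always parsed and type-checked. A reduced model of the
idiom, with a hand-written stand-in for the generated value (hypothetical
example, not the project's real headers):

    #define ARCH_ALPHA 0   /* the generated config would define 0 or 1 */

    typedef struct HpelDSPContext HpelDSPContext;
    void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);

    void init_arch(HpelDSPContext *c, int flags)
    {
        if (ARCH_ALPHA)    /* constant-folded away when 0, so the    */
            ff_hpeldsp_init_alpha(c, flags); /* symbol is unreferenced */
    }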
diff --git a/libavcodec/hpeldsp.h b/libavcodec/hpeldsp.h
@@ -94,6 +94,7 @@ typedef struct HpelDSPContext {
 void ff_hpeldsp_init(HpelDSPContext *c, int flags);
 
+void ff_hpeldsp_init_alpha(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_arm(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_bfin(HpelDSPContext *c, int flags);
 void ff_hpeldsp_init_ppc(HpelDSPContext *c, int flags);