@@ -105,132 +105,137 @@ void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
 }
 #endif
 
 /* Average 8 unsigned bytes in parallel: (b1 + b2) >> 1
    Since the immediate result could be greater than 255, we do the
    shift first. The result is too low by one if the bytes were both
    odd, so we need to add (l1 & l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2_no_rnd(UINT64 l1, UINT64 l2)
+static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
 {
-    UINT64 correction = (l1 & l2) & BYTE_VEC(0x01);
-    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
-    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
-    return l1 + l2 + correction;
+    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
 }
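
The replacement one-liner uses the carry-save identity a + b = 2*(a & b) + (a ^ b), so per byte (a + b) >> 1 = (a & b) + ((a ^ b) >> 1); masking the XOR with BYTE_VEC(0xfe) before shifting keeps bit 0 of each byte lane from leaking into bit 7 of the lane below. A minimal standalone check of the lane-wise behaviour (the multiplicative BYTE_VEC is an assumption, modelled on how this file uses it):

#include <assert.h>
#include <stdint.h>

#define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

static uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

int main(void)
{
    /* Exhaustive check of one byte lane against (x + y) >> 1. */
    for (unsigned x = 0; x < 256; x++)
        for (unsigned y = 0; y < 256; y++)
            assert(avg2_no_rnd(x, y) == ((x + y) >> 1));
    /* Lanes stay independent even when every byte would carry. */
    assert(avg2_no_rnd(0xff00ff00ff00ff00ULL, 0x00ff00ff00ff00ffULL)
           == 0x7f7f7f7f7f7f7f7fULL);
    return 0;
}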
 
 /* Average 8 bytes with rounding: (b1 + b2 + 1) >> 1
    The '1' only has an effect when one byte is even and the other odd,
    i. e. we also need to add (l1 ^ l2) & BYTE_VEC(0x01).
    Incidentally, that is equivalent to (l1 | l2) & BYTE_VEC(0x01). */
-static inline UINT64 avg2(UINT64 l1, UINT64 l2)
+static inline uint64_t avg2(uint64_t a, uint64_t b)
 {
-    UINT64 correction = (l1 | l2) & BYTE_VEC(0x01);
-    l1 = (l1 & ~BYTE_VEC(0x01)) >> 1;
-    l2 = (l2 & ~BYTE_VEC(0x01)) >> 1;
-    return l1 + l2 + correction;
+    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
 }
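
For the rounding version the complementary identity a + b = 2*(a | b) - (a ^ b) applies, so per byte (a + b + 1) >> 1 = (a | b) - ((a ^ b) >> 1): OR overshoots the average by exactly half the XOR, which the masked shift subtracts back. One worked lane: a = 3, b = 4 gives (3 | 4) - ((3 ^ 4) >> 1) = 7 - 3 = 4, matching (3 + 4 + 1) >> 1.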
 
-static inline UINT64 avg4(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
 {
-    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
-                 + (l2 & BYTE_VEC(0x03))
-                 + (l3 & BYTE_VEC(0x03))
-                 + (l4 & BYTE_VEC(0x03))
-                 + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
+                   + (l2 & BYTE_VEC(0x03))
+                   + (l3 & BYTE_VEC(0x03))
+                   + (l4 & BYTE_VEC(0x03))
+                   + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
     return r1 + r2;
 }
 
-static inline UINT64 avg4_no_rnd(UINT64 l1, UINT64 l2, UINT64 l3, UINT64 l4)
+static inline uint64_t avg4_no_rnd(uint64_t l1, uint64_t l2,
+                                   uint64_t l3, uint64_t l4)
 {
-    UINT64 r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l2 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l3 & ~BYTE_VEC(0x03)) >> 2)
-              + ((l4 & ~BYTE_VEC(0x03)) >> 2);
-    UINT64 r2 = (( (l1 & BYTE_VEC(0x03))
-                 + (l2 & BYTE_VEC(0x03))
-                 + (l3 & BYTE_VEC(0x03))
-                 + (l4 & BYTE_VEC(0x03))
-                 + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
+    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
+                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
+    uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
+                   + (l2 & BYTE_VEC(0x03))
+                   + (l3 & BYTE_VEC(0x03))
+                   + (l4 & BYTE_VEC(0x03))
+                   + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
     return r1 + r2;
 }
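
Both four-way averages split every byte into a high part (the top six bits, which >> 2 divides exactly) and a two-bit tail. The tails of the four inputs plus the per-byte rounding constant sum to at most 4*3 + 2 = 14, so the additions cannot carry out of a byte lane; the tail sum is then shifted and masked back down to two bits. avg4 rounds with BYTE_VEC(0x02), i.e. (a + b + c + d + 2) >> 2 per byte, avg4_no_rnd with BYTE_VEC(0x01). A standalone single-lane check of the rounding variant (again assuming the multiplicative BYTE_VEC):

#include <assert.h>
#include <stdint.h>

#define BYTE_VEC(x) ((x) * 0x0101010101010101ULL)

static uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = (( (l1 & BYTE_VEC(0x03))
                   + (l2 & BYTE_VEC(0x03))
                   + (l3 & BYTE_VEC(0x03))
                   + (l4 & BYTE_VEC(0x03))
                   + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}

int main(void)
{
    /* Sampled single-lane check against (a + b + c + d + 2) >> 2. */
    for (unsigned a = 0; a < 256; a += 3)
        for (unsigned b = 0; b < 256; b += 5)
            for (unsigned c = 0; c < 256; c += 7)
                for (unsigned d = 0; d < 256; d += 11)
                    assert(avg4(a, b, c, d) == ((a + b + c + d + 2) >> 2));
    return 0;
}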
 
-#define PIXOPNAME(suffix) put ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP(LOAD, STORE, INCR)           \
+    do {                                \
+        STORE(LOAD(pixels), block);     \
+        pixels += line_size;            \
+        block += INCR;                  \
+    } while (--h)
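
OP is the body of the full-pel loop: LOAD fetches eight adjacent pixels, STORE either writes them out (put) or folds them into the destination (avg), and h and line_size come from the generated function's arguments. As a rough, portable sketch of what the put case expands to (the memcpy-based ldq/stq stand-ins are assumptions for illustration; the real Alpha stq requires an aligned address):

#include <stdint.h>
#include <string.h>

/* Portable stand-ins for the Alpha 8-byte load/store primitives. */
static uint64_t ldq(const void *p) { uint64_t v; memcpy(&v, p, 8); return v; }
static void stq(uint64_t v, void *p) { memcpy(p, &v, 8); }

/* Approximate expansion of MAKE_OP(uint8_t, put, , OP, STORE, line_size)
   on the aligned path. */
static void put_pixels_sketch(uint8_t *block, const uint8_t *pixels,
                              int line_size, int h)
{
    do {
        stq(ldq(pixels), block);   /* STORE(LOAD(pixels), block) */
        pixels += line_size;
        block += line_size;        /* INCR == line_size */
    } while (--h);
}

int main(void)
{
    uint8_t src[64], dst[64];
    for (int i = 0; i < 64; i++)
        src[i] = (uint8_t) i;
    put_pixels_sketch(dst, src, 8, 8);  /* copy one 8x8 block */
    return 0;
}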
 
-#define PIXOPNAME(suffix) put_no_rnd ## suffix
-#define BTYPE UINT8
-#define AVG2 avg2_no_rnd
-#define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(l, b)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define OP_X2(LOAD, STORE, INCR)                            \
+    do {                                                    \
+        uint64_t pix1, pix2;                                \
+                                                            \
+        pix1 = LOAD(pixels);                                \
+        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);    \
+        STORE(AVG2(pix1, pix2), block);                     \
+        pixels += line_size;                                \
+        block += INCR;                                      \
+    } while (--h)
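
OP_X2 builds the right-neighbour vector without a second 8-byte load: Alpha is little-endian, so pix1 holds pixels[0..7] with pixels[0] in the low byte; pix1 >> 8 discards pixels[0], and OR-ing in (uint64_t) pixels[8] << 56 tops the vector up with the ninth pixel. AVG2(pix1, pix2) then averages every pixel with its right neighbour, which is the horizontal half-pel case.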
 
-/* The following functions are untested. */
-#if 0
+#define OP_Y2(LOAD, STORE, INCR)                    \
+    do {                                            \
+        uint64_t pix = LOAD(pixels);                \
+        do {                                        \
+            uint64_t next_pix;                      \
+                                                    \
+            pixels += line_size;                    \
+            next_pix = LOAD(pixels);                \
+            STORE(AVG2(pix, next_pix), block);      \
+            block += INCR;                          \
+            pix = next_pix;                         \
+        } while (--h);                              \
+    } while (0)
 
+#define OP_XY2(LOAD, STORE, INCR)                                       \
+    do {                                                                \
+        uint64_t pix1 = LOAD(pixels);                                   \
+        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);       \
+                                                                        \
+        do {                                                            \
+            uint64_t next_pix1, next_pix2;                              \
+                                                                        \
+            pixels += line_size;                                        \
+            next_pix1 = LOAD(pixels);                                   \
+            next_pix2 = next_pix1 >> 8 | ((uint64_t) pixels[8] << 56);  \
+                                                                        \
+            STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);       \
+                                                                        \
+            block += INCR;                                              \
+            pix1 = next_pix1;                                           \
+            pix2 = next_pix2;                                           \
+        } while (--h);                                                  \
+    } while (0)
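
OP_Y2 and OP_XY2 carry the previously loaded row across iterations (pix, or the pix1/pix2 pair), so each output line of the vertical and diagonal half-pel loops pays for only one new row of loads; the diagonal case feeds both rows plus their shifted neighbours into the four-way average.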
 
-#define PIXOPNAME(suffix) avg ## suffix
-#define BTYPE UINT8
+#define MAKE_OP(BTYPE, OPNAME, SUFF, OPKIND, STORE, INCR)               \
+static void OPNAME ## _pixels ## SUFF ## _axp(BTYPE *block,             \
+                                              const uint8_t *pixels,    \
+                                              int line_size, int h)     \
+{                                                                       \
+    if ((size_t) pixels & 0x7) {                                        \
+        OPKIND(uldq, STORE, INCR);                                      \
+    } else {                                                            \
+        OPKIND(ldq, STORE, INCR);                                       \
+    }                                                                   \
+}
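
MAKE_OP emits one function per (operation, half-pel mode) pair and tests source alignment once per call: ldq is the aligned 8-byte load, uldq its unaligned counterpart. The destination side always goes through plain stq in STORE, so block is evidently assumed to be 8-byte aligned; only pixels may be misaligned.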
+
+#define PIXOP(BTYPE, OPNAME, STORE, INCR)                   \
+    MAKE_OP(BTYPE, OPNAME, ,     OP,     STORE, INCR);      \
+    MAKE_OP(BTYPE, OPNAME, _x2,  OP_X2,  STORE, INCR);      \
+    MAKE_OP(BTYPE, OPNAME, _y2,  OP_Y2,  STORE, INCR);      \
+    MAKE_OP(BTYPE, OPNAME, _xy2, OP_XY2, STORE, INCR);
+
+/* Rounding primitives. */
 #define AVG2 avg2
 #define AVG4 avg4
+#define STORE(l, b) stq(l, b)
+PIXOP(uint8_t, put, STORE, line_size);
+
+#undef STORE
 #define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
+PIXOP(uint8_t, avg, STORE, line_size);
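
The STORE binding selects the operation: stq(l, b) overwrites the destination (the put flavours), while stq(AVG2(l, ldq(b)), b) reads the destination back, averages it with the incoming line and rewrites it (the avg flavours). Rebinding AVG2/AVG4 to the _no_rnd primitives below then reuses the same generated loops for the no-rounding variants.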
+
+/* Not rounding primitives. */
 #undef AVG2
 #undef AVG4
 #undef STORE
-
-#define PIXOPNAME(suffix) avg_no_rnd ## suffix
-#define BTYPE UINT8
 #define AVG2 avg2_no_rnd
 #define AVG4 avg4_no_rnd
-#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
-#undef STORE
+#define STORE(l, b) stq(l, b)
+PIXOP(uint8_t, put_no_rnd, STORE, line_size);
 
-#define PIXOPNAME(suffix) sub ## suffix
-#define BTYPE DCTELEM
-#define AVG2 avg2
-#define AVG4 avg4
-#define STORE(l, block) do {            \
-    UINT64 xxx = l;                     \
-    (block)[0] -= (xxx >> 0) & 0xff;    \
-    (block)[1] -= (xxx >> 8) & 0xff;    \
-    (block)[2] -= (xxx >> 16) & 0xff;   \
-    (block)[3] -= (xxx >> 24) & 0xff;   \
-    (block)[4] -= (xxx >> 32) & 0xff;   \
-    (block)[5] -= (xxx >> 40) & 0xff;   \
-    (block)[6] -= (xxx >> 48) & 0xff;   \
-    (block)[7] -= (xxx >> 56) & 0xff;   \
-} while (0)
-#include "pixops.h"
-#undef PIXOPNAME
-#undef BTYPE
-#undef AVG2
-#undef AVG4
 #undef STORE
-
-#endif
+#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
+PIXOP(uint8_t, avg_no_rnd, STORE, line_size);
 
 void dsputil_init_alpha(void)
 {
@@ -244,6 +249,16 @@ void dsputil_init_alpha(void)
     put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
     put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;
 
+    avg_pixels_tab[0] = avg_pixels_axp;
+    avg_pixels_tab[1] = avg_pixels_x2_axp;
+    avg_pixels_tab[2] = avg_pixels_y2_axp;
+    avg_pixels_tab[3] = avg_pixels_xy2_axp;
+
+    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
+    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
+    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
+    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;
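
(The four slots of each table follow the usual half-pel indexing: [0] full-pel, [1] horizontal half-pel (_x2), [2] vertical (_y2), [3] both (_xy2). Registering the avg and avg_no_rnd tables is the new part of this hunk; previously those routines existed only inside the untested #if 0 block deleted above.)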
+
     /* amask clears all bits that correspond to present features. */
     if (amask(AMASK_MVI) == 0) {
         put_pixels_clamped = put_pixels_clamped_mvi_asm;
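
(amask is the Alpha feature-test instruction: it returns the bits of its argument that correspond to extensions the CPU does not implement, so amask(AMASK_MVI) == 0 means the MVI motion-video extension is present and the hand-written MVI version can be installed.)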