|
|
@@ -27,202 +27,163 @@ |
|
|
|
#define ALIGNMENT 16 |
|
|
|
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1))) |
|
|
|
|
|
|
|
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc)) |
|
|
|
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc)) |
|
|
|
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc)) |
|
|
|
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__) |
|
|
|
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
|
|
|
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
|
|
|
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__) |
|
|
|
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
|
|
|
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__) |
|
|
|
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__) |
|
|
|
#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc)) |
|
|
|
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__) |
|
|
|
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__) |
|
|
|
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) |
|
|
|
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__) |
|
|
|
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__) |
|
|
|
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__) |
|
|
|
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__) |
|
|
|
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__) |
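/* Usage sketch (illustrative, not part of the patch): LD_V/ST_V are plain
 * vector loads/stores parameterised by the MSA vector type, and the per-type
 * wrappers above only fix RTYPE.  Assumes <msa.h> and the v16u8 typedef used
 * throughout this header; the function name is hypothetical. */
static inline void copy_one_vector_example(uint8_t *src, uint8_t *dst)
{
    v16u8 vec;

    vec = LD_UB(src);   /* expands to LD_V(v16u8, src)      */
    ST_UB(vec, dst);    /* expands to ST_V(v16u8, vec, dst) */
}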
|
|
|
|
|
|
|
#if (__mips_isa_rev >= 6) |
|
|
|
#define LW(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val_m; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"lw %[val_m], %[psrc_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_m] "=r" (val_m) \ |
|
|
|
: [psrc_m] "m" (*psrc_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_m; \ |
|
|
|
#define LH(psrc) \ |
|
|
|
( { \ |
|
|
|
uint16_t val_lh_m = *(uint16_t *)(psrc); \ |
|
|
|
val_lh_m; \ |
|
|
|
} ) |
|
|
|
|
|
|
|
#define LW(psrc) \ |
|
|
|
( { \ |
|
|
|
uint32_t val_lw_m = *(uint32_t *)(psrc); \ |
|
|
|
val_lw_m; \ |
|
|
|
} ) |
|
|
|
|
|
|
|
#if (__mips == 64) |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_m = (uint8_t *) (psrc); \ |
|
|
|
uint64_t val_m = 0; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"ld %[val_m], %[psrc_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_m] "=r" (val_m) \ |
|
|
|
: [psrc_m] "m" (*psrc_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_m; \ |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint64_t val_ld_m = *(uint64_t *)(psrc); \ |
|
|
|
val_ld_m; \ |
|
|
|
} ) |
|
|
|
#else // !(__mips == 64) |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val0_m, val1_m; \ |
|
|
|
uint64_t val_m = 0; \ |
|
|
|
\ |
|
|
|
val0_m = LW(psrc_ld_m); \ |
|
|
|
val1_m = LW(psrc_ld_m + 4); \ |
|
|
|
\ |
|
|
|
val_m = (uint64_t) (val1_m); \ |
|
|
|
val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \ |
|
|
|
val_m = (uint64_t) (val_m | (uint64_t) val0_m); \ |
|
|
|
\ |
|
|
|
val_m; \ |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val0_ld_m, val1_ld_m; \ |
|
|
|
uint64_t val_ld_m = 0; \ |
|
|
|
\ |
|
|
|
val0_ld_m = LW(psrc_ld_m); \ |
|
|
|
val1_ld_m = LW(psrc_ld_m + 4); \ |
|
|
|
\ |
|
|
|
val_ld_m = (uint64_t) (val1_ld_m); \ |
|
|
|
val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \ |
|
|
|
val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \ |
|
|
|
\ |
|
|
|
val_ld_m; \ |
|
|
|
} ) |
|
|
|
#endif // (__mips == 64) |
|
|
|
|
|
|
|
#define SH(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_m = (uint8_t *) (pdst); \ |
|
|
|
uint16_t val_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"sh %[val_m], %[pdst_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_m] "=m" (*pdst_m) \ |
|
|
|
: [val_m] "r" (val_m) \ |
|
|
|
); \ |
|
|
|
} |
|
|
|
|
|
|
|
#define SW(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_m = (uint8_t *) (pdst); \ |
|
|
|
uint32_t val_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"sw %[val_m], %[pdst_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_m] "=m" (*pdst_m) \ |
|
|
|
: [val_m] "r" (val_m) \ |
|
|
|
); \ |
|
|
|
} |
|
|
|
#define SH(val, pdst) *(uint16_t *)(pdst) = (val); |
|
|
|
#define SW(val, pdst) *(uint32_t *)(pdst) = (val); |
|
|
|
#define SD(val, pdst) *(uint64_t *)(pdst) = (val); |
|
|
|
|
|
|
|
#define SD(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_m = (uint8_t *) (pdst); \ |
|
|
|
uint64_t val_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"sd %[val_m], %[pdst_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_m] "=m" (*pdst_m) \ |
|
|
|
: [val_m] "r" (val_m) \ |
|
|
|
); \ |
|
|
|
} |
|
|
|
#else // !(__mips_isa_rev >= 6) |
|
|
|
#define LW(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val_m; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"ulw %[val_m], %[psrc_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_m] "=r" (val_m) \ |
|
|
|
: [psrc_m] "m" (*psrc_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_m; \ |
|
|
|
#define LH(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_lh_m = (uint8_t *) (psrc); \ |
|
|
|
uint16_t val_lh_m; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"ulh %[val_lh_m], %[psrc_lh_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_lh_m] "=r" (val_lh_m) \ |
|
|
|
: [psrc_lh_m] "m" (*psrc_lh_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_lh_m; \ |
|
|
|
} ) |
|
|
|
|
|
|
|
#define LW(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_lw_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val_lw_m; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"ulw %[val_lw_m], %[psrc_lw_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_lw_m] "=r" (val_lw_m) \ |
|
|
|
: [psrc_lw_m] "m" (*psrc_lw_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_lw_m; \ |
|
|
|
} ) |
|
|
|
|
|
|
|
#if (__mips == 64) |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_m = (uint8_t *) (psrc); \ |
|
|
|
uint64_t val_m = 0; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"uld %[val_m], %[psrc_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_m] "=r" (val_m) \ |
|
|
|
: [psrc_m] "m" (*psrc_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_m; \ |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ |
|
|
|
uint64_t val_ld_m = 0; \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"uld %[val_ld_m], %[psrc_ld_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [val_ld_m] "=r" (val_ld_m) \ |
|
|
|
: [psrc_ld_m] "m" (*psrc_ld_m) \ |
|
|
|
); \ |
|
|
|
\ |
|
|
|
val_ld_m; \ |
|
|
|
} ) |
|
|
|
#else // !(__mips == 64) |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val0_m, val1_m; \ |
|
|
|
uint64_t val_m = 0; \ |
|
|
|
\ |
|
|
|
val0_m = LW(psrc_ld_m); \ |
|
|
|
val1_m = LW(psrc_ld_m + 4); \ |
|
|
|
\ |
|
|
|
val_m = (uint64_t) (val1_m); \ |
|
|
|
val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \ |
|
|
|
val_m = (uint64_t) (val_m | (uint64_t) val0_m); \ |
|
|
|
\ |
|
|
|
val_m; \ |
|
|
|
#define LD(psrc) \ |
|
|
|
( { \ |
|
|
|
uint8_t *psrc_ld_m = (uint8_t *) (psrc); \ |
|
|
|
uint32_t val0_ld_m, val1_ld_m; \ |
|
|
|
uint64_t val_ld_m = 0; \ |
|
|
|
\ |
|
|
|
val0_ld_m = LW(psrc_ld_m); \ |
|
|
|
val1_ld_m = LW(psrc_ld_m + 4); \ |
|
|
|
\ |
|
|
|
val_ld_m = (uint64_t) (val1_ld_m); \ |
|
|
|
val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \ |
|
|
|
val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \ |
|
|
|
\ |
|
|
|
val_ld_m; \ |
|
|
|
} ) |
|
|
|
#endif // (__mips == 64) |
|
|
|
|
|
|
|
#define SH(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_m = (uint8_t *) (pdst); \ |
|
|
|
uint16_t val_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"ush %[val_m], %[pdst_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_m] "=m" (*pdst_m) \ |
|
|
|
: [val_m] "r" (val_m) \ |
|
|
|
); \ |
|
|
|
#define SH(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_sh_m = (uint8_t *) (pdst); \ |
|
|
|
uint16_t val_sh_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"ush %[val_sh_m], %[pdst_sh_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_sh_m] "=m" (*pdst_sh_m) \ |
|
|
|
: [val_sh_m] "r" (val_sh_m) \ |
|
|
|
); \ |
|
|
|
} |
|
|
|
|
|
|
|
#define SW(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_m = (uint8_t *) (pdst); \ |
|
|
|
uint32_t val_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"usw %[val_m], %[pdst_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_m] "=m" (*pdst_m) \ |
|
|
|
: [val_m] "r" (val_m) \ |
|
|
|
); \ |
|
|
|
#define SW(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_sw_m = (uint8_t *) (pdst); \ |
|
|
|
uint32_t val_sw_m = (val); \ |
|
|
|
\ |
|
|
|
__asm__ volatile ( \ |
|
|
|
"usw %[val_sw_m], %[pdst_sw_m] \n\t" \ |
|
|
|
\ |
|
|
|
: [pdst_sw_m] "=m" (*pdst_sw_m) \ |
|
|
|
: [val_sw_m] "r" (val_sw_m) \ |
|
|
|
); \ |
|
|
|
} |
|
|
|
|
|
|
|
#define SD(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_m1 = (uint8_t *) (pdst); \ |
|
|
|
uint32_t val0_m, val1_m; \ |
|
|
|
\ |
|
|
|
val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ |
|
|
|
val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ |
|
|
|
\ |
|
|
|
SW(val0_m, pdst_m1); \ |
|
|
|
SW(val1_m, pdst_m1 + 4); \ |
|
|
|
#define SD(val, pdst) \ |
|
|
|
{ \ |
|
|
|
uint8_t *pdst_sd_m = (uint8_t *) (pdst); \ |
|
|
|
uint32_t val0_sd_m, val1_sd_m; \ |
|
|
|
\ |
|
|
|
val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \ |
|
|
|
val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \ |
|
|
|
\ |
|
|
|
SW(val0_sd_m, pdst_sd_m); \ |
|
|
|
SW(val1_sd_m, pdst_sd_m + 4); \ |
|
|
|
} |
|
|
|
#endif // (__mips_isa_rev >= 6) |
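/* Reference sketch (illustrative, not part of the patch): on a pre-R6 32-bit
 * build the 64-bit LD() above is composed of two unaligned 32-bit loads, with
 * the word at psrc forming bits 31..0 and the word at psrc + 4 forming bits
 * 63..32 (i.e. a little-endian layout is assumed). */
static inline uint64_t ld64_reference_example(uint8_t *psrc)
{
    uint64_t lo = LW(psrc);      /* bits 31..0  */
    uint64_t hi = LW(psrc + 4);  /* bits 63..32 */

    return (hi << 32) | lo;      /* same combine as LD(psrc) */
}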
|
|
|
|
|
|
@@ -291,122 +252,91 @@ |
|
|
|
SD(in3, (pdst) + 3 * stride); \ |
|
|
|
} |
|
|
|
|
|
|
|
/* Description : Load vectors with 16 byte elements with stride |
|
|
|
/* Description : Load vector elements with stride |
|
|
|
Arguments : Inputs - psrc (source pointer to load from) |
|
|
|
- stride |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Loads 16 byte elements in 'out0' from (psrc) |
|
|
|
Loads 16 byte elements in 'out1' from (psrc + stride) |
|
|
|
Details : Loads elements in 'out0' from (psrc) |
|
|
|
Loads elements in 'out1' from (psrc + stride) |
|
|
|
*/ |
|
|
|
#define LD_B2(RTYPE, psrc, stride, out0, out1) \ |
|
|
|
#define LD_V2(RTYPE, psrc, stride, out0, out1) \ |
|
|
|
{ \ |
|
|
|
out0 = LD_B(RTYPE, (psrc)); \ |
|
|
|
out1 = LD_B(RTYPE, (psrc) + stride); \ |
|
|
|
out0 = LD_V(RTYPE, (psrc)); \ |
|
|
|
out1 = LD_V(RTYPE, (psrc) + stride); \ |
|
|
|
} |
|
|
|
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__) |
|
|
|
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_B3(RTYPE, psrc, stride, out0, out1, out2) \ |
|
|
|
#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \ |
|
|
|
{ \ |
|
|
|
LD_B2(RTYPE, (psrc), stride, out0, out1); \ |
|
|
|
out2 = LD_B(RTYPE, (psrc) + 2 * stride); \ |
|
|
|
LD_V2(RTYPE, (psrc), stride, out0, out1); \ |
|
|
|
out2 = LD_V(RTYPE, (psrc) + 2 * stride); \ |
|
|
|
} |
|
|
|
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) \ |
|
|
|
#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \ |
|
|
|
{ \ |
|
|
|
LD_B2(RTYPE, (psrc), stride, out0, out1); \ |
|
|
|
LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \ |
|
|
|
LD_V2(RTYPE, (psrc), stride, out0, out1); \ |
|
|
|
LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
|
|
|
} |
|
|
|
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ |
|
|
|
#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \ |
|
|
|
{ \ |
|
|
|
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
out4 = LD_B(RTYPE, (psrc) + 4 * stride); \ |
|
|
|
LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
out4 = LD_V(RTYPE, (psrc) + 4 * stride); \ |
|
|
|
} |
|
|
|
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ |
|
|
|
#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ |
|
|
|
{ \ |
|
|
|
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \ |
|
|
|
LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \ |
|
|
|
} |
|
|
|
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_B7(RTYPE, psrc, stride, \ |
|
|
|
#define LD_V7(RTYPE, psrc, stride, \ |
|
|
|
out0, out1, out2, out3, out4, out5, out6) \ |
|
|
|
{ \ |
|
|
|
LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ |
|
|
|
LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ |
|
|
|
LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \ |
|
|
|
LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \ |
|
|
|
} |
|
|
|
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_B8(RTYPE, psrc, stride, \ |
|
|
|
#define LD_V8(RTYPE, psrc, stride, \ |
|
|
|
out0, out1, out2, out3, out4, out5, out6, out7) \ |
|
|
|
{ \ |
|
|
|
LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
|
|
|
} |
|
|
|
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
/* Description : Load vectors with 8 halfword elements with stride |
|
|
|
Arguments : Inputs - psrc (source pointer to load from) |
|
|
|
- stride |
|
|
|
Outputs - out0, out1 |
|
|
|
Details : Loads 8 halfword elements in 'out0' from (psrc) |
|
|
|
Loads 8 halfword elements in 'out1' from (psrc + stride) |
|
|
|
*/ |
|
|
|
#define LD_H2(RTYPE, psrc, stride, out0, out1) \ |
|
|
|
{ \ |
|
|
|
out0 = LD_H(RTYPE, (psrc)); \ |
|
|
|
out1 = LD_H(RTYPE, (psrc) + (stride)); \ |
|
|
|
} |
|
|
|
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) \ |
|
|
|
{ \ |
|
|
|
LD_H2(RTYPE, (psrc), stride, out0, out1); \ |
|
|
|
LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \ |
|
|
|
} |
|
|
|
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \ |
|
|
|
{ \ |
|
|
|
LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \ |
|
|
|
LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
|
|
|
} |
|
|
|
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__) |
|
|
|
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__) |
|
|
|
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__) |
|
|
|
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_H8(RTYPE, psrc, stride, \ |
|
|
|
out0, out1, out2, out3, out4, out5, out6, out7) \ |
|
|
|
{ \ |
|
|
|
LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3); \ |
|
|
|
LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \ |
|
|
|
} |
|
|
|
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__) |
|
|
|
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define LD_H16(RTYPE, psrc, stride, \ |
|
|
|
#define LD_V16(RTYPE, psrc, stride, \ |
|
|
|
out0, out1, out2, out3, out4, out5, out6, out7, \ |
|
|
|
out8, out9, out10, out11, out12, out13, out14, out15) \ |
|
|
|
{ \ |
|
|
|
LD_H8(RTYPE, (psrc), stride, \ |
|
|
|
LD_V8(RTYPE, (psrc), stride, \ |
|
|
|
out0, out1, out2, out3, out4, out5, out6, out7); \ |
|
|
|
LD_H8(RTYPE, (psrc) + 8 * stride, stride, \ |
|
|
|
LD_V8(RTYPE, (psrc) + 8 * stride, stride, \ |
|
|
|
out8, out9, out10, out11, out12, out13, out14, out15); \ |
|
|
|
} |
|
|
|
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__) |
|
|
|
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__) |
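/* Usage sketch (illustrative, not part of the patch): gathering a 16-row
 * block of signed halfwords from a strided buffer with the unified macro.
 * 'coeffs', 'stride' and the function name are hypothetical; stride counts
 * int16_t elements because an int16_t pointer is passed in. */
static inline void load_16_rows_example(int16_t *coeffs, int32_t stride,
                                        v8i16 out[16])
{
    LD_SH16(coeffs, stride,
            out[0], out[1], out[2],  out[3],  out[4],  out[5],  out[6],  out[7],
            out[8], out[9], out[10], out[11], out[12], out[13], out[14], out[15]);
}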
|
|
|
|
|
|
|
/* Description : Load as 4x4 block of signed halfword elements from 1D source |
|
|
|
data into 4 vectors (Each vector with 4 signed halfwords) |
|
|
@@ -421,103 +351,48 @@ |
|
|
|
out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2); \ |
|
|
|
} |
|
|
|
|
|
|
|
/* Description : Load 2 vectors of signed word elements with stride |
|
|
|
Arguments : Inputs - psrc (source pointer to load from) |
|
|
|
- stride |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - signed word |
|
|
|
*/ |
|
|
|
#define LD_SW2(psrc, stride, out0, out1) \ |
|
|
|
{ \ |
|
|
|
out0 = LD_SW((psrc)); \ |
|
|
|
out1 = LD_SW((psrc) + stride); \ |
|
|
|
} |
|
|
|
|
|
|
|
/* Description : Store vectors of 16 byte elements with stride |
|
|
|
Arguments : Inputs - in0, in1, stride |
|
|
|
Outputs - pdst (destination pointer to store to) |
|
|
|
Details : Stores 16 byte elements from 'in0' to (pdst) |
|
|
|
Stores 16 byte elements from 'in1' to (pdst + stride) |
|
|
|
*/ |
|
|
|
#define ST_B2(RTYPE, in0, in1, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_B(RTYPE, in0, (pdst)); \ |
|
|
|
ST_B(RTYPE, in1, (pdst) + stride); \ |
|
|
|
} |
|
|
|
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_B2(RTYPE, in0, in1, (pdst), stride); \ |
|
|
|
ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
|
|
|
} |
|
|
|
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
|
|
|
pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride); \ |
|
|
|
ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
|
|
|
} |
|
|
|
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__) |
|
|
|
|
|
|
|
/* Description : Store vectors of 8 halfword elements with stride |
|
|
|
/* Description : Store vectors with stride |
|
|
|
Arguments : Inputs - in0, in1, stride |
|
|
|
Outputs - pdst (destination pointer to store to) |
|
|
|
Details : Stores 8 halfword elements from 'in0' to (pdst) |
|
|
|
Stores 8 halfword elements from 'in1' to (pdst + stride) |
|
|
|
Details : Stores elements from 'in0' to (pdst) |
|
|
|
Stores elements from 'in1' to (pdst + stride) |
|
|
|
*/ |
|
|
|
#define ST_H2(RTYPE, in0, in1, pdst, stride) \ |
|
|
|
#define ST_V2(RTYPE, in0, in1, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_H(RTYPE, in0, (pdst)); \ |
|
|
|
ST_H(RTYPE, in1, (pdst) + stride); \ |
|
|
|
ST_V(RTYPE, in0, (pdst)); \ |
|
|
|
ST_V(RTYPE, in1, (pdst) + stride); \ |
|
|
|
} |
|
|
|
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__) |
|
|
|
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__) |
|
|
|
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__) |
|
|
|
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__) |
|
|
|
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__) |
|
|
|
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) \ |
|
|
|
#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_H2(RTYPE, in0, in1, (pdst), stride); \ |
|
|
|
ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
|
|
|
ST_V2(RTYPE, in0, in1, (pdst), stride); \ |
|
|
|
ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \ |
|
|
|
} |
|
|
|
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__) |
|
|
|
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__) |
|
|
|
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__) |
|
|
|
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ |
|
|
|
#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
|
|
|
ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \ |
|
|
|
ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
|
|
|
ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \ |
|
|
|
} |
|
|
|
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__) |
|
|
|
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ |
|
|
|
#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
|
|
|
ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
|
|
|
} |
|
|
|
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__) |
|
|
|
|
|
|
|
/* Description : Store vectors of word elements with stride |
|
|
|
Arguments : Inputs - in0, in1, stride |
|
|
|
Outputs - pdst (destination pointer to store to) |
|
|
|
Return Type - signed word |
|
|
|
Details : Stores 4 word elements from 'in0' to (pdst) |
|
|
|
Stores 4 word elements from 'in1' to (pdst + stride) |
|
|
|
*/ |
|
|
|
#define ST_SW2(in0, in1, pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_SW(in0, (pdst)); \ |
|
|
|
ST_SW(in1, (pdst) + stride); \ |
|
|
|
} |
|
|
|
#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7, \ |
|
|
|
pdst, stride) \ |
|
|
|
{ \ |
|
|
|
ST_SW2(in0, in1, (pdst), stride); \ |
|
|
|
ST_SW2(in2, in3, (pdst) + 2 * stride, stride); \ |
|
|
|
ST_SW2(in4, in5, (pdst) + 4 * stride, stride); \ |
|
|
|
ST_SW2(in6, in7, (pdst) + 6 * stride, stride); \ |
|
|
|
ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \ |
|
|
|
ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ |
|
|
|
} |
|
|
|
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__) |
|
|
|
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__) |
|
|
|
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__) |
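/* Usage sketch (illustrative, not part of the patch): copying an 8x16 byte
 * block between two strided buffers by pairing the unified load and store
 * wrappers.  Names are hypothetical; both strides are in bytes. */
static inline void copy_8x16_block_example(uint8_t *src, int32_t src_stride,
                                           uint8_t *dst, int32_t dst_stride)
{
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

    LD_UB8(src, src_stride, row0, row1, row2, row3, row4, row5, row6, row7);
    ST_UB8(row0, row1, row2, row3, row4, row5, row6, row7, dst, dst_stride);
}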
|
|
|
|
|
|
|
/* Description : Store as 2x4 byte block to destination memory from input vector |
|
|
|
Arguments : Inputs - in, stidx, pdst, stride |
|
|
@@ -776,7 +651,7 @@ |
|
|
|
/* Description : average with rounding (in0 + in1 + 1) / 2. |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - signed byte |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Each byte element from 'in0' vector is added with each byte |
|
|
|
element from 'in1' vector. The addition of the elements plus 1 |
|
|
|
(for rounding) is done unsigned with full precision, |
|
|
@@ -941,7 +816,7 @@ |
|
|
|
Arguments : Inputs - mult0, mult1 |
|
|
|
cnst0, cnst1 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Unsigned byte elements from mult0 are multiplied with |
|
|
|
unsigned byte elements from cnst0 producing a result |
|
|
|
twice the size of input i.e. unsigned halfword. |
|
|
@@ -969,7 +844,7 @@ |
|
|
|
Arguments : Inputs - mult0, mult1 |
|
|
|
cnst0, cnst1 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - signed halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Signed byte elements from mult0 are multiplied with |
|
|
|
signed byte elements from cnst0 producing a result |
|
|
|
twice the size of input i.e. signed halfword. |
|
|
@@ -1004,7 +879,7 @@ |
|
|
|
Arguments : Inputs - mult0, mult1 |
|
|
|
cnst0, cnst1 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - signed word |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Signed halfword elements from mult0 are multiplied with |
|
|
|
signed halfword elements from cnst0 producing a result |
|
|
|
twice the size of input i.e. signed word. |
|
|
@@ -1032,7 +907,7 @@ |
|
|
|
Arguments : Inputs - mult0, mult1 |
|
|
|
cnst0, cnst1 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - signed halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Signed byte elements from mult0 are multiplied with |
|
|
|
signed byte elements from cnst0 producing a result |
|
|
|
twice the size of input i.e. signed halfword. |
|
|
@@ -1061,7 +936,7 @@ |
|
|
|
Arguments : Inputs - mult0, mult1 |
|
|
|
cnst0, cnst1 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Unsigned byte elements from mult0 are multiplied with |
|
|
|
unsigned byte elements from cnst0 producing a result |
|
|
|
twice the size of input i.e. unsigned halfword. |
|
|
@@ -1082,7 +957,7 @@ |
|
|
|
Arguments : Inputs - mult0, mult1 |
|
|
|
cnst0, cnst1 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - signed word |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Signed halfword elements from mult0 are multiplied with |
|
|
|
signed halfword elements from cnst0 producing a result |
|
|
|
twice the size of input i.e. signed word. |
|
|
@@ -1111,7 +986,7 @@ |
|
|
|
either vector are copied to the output vector |
|
|
|
Arguments : Inputs - in0, in1, min_vec |
|
|
|
Outputs - in0, in1, (in place) |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Minimum of unsigned halfword element values from 'in0' and |
|
|
|
'min_value' are written to output vector 'in0' |
|
|
|
*/ |
|
|
@@ -1202,7 +1077,7 @@ |
|
|
|
\ |
|
|
|
res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \ |
|
|
|
res1_m = __msa_splati_d(res0_m, 1); \ |
|
|
|
res0_m = res0_m + res1_m; \ |
|
|
|
res0_m += res1_m; \ |
|
|
|
sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \ |
|
|
|
sum_m; \ |
|
|
|
} ) |
|
|
@@ -1223,7 +1098,7 @@ |
|
|
|
res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \ |
|
|
|
res0_m = __msa_hadd_u_d(res_m, res_m); \ |
|
|
|
res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \ |
|
|
|
res0_m = res0_m + res1_m; \ |
|
|
|
res0_m += res1_m; \ |
|
|
|
sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \ |
|
|
|
sum_m; \ |
|
|
|
} ) |
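/* Scalar reference sketch (illustrative, not part of the patch) for the two
 * horizontal-add macros above: each folds every element of one vector into a
 * single 32-bit sum, which is what the hadd/splati/add/copy sequence computes
 * (assuming the total fits in 32 bits). */
static inline int32_t hadd_words_reference_example(const int32_t in[4])
{
    return in[0] + in[1] + in[2] + in[3];
}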
|
|
@@ -1573,7 +1448,7 @@ |
|
|
|
/* Description : Interleave right half of halfword elements from vectors |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
|
|
|
Outputs - out0, out1, out2, out3 |
|
|
|
Return Type - signed halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Right half of halfword elements of in0 and right half of |
|
|
|
halfword elements of in1 are interleaved and copied to out0. |
|
|
|
Right half of halfword elements of in2 and right half of |
|
|
@@ -1625,16 +1500,16 @@ |
|
|
|
/* Description : Interleave right half of double word elements from vectors |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
|
|
|
Outputs - out0, out1, out2, out3 |
|
|
|
Return Type - unsigned double word |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Right half of double word elements of in0 and right half of |
|
|
|
double word elements of in1 are interleaved and copied to out0. |
|
|
|
Right half of double word elements of in2 and right half of |
|
|
|
double word elements of in3 are interleaved and copied to out1. |
|
|
|
*/ |
|
|
|
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
|
|
|
{ \ |
|
|
|
out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \ |
|
|
|
out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \ |
|
|
|
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \ |
|
|
|
{ \ |
|
|
|
out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \ |
|
|
|
out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \ |
|
|
|
} |
|
|
|
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__) |
|
|
|
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__) |
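/* Illustrative sketch (not part of the patch), assuming ILVR.D semantics
 * where even result elements come from the second operand and odd ones from
 * the first: each ILVR_D2 output packs the low 8 bytes of one input pair,
 * shown here for a single pair via the underlying intrinsic. */
static inline v16u8 pack_low_halves_example(v16u8 a, v16u8 b)
{
    /* result bytes 0..7 = bytes 0..7 of b, bytes 8..15 = bytes 0..7 of a */
    return (v16u8) __msa_ilvr_d((v2i64) a, (v2i64) b);
}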
|
|
@@ -1643,7 +1518,7 @@ |
|
|
|
#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \ |
|
|
|
{ \ |
|
|
|
ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \ |
|
|
|
out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5)); \ |
|
|
|
out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \ |
|
|
|
} |
|
|
|
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__) |
|
|
|
|
|
|
@@ -1698,14 +1573,14 @@ |
|
|
|
5-bit signed immediate value are copied to the output vector |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, max_val |
|
|
|
Outputs - in0, in1, in2, in3 (in place) |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Maximum of signed halfword element values from 'in0' and |
|
|
|
'max_val' are written to output vector 'in0' |
|
|
|
*/ |
|
|
|
#define MAXI_SH2(RTYPE, in0, in1, max_val) \ |
|
|
|
{ \ |
|
|
|
in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \ |
|
|
|
in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \ |
|
|
|
#define MAXI_SH2(RTYPE, in0, in1, max_val) \ |
|
|
|
{ \ |
|
|
|
in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \ |
|
|
|
in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \ |
|
|
|
} |
|
|
|
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__) |
|
|
|
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__) |
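/* Usage sketch (illustrative, not part of the patch): clamping two halfword
 * vectors to be non-negative, e.g. after a subtraction.  The last argument is
 * the 5-bit signed immediate mentioned in the description above; names are
 * hypothetical. */
static inline void clamp_non_negative_example(v8i16 *vec0, v8i16 *vec1)
{
    MAXI_SH2_SH(*vec0, *vec1, 0);
}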
|
|
@@ -1722,7 +1597,7 @@ |
|
|
|
The element data width remains unchanged |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, sat_val |
|
|
|
Outputs - in0, in1, in2, in3 (in place) |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Each unsigned halfword element from 'in0' is saturated to the |
|
|
|
value generated with (sat_val+1) bit range |
|
|
|
Results are in placed to original vectors |
|
|
@@ -1738,7 +1613,7 @@ |
|
|
|
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \ |
|
|
|
{ \ |
|
|
|
SAT_UH2(RTYPE, in0, in1, sat_val); \ |
|
|
|
SAT_UH2(RTYPE, in2, in3, sat_val) \ |
|
|
|
SAT_UH2(RTYPE, in2, in3, sat_val); \ |
|
|
|
} |
|
|
|
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__) |
|
|
|
|
|
|
@@ -1747,7 +1622,7 @@ |
|
|
|
The element data width remains unchanged |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, sat_val |
|
|
|
Outputs - in0, in1, in2, in3 (in place) |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Each unsigned halfword element from 'in0' is saturated to the |
|
|
|
value generated with (sat_val+1) bit range |
|
|
|
Results are in placed to original vectors |
|
|
@@ -1761,7 +1636,7 @@ |
|
|
|
|
|
|
|
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \ |
|
|
|
{ \ |
|
|
|
SAT_SH2(RTYPE, in0, in1, sat_val) \ |
|
|
|
SAT_SH2(RTYPE, in0, in1, sat_val); \ |
|
|
|
in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \ |
|
|
|
} |
|
|
|
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__) |
|
|
@@ -1778,7 +1653,7 @@ |
|
|
|
The element data width remains unchanged |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, sat_val |
|
|
|
Outputs - in0, in1, in2, in3 (in place) |
|
|
|
Return Type - unsigned word |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Each unsigned word element from 'in0' is saturated to the |
|
|
|
value generated with (sat_val+1) bit range |
|
|
|
Results are in placed to original vectors |
|
|
@@ -1930,7 +1805,7 @@ |
|
|
|
/* Description : Pack even double word elements of vector pairs |
|
|
|
Arguments : Inputs - in0, in1, in2, in3 |
|
|
|
Outputs - out0, out1 |
|
|
|
Return Type - unsigned byte |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Even double elements of in0 are copied to the left half of |
|
|
|
out0 & even double elements of in1 are copied to the right |
|
|
|
half of out0. |
|
|
@@ -2100,7 +1975,7 @@ |
|
|
|
/* Description : Shift right logical all halfword elements of vector |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, shift |
|
|
|
Outputs - in0, in1, in2, in3 (in place) |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Each element of vector 'in0' is shifted right logical by |
|
|
|
number of bits respective element holds in vector 'shift' and |
|
|
|
result is in place written to 'in0' |
|
|
@@ -2119,7 +1994,7 @@ |
|
|
|
/* Description : Shift right arithmetic rounded halfwords |
|
|
|
Arguments : Inputs - in0, in1, shift |
|
|
|
Outputs - in0, in1, (in place) |
|
|
|
Return Type - unsigned halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : Each element of vector 'in0' is shifted right arithmetic by |
|
|
|
number of bits respective element holds in vector 'shift'. |
|
|
|
The last discarded bit is added to shifted value for rounding |
|
|
@@ -2445,7 +2320,7 @@ |
|
|
|
/* Description : Transposes input 8x4 byte block into 4x8 |
|
|
|
Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block) |
|
|
|
Outputs - out0, out1, out2, out3 (output 4x8 byte block) |
|
|
|
Return Type - unsigned byte |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : |
|
|
|
*/ |
|
|
|
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
|
|
@@ -2472,7 +2347,7 @@ |
|
|
|
(input 8x8 byte block) |
|
|
|
Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
|
|
|
(output 8x8 byte block) |
|
|
|
Return Type - unsigned byte |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : |
|
|
|
*/ |
|
|
|
#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
|
|
@@ -2596,7 +2471,7 @@ |
|
|
|
/* Description : Transposes 8x8 block with half word elements in vectors |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7 |
|
|
|
Outputs - out0, out1, out2, out3, out4, out5, out6, out7 |
|
|
|
Return Type - signed halfword |
|
|
|
Return Type - as per RTYPE |
|
|
|
Details : |
|
|
|
*/ |
|
|
|
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \ |
|
|
@@ -2646,8 +2521,6 @@ |
|
|
|
/* Description : Average byte elements from pair of vectors and store 8x4 byte |
|
|
|
block in destination memory |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - |
|
|
|
Details : Each byte element from input vector pair 'in0' and 'in1' are |
|
|
|
averaged (a + b)/2 and stored in 'tmp0_m' |
|
|
|
Each byte element from input vector pair 'in2' and 'in3' are |
|
|
@@ -2679,8 +2552,6 @@ |
|
|
|
/* Description : Average byte elements from pair of vectors and store 16x4 byte |
|
|
|
block in destination memory |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - |
|
|
|
Details : Each byte element from input vector pair 'in0' and 'in1' are |
|
|
|
averaged (a + b)/2 and stored in 'tmp0_m' |
|
|
|
Each byte element from input vector pair 'in2' and 'in3' are |
|
|
@@ -2707,8 +2578,6 @@ |
|
|
|
/* Description : Average rounded byte elements from pair of vectors and store |
|
|
|
8x4 byte block in destination memory |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - |
|
|
|
Details : Each byte element from input vector pair 'in0' and 'in1' are |
|
|
|
average rounded (a + b + 1)/2 and stored in 'tmp0_m' |
|
|
|
Each byte element from input vector pair 'in2' and 'in3' are |
|
|
@@ -2738,8 +2607,6 @@ |
|
|
|
/* Description : Average rounded byte elements from pair of vectors and store |
|
|
|
16x4 byte block in destination memory |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - |
|
|
|
Details : Each byte element from input vector pair 'in0' and 'in1' are |
|
|
|
average rounded (a + b + 1)/2 and stored in 'tmp0_m' |
|
|
|
Each byte element from input vector pair 'in2' and 'in3' are |
|
|
@@ -2764,8 +2631,6 @@ |
|
|
|
average rounded with destination and store 8x4 byte block |
|
|
|
in destination memory |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - |
|
|
|
Details : Each byte element from input vector pair 'in0' and 'in1' are |
|
|
|
average rounded (a + b + 1)/2 and stored in 'tmp0_m' |
|
|
|
Each byte element from input vector pair 'in2' and 'in3' are |
|
|
@@ -2794,8 +2659,6 @@ |
|
|
|
average rounded with destination and store 16x4 byte block |
|
|
|
in destination memory |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - |
|
|
|
Details : Each byte element from input vector pair 'in0' and 'in1' are |
|
|
|
average rounded (a + b + 1)/2 and stored in 'tmp0_m' |
|
|
|
Each byte element from input vector pair 'in2' and 'in3' are |
|
|
@@ -2822,8 +2685,6 @@ |
|
|
|
|
|
|
|
/* Description : Add block 4x4 |
|
|
|
Arguments : Inputs - in0, in1, in2, in3, pdst, stride |
|
|
|
Outputs - |
|
|
|
Return Type - unsigned bytes |
|
|
|
Details : Least significant 4 bytes from each input vector are added to |
|
|
|
the destination bytes, clipped between 0-255 and then stored. |
|
|
|
*/ |
|
|
|