Browse Source

Use the shorter vec_"type" types instead of the overly long vector "type"

part 1 of h264 luma interpolation 8x8 for altivec contributed by
Mauricio Alvarez % lokifo A gmail P com %
Original thread:
Date: Jun 26, 2007 8:15 PM
Subject: Re: [FFmpeg-devel] [PATCH] h264 luma interpolation 8x8 for altivec

Originally committed as revision 10090 to svn://svn.ffmpeg.org/ffmpeg/trunk
tags/v0.5
Guillaume Poirier 18 years ago
parent
commit
3ca96802e2
2 changed files with 276 additions and 333 deletions
  1. +108
    -115
      libavcodec/ppc/h264_altivec.c
  2. +168
    -218
      libavcodec/ppc/h264_template_altivec.c

+ 108
- 115
libavcodec/ppc/h264_altivec.c View File

@@ -186,35 +186,35 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
((8 - x) * (y)), ((8 - x) * (y)),
((x) * (y))}; ((x) * (y))};
register int i; register int i;
vector unsigned char fperm;
const vector signed int vABCD = vec_ld(0, ABCD);
const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
const vector signed int vzero = vec_splat_s32(0);
const vector signed short v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
const vector unsigned short v6us = vec_splat_u16(6);
vec_u8_t fperm;
const vec_s32_t vABCD = vec_ld(0, ABCD);
const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
LOAD_ZERO;
const vec_s16_t v28ss = vec_sub(vec_sl(vec_splat_s16(1),vec_splat_u16(5)),vec_splat_s16(4));
const vec_u16_t v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;


vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vector unsigned char vsrc0uc, vsrc1uc;
vector signed short vsrc0ssH, vsrc1ssH;
vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
vector signed short vsrc2ssH, vsrc3ssH, psum;
vector unsigned char vdst, ppsum, fsum;
vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vec_u8_t vsrc0uc, vsrc1uc;
vec_s16_t vsrc0ssH, vsrc1ssH;
vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16_t vsrc2ssH, vsrc3ssH, psum;
vec_u8_t vdst, ppsum, fsum;


if (((unsigned long)dst) % 16 == 0) { if (((unsigned long)dst) % 16 == 0) {
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F);
fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F);
} else { } else {
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F);
fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F);
} }


vsrcAuc = vec_ld(0, src); vsrcAuc = vec_ld(0, src);
@@ -230,10 +230,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);


vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc0uc);
vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc1uc);
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc0uc);
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc1uc);


if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
@@ -244,10 +242,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);


vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc2uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);


psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -257,7 +253,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sra(psum, v6us); psum = vec_sra(psum, v6us);


vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_packsu(psum, psum);
ppsum = (vec_u8_t)vec_packsu(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm); fsum = vec_perm(vdst, ppsum, fperm);


vec_st(fsum, 0, dst); vec_st(fsum, 0, dst);
@@ -269,7 +265,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
src += stride; src += stride;
} }
} else { } else {
vector unsigned char vsrcDuc;
vec_u8_t vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(stride + 16, src);
@@ -280,10 +276,8 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
else else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);


vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc2uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc2uc);
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v, (vec_u8_t)vsrc3uc);


psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -293,7 +287,7 @@ void put_no_rnd_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride
psum = vec_sr(psum, v6us); psum = vec_sr(psum, v6us);


vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_pack(psum, psum);
ppsum = (vec_u8_t)vec_pack(psum, psum);
fsum = vec_perm(vdst, ppsum, fperm); fsum = vec_perm(vdst, ppsum, fperm);


vec_st(fsum, 0, dst); vec_st(fsum, 0, dst);
@@ -312,7 +306,7 @@ static inline void put_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h) int src_stride1, int h)
{ {
int i; int i;
vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;


mask_ = vec_lvsl(0, src2); mask_ = vec_lvsl(0, src2);


@@ -354,7 +348,7 @@ static inline void avg_pixels16_l2_altivec( uint8_t * dst, const uint8_t * src1,
int src_stride1, int h) int src_stride1, int h)
{ {
int i; int i;
vector unsigned char a, b, d, tmp1, tmp2, mask, mask_, edges, align;
vec_u8_t a, b, d, tmp1, tmp2, mask, mask_, edges, align;


mask_ = vec_lvsl(0, src2); mask_ = vec_lvsl(0, src2);


@@ -567,8 +561,7 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
const vec_u16_t twov = vec_splat_u16(2); const vec_u16_t twov = vec_splat_u16(2);
const vec_u16_t sixv = vec_splat_u16(6); const vec_u16_t sixv = vec_splat_u16(6);


const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,
-1,-1,-1,-1,-1,-1,-1,-1);
const vec_u8_t sel = (vec_u8_t) AVV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
LOAD_ZERO; LOAD_ZERO;


dct[0] += 32; // rounding for the >>6 at the end dct[0] += 32; // rounding for the >>6 at the end
@@ -601,10 +594,10 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
} }


#define transpose4x16(r0, r1, r2, r3) { \ #define transpose4x16(r0, r1, r2, r3) { \
register vector unsigned char r4; \
register vector unsigned char r5; \
register vector unsigned char r6; \
register vector unsigned char r7; \
register vec_u8_t r4; \
register vec_u8_t r5; \
register vec_u8_t r6; \
register vec_u8_t r7; \
\ \
r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \ r4 = vec_mergeh(r0, r2); /*0, 2 set 0*/ \
r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \ r5 = vec_mergel(r0, r2); /*0, 2 set 1*/ \
@@ -618,8 +611,8 @@ void ff_h264_idct8_add_altivec( uint8_t *dst, DCTELEM *dct, int stride ) {
} }


static inline void write16x4(uint8_t *dst, int dst_stride, static inline void write16x4(uint8_t *dst, int dst_stride,
register vector unsigned char r0, register vector unsigned char r1,
register vector unsigned char r2, register vector unsigned char r3) {
register vec_u8_t r0, register vec_u8_t r1,
register vec_u8_t r2, register vec_u8_t r3) {
DECLARE_ALIGNED_16(unsigned char, result[64]); DECLARE_ALIGNED_16(unsigned char, result[64]);
uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst; uint32_t *src_int = (uint32_t *)result, *dst_int = (uint32_t *)dst;
int int_dst_stride = dst_stride/4; int int_dst_stride = dst_stride/4;
@@ -651,16 +644,16 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
\todo FIXME: see if we can't spare some vec_lvsl() by factorizing them \todo FIXME: see if we can't spare some vec_lvsl() by factorizing them
out of unaligned_load() */ out of unaligned_load() */
#define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\ #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
register vector unsigned char r0 = unaligned_load(0, src);\
register vector unsigned char r1 = unaligned_load( src_stride, src);\
register vector unsigned char r2 = unaligned_load(2* src_stride, src);\
register vector unsigned char r3 = unaligned_load(3* src_stride, src);\
register vector unsigned char r4 = unaligned_load(4* src_stride, src);\
register vector unsigned char r5 = unaligned_load(5* src_stride, src);\
register vector unsigned char r6 = unaligned_load(6* src_stride, src);\
register vector unsigned char r7 = unaligned_load(7* src_stride, src);\
register vector unsigned char r14 = unaligned_load(14*src_stride, src);\
register vector unsigned char r15 = unaligned_load(15*src_stride, src);\
register vec_u8_t r0 = unaligned_load(0, src); \
register vec_u8_t r1 = unaligned_load( src_stride, src); \
register vec_u8_t r2 = unaligned_load(2* src_stride, src); \
register vec_u8_t r3 = unaligned_load(3* src_stride, src); \
register vec_u8_t r4 = unaligned_load(4* src_stride, src); \
register vec_u8_t r5 = unaligned_load(5* src_stride, src); \
register vec_u8_t r6 = unaligned_load(6* src_stride, src); \
register vec_u8_t r7 = unaligned_load(7* src_stride, src); \
register vec_u8_t r14 = unaligned_load(14*src_stride, src); \
register vec_u8_t r15 = unaligned_load(15*src_stride, src); \
\ \
r8 = unaligned_load( 8*src_stride, src); \ r8 = unaligned_load( 8*src_stride, src); \
r9 = unaligned_load( 9*src_stride, src); \ r9 = unaligned_load( 9*src_stride, src); \
@@ -710,26 +703,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
} }


// out: o = |x-y| < a // out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec ( register vector unsigned char x,
register vector unsigned char y,
register vector unsigned char a) {
register vector unsigned char diff = vec_subs(x, y);
register vector unsigned char diffneg = vec_subs(y, x);
register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
o = (vector unsigned char)vec_cmplt(o, a);
static inline vec_u8_t diff_lt_altivec ( register vec_u8_t x,
register vec_u8_t y,
register vec_u8_t a) {
register vec_u8_t diff = vec_subs(x, y);
register vec_u8_t diffneg = vec_subs(y, x);
register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
o = (vec_u8_t)vec_cmplt(o, a);
return o; return o;
} }


static inline vector unsigned char h264_deblock_mask ( register vector unsigned char p0,
register vector unsigned char p1,
register vector unsigned char q0,
register vector unsigned char q1,
register vector unsigned char alpha,
register vector unsigned char beta) {
static inline vec_u8_t h264_deblock_mask ( register vec_u8_t p0,
register vec_u8_t p1,
register vec_u8_t q0,
register vec_u8_t q1,
register vec_u8_t alpha,
register vec_u8_t beta) {


register vector unsigned char mask;
register vector unsigned char tempmask;
register vec_u8_t mask;
register vec_u8_t tempmask;


mask = diff_lt_altivec(p0, q0, alpha); mask = diff_lt_altivec(p0, q0, alpha);
tempmask = diff_lt_altivec(p1, p0, beta); tempmask = diff_lt_altivec(p1, p0, beta);
@@ -741,19 +734,19 @@ static inline vector unsigned char h264_deblock_mask ( register vector unsigned
} }


// out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0) // out: newp1 = clip((p2 + ((p0 + q0 + 1) >> 1)) >> 1, p1-tc0, p1+tc0)
static inline vector unsigned char h264_deblock_q1(register vector unsigned char p0,
register vector unsigned char p1,
register vector unsigned char p2,
register vector unsigned char q0,
register vector unsigned char tc0) {
register vector unsigned char average = vec_avg(p0, q0);
register vector unsigned char temp;
register vector unsigned char uncliped;
register vector unsigned char ones;
register vector unsigned char max;
register vector unsigned char min;
register vector unsigned char newp1;
static inline vec_u8_t h264_deblock_q1(register vec_u8_t p0,
register vec_u8_t p1,
register vec_u8_t p2,
register vec_u8_t q0,
register vec_u8_t tc0) {
register vec_u8_t average = vec_avg(p0, q0);
register vec_u8_t temp;
register vec_u8_t uncliped;
register vec_u8_t ones;
register vec_u8_t max;
register vec_u8_t min;
register vec_u8_t newp1;


temp = vec_xor(average, p2); temp = vec_xor(average, p2);
average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */ average = vec_avg(average, p2); /*avg(p2, avg(p0, q0)) */
@@ -769,16 +762,16 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char


#define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \ #define h264_deblock_p0_q0(p0, p1, q0, q1, tc0masked) { \
\ \
const vector unsigned char A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
const vec_u8_t A0v = vec_sl(vec_splat_u8(10), vec_splat_u8(4)); \
\ \
register vector unsigned char pq0bit = vec_xor(p0,q0); \
register vector unsigned char q1minus; \
register vector unsigned char p0minus; \
register vector unsigned char stage1; \
register vector unsigned char stage2; \
register vector unsigned char vec160; \
register vector unsigned char delta; \
register vector unsigned char deltaneg; \
register vec_u8_t pq0bit = vec_xor(p0,q0); \
register vec_u8_t q1minus; \
register vec_u8_t p0minus; \
register vec_u8_t stage1; \
register vec_u8_t stage2; \
register vec_u8_t vec160; \
register vec_u8_t delta; \
register vec_u8_t deltaneg; \
\ \
q1minus = vec_nor(q1, q1); /* 255 - q1 */ \ q1minus = vec_nor(q1, q1); /* 255 - q1 */ \
stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \ stage1 = vec_avg(p1, q1minus); /* (p1 - q1 + 256)>>1 */ \
@@ -801,16 +794,16 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char


#define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \ #define h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0) { \
DECLARE_ALIGNED_16(unsigned char, temp[16]); \ DECLARE_ALIGNED_16(unsigned char, temp[16]); \
register vector unsigned char alphavec; \
register vector unsigned char betavec; \
register vector unsigned char mask; \
register vector unsigned char p1mask; \
register vector unsigned char q1mask; \
register vec_u8_t alphavec; \
register vec_u8_t betavec; \
register vec_u8_t mask; \
register vec_u8_t p1mask; \
register vec_u8_t q1mask; \
register vector signed char tc0vec; \ register vector signed char tc0vec; \
register vector unsigned char finaltc0; \
register vector unsigned char tc0masked; \
register vector unsigned char newp1; \
register vector unsigned char newq1; \
register vec_u8_t finaltc0; \
register vec_u8_t tc0masked; \
register vec_u8_t newp1; \
register vec_u8_t newq1; \
\ \
temp[0] = alpha; \ temp[0] = alpha; \
temp[1] = beta; \ temp[1] = beta; \
@@ -824,18 +817,18 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char
tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \
tc0vec = vec_mergeh(tc0vec, tc0vec); \ tc0vec = vec_mergeh(tc0vec, tc0vec); \
mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \ mask = vec_and(mask, vec_cmpgt(tc0vec, vec_splat_s8(-1))); /* if tc0[i] >= 0 */ \
finaltc0 = vec_and((vector unsigned char)tc0vec, mask); /* tc = tc0 */ \
finaltc0 = vec_and((vec_u8_t)tc0vec, mask); /* tc = tc0 */ \
\ \
p1mask = diff_lt_altivec(p2, p0, betavec); \ p1mask = diff_lt_altivec(p2, p0, betavec); \
p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \ p1mask = vec_and(p1mask, mask); /* if( |p2 - p0| < beta) */ \
tc0masked = vec_and(p1mask, (vector unsigned char)tc0vec); \
tc0masked = vec_and(p1mask, (vec_u8_t)tc0vec); \
finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \ finaltc0 = vec_sub(finaltc0, p1mask); /* tc++ */ \
newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \ newp1 = h264_deblock_q1(p0, p1, p2, q0, tc0masked); \
/*end if*/ \ /*end if*/ \
\ \
q1mask = diff_lt_altivec(q2, q0, betavec); \ q1mask = diff_lt_altivec(q2, q0, betavec); \
q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\ q1mask = vec_and(q1mask, mask); /* if ( |q2 - q0| < beta ) */\
tc0masked = vec_and(q1mask, (vector unsigned char)tc0vec); \
tc0masked = vec_and(q1mask, (vec_u8_t)tc0vec); \
finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \ finaltc0 = vec_sub(finaltc0, q1mask); /* tc++ */ \
newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \ newq1 = h264_deblock_q1(p0, q1, q2, q0, tc0masked); \
/*end if*/ \ /*end if*/ \
@@ -848,12 +841,12 @@ static inline vector unsigned char h264_deblock_q1(register vector unsigned char
static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {


if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) { if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) >= 0) {
register vector unsigned char p2 = vec_ld(-3*stride, pix);
register vector unsigned char p1 = vec_ld(-2*stride, pix);
register vector unsigned char p0 = vec_ld(-1*stride, pix);
register vector unsigned char q0 = vec_ld(0, pix);
register vector unsigned char q1 = vec_ld(stride, pix);
register vector unsigned char q2 = vec_ld(2*stride, pix);
register vec_u8_t p2 = vec_ld(-3*stride, pix);
register vec_u8_t p1 = vec_ld(-2*stride, pix);
register vec_u8_t p0 = vec_ld(-1*stride, pix);
register vec_u8_t q0 = vec_ld(0, pix);
register vec_u8_t q1 = vec_ld(stride, pix);
register vec_u8_t q2 = vec_ld(2*stride, pix);
h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0); h264_loop_filter_luma_altivec(p2, p1, p0, q0, q1, q2, alpha, beta, tc0);
vec_st(p1, -2*stride, pix); vec_st(p1, -2*stride, pix);
vec_st(p0, -1*stride, pix); vec_st(p0, -1*stride, pix);
@@ -864,7 +857,7 @@ static void h264_v_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha,


static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) { static void h264_h_loop_filter_luma_altivec(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0) {


register vector unsigned char line0, line1, line2, line3, line4, line5;
register vec_u8_t line0, line1, line2, line3, line4, line5;
if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0) if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
return; return;
readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5); readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);


+ 168
- 218
libavcodec/ppc/h264_template_altivec.c View File

@@ -27,37 +27,37 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
((8 - x) * (y)), ((8 - x) * (y)),
((x) * (y))}; ((x) * (y))};
register int i; register int i;
vector unsigned char fperm;
const vector signed int vABCD = vec_ld(0, ABCD);
const vector signed short vA = vec_splat((vector signed short)vABCD, 1);
const vector signed short vB = vec_splat((vector signed short)vABCD, 3);
const vector signed short vC = vec_splat((vector signed short)vABCD, 5);
const vector signed short vD = vec_splat((vector signed short)vABCD, 7);
const vector signed int vzero = vec_splat_s32(0);
const vector signed short v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vector unsigned short v6us = vec_splat_u16(6);
vec_u8_t fperm;
const vec_s32_t vABCD = vec_ld(0, ABCD);
const vec_s16_t vA = vec_splat((vec_s16_t)vABCD, 1);
const vec_s16_t vB = vec_splat((vec_s16_t)vABCD, 3);
const vec_s16_t vC = vec_splat((vec_s16_t)vABCD, 5);
const vec_s16_t vD = vec_splat((vec_s16_t)vABCD, 7);
LOAD_ZERO;
const vec_s16_t v32ss = vec_sl(vec_splat_s16(1),vec_splat_u16(5));
const vec_u16_t v6us = vec_splat_u16(6);
register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1; register int loadSecond = (((unsigned long)src) % 16) <= 7 ? 0 : 1;
register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0; register int reallyBadAlign = (((unsigned long)src) % 16) == 15 ? 1 : 0;


vector unsigned char vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vector unsigned char vsrc0uc, vsrc1uc;
vector signed short vsrc0ssH, vsrc1ssH;
vector unsigned char vsrcCuc, vsrc2uc, vsrc3uc;
vector signed short vsrc2ssH, vsrc3ssH, psum;
vector unsigned char vdst, ppsum, vfdst, fsum;
vec_u8_t vsrcAuc, vsrcBuc, vsrcperm0, vsrcperm1;
vec_u8_t vsrc0uc, vsrc1uc;
vec_s16_t vsrc0ssH, vsrc1ssH;
vec_u8_t vsrcCuc, vsrc2uc, vsrc3uc;
vec_s16_t vsrc2ssH, vsrc3ssH, psum;
vec_u8_t vdst, ppsum, vfdst, fsum;


POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_chroma_mc8_num, 1);


if (((unsigned long)dst) % 16 == 0) { if (((unsigned long)dst) % 16 == 0) {
fperm = (vector unsigned char)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F);
fperm = (vec_u8_t)AVV(0x10, 0x11, 0x12, 0x13,
0x14, 0x15, 0x16, 0x17,
0x08, 0x09, 0x0A, 0x0B,
0x0C, 0x0D, 0x0E, 0x0F);
} else { } else {
fperm = (vector unsigned char)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F);
fperm = (vec_u8_t)AVV(0x00, 0x01, 0x02, 0x03,
0x04, 0x05, 0x06, 0x07,
0x18, 0x19, 0x1A, 0x1B,
0x1C, 0x1D, 0x1E, 0x1F);
} }


vsrcAuc = vec_ld(0, src); vsrcAuc = vec_ld(0, src);
@@ -73,10 +73,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
else else
vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1); vsrc1uc = vec_perm(vsrcAuc, vsrcBuc, vsrcperm1);


vsrc0ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc0uc);
vsrc1ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc1uc);
vsrc0ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc0uc);
vsrc1ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc1uc);


if (!loadSecond) {// -> !reallyBadAlign if (!loadSecond) {// -> !reallyBadAlign
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
@@ -87,10 +85,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0); vsrc2uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm0);
vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcCuc, vsrcperm1);


vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc2uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);


psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -100,7 +96,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
psum = vec_sra(psum, v6us); psum = vec_sra(psum, v6us);


vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_packsu(psum, psum);
ppsum = (vec_u8_t)vec_packsu(psum, psum);
vfdst = vec_perm(vdst, ppsum, fperm); vfdst = vec_perm(vdst, ppsum, fperm);


OP_U8_ALTIVEC(fsum, vfdst, vdst); OP_U8_ALTIVEC(fsum, vfdst, vdst);
@@ -114,7 +110,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
src += stride; src += stride;
} }
} else { } else {
vector unsigned char vsrcDuc;
vec_u8_t vsrcDuc;
for (i = 0 ; i < h ; i++) { for (i = 0 ; i < h ; i++) {
vsrcCuc = vec_ld(stride + 0, src); vsrcCuc = vec_ld(stride + 0, src);
vsrcDuc = vec_ld(stride + 16, src); vsrcDuc = vec_ld(stride + 16, src);
@@ -125,10 +121,8 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
else else
vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1); vsrc3uc = vec_perm(vsrcCuc, vsrcDuc, vsrcperm1);


vsrc2ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc2uc);
vsrc3ssH = (vector signed short)vec_mergeh((vector unsigned char)vzero,
(vector unsigned char)vsrc3uc);
vsrc2ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc2uc);
vsrc3ssH = (vec_s16_t)vec_mergeh(zero_u8v,(vec_u8_t)vsrc3uc);


psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0)); psum = vec_mladd(vA, vsrc0ssH, vec_splat_s16(0));
psum = vec_mladd(vB, vsrc1ssH, psum); psum = vec_mladd(vB, vsrc1ssH, psum);
@@ -138,7 +132,7 @@ void PREFIX_h264_chroma_mc8_altivec(uint8_t * dst, uint8_t * src, int stride, in
psum = vec_sr(psum, v6us); psum = vec_sr(psum, v6us);


vdst = vec_ld(0, dst); vdst = vec_ld(0, dst);
ppsum = (vector unsigned char)vec_pack(psum, psum);
ppsum = (vec_u8_t)vec_pack(psum, psum);
vfdst = vec_perm(vdst, ppsum, fperm); vfdst = vec_perm(vdst, ppsum, fperm);


OP_U8_ALTIVEC(fsum, vfdst, vdst); OP_U8_ALTIVEC(fsum, vfdst, vdst);
@@ -160,44 +154,39 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_h_lowpass_num, 1);
register int i; register int i;


const vector signed int vzero = vec_splat_s32(0);
const vector unsigned char permM2 = vec_lvsl(-2, src);
const vector unsigned char permM1 = vec_lvsl(-1, src);
const vector unsigned char permP0 = vec_lvsl(+0, src);
const vector unsigned char permP1 = vec_lvsl(+1, src);
const vector unsigned char permP2 = vec_lvsl(+2, src);
const vector unsigned char permP3 = vec_lvsl(+3, src);
const vector signed short v5ss = vec_splat_s16(5);
const vector unsigned short v5us = vec_splat_u16(5);
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vector unsigned char dstperm = vec_lvsr(0, dst);
const vector unsigned char neg1 =
(const vector unsigned char) vec_splat_s8(-1);

const vector unsigned char dstmask =
vec_perm((const vector unsigned char)vzero,
neg1, dstperm);

vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
LOAD_ZERO;
const vec_u8_t permM2 = vec_lvsl(-2, src);
const vec_u8_t permM1 = vec_lvsl(-1, src);
const vec_u8_t permP0 = vec_lvsl(+0, src);
const vec_u8_t permP1 = vec_lvsl(+1, src);
const vec_u8_t permP2 = vec_lvsl(+2, src);
const vec_u8_t permP3 = vec_lvsl(+3, src);
const vec_s16_t v5ss = vec_splat_s16(5);
const vec_u16_t v5us = vec_splat_u16(5);
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vec_u8_t dstperm = vec_lvsr(0, dst);
const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);
const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);

vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;


register int align = ((((unsigned long)src) - 2) % 16); register int align = ((((unsigned long)src) - 2) % 16);


vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B, srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B, srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, pp3A, pp3B, pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB; psumA, psumB, sumA, sumB;


vector unsigned char sum, dst1, dst2, vdst, fsum,
rsum, fdst1, fdst2;
vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2;


POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_h_lowpass_num, 1);


for (i = 0 ; i < 16 ; i ++) { for (i = 0 ; i < 16 ; i ++) {
vector unsigned char srcR1 = vec_ld(-2, src);
vector unsigned char srcR2 = vec_ld(14, src);
vec_u8_t srcR1 = vec_ld(-2, src);
vec_u8_t srcR2 = vec_ld(14, src);


switch (align) { switch (align) {
default: { default: {
@@ -217,7 +206,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = srcR2; srcP3 = srcR2;
} break; } break;
case 12: { case 12: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -226,7 +215,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 13: { case 13: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -235,7 +224,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 14: { case 14: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2; srcP0 = srcR2;
@@ -244,7 +233,7 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 15: { case 15: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2; srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0); srcP0 = vec_perm(srcR2, srcR3, permP0);
@@ -254,32 +243,20 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
} break; } break;
} }


srcP0A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP0);
srcP0B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP0);
srcP1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1);
srcP1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP1);

srcP2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2);
srcP2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP2);
srcP3A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);

srcM1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
srcM1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
srcM2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
srcM2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);


sum1A = vec_adds(srcP0A, srcP1A); sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B); sum1B = vec_adds(srcP0B, srcP1B);
@@ -291,8 +268,8 @@ static void PREFIX_h264_qpel16_h_lowpass_altivec(uint8_t * dst, uint8_t * src, i
pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss);


pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


pp3A = vec_add(sum3A, pp1A); pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B); pp3B = vec_add(sum3B, pp1B);
@@ -330,67 +307,56 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i


register int i; register int i;


const vector signed int vzero = vec_splat_s32(0);
const vector unsigned char perm = vec_lvsl(0, src);
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector unsigned short v5us = vec_splat_u16(5);
const vector signed short v5ss = vec_splat_s16(5);
const vector signed short v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vector unsigned char dstperm = vec_lvsr(0, dst);
const vector unsigned char neg1 = (const vector unsigned char)vec_splat_s8(-1);
const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
LOAD_ZERO;
const vec_u8_t perm = vec_lvsl(0, src);
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u16_t v5us = vec_splat_u16(5);
const vec_s16_t v5ss = vec_splat_s16(5);
const vec_s16_t v16ss = vec_sl(vec_splat_s16(1),vec_splat_u16(4));
const vec_u8_t dstperm = vec_lvsr(0, dst);
const vec_u8_t neg1 = (const vec_u8_t)vec_splat_s8(-1);
const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);


uint8_t *srcbis = src - (srcStride * 2); uint8_t *srcbis = src - (srcStride * 2);


const vector unsigned char srcM2a = vec_ld(0, srcbis);
const vector unsigned char srcM2b = vec_ld(16, srcbis);
const vector unsigned char srcM2 = vec_perm(srcM2a, srcM2b, perm);
const vec_u8_t srcM2a = vec_ld(0, srcbis);
const vec_u8_t srcM2b = vec_ld(16, srcbis);
const vec_u8_t srcM2 = vec_perm(srcM2a, srcM2b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcM1a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcM1b = vec_ld(16, srcbis);
const vector unsigned char srcM1 = vec_perm(srcM1a, srcM1b, perm);
const vec_u8_t srcM1a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcM1b = vec_ld(16, srcbis);
const vec_u8_t srcM1 = vec_perm(srcM1a, srcM1b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcP0a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP0b = vec_ld(16, srcbis);
const vector unsigned char srcP0 = vec_perm(srcP0a, srcP0b, perm);
const vec_u8_t srcP0a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcP0b = vec_ld(16, srcbis);
const vec_u8_t srcP0 = vec_perm(srcP0a, srcP0b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcP1a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP1b = vec_ld(16, srcbis);
const vector unsigned char srcP1 = vec_perm(srcP1a, srcP1b, perm);
const vec_u8_t srcP1a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcP1b = vec_ld(16, srcbis);
const vec_u8_t srcP1 = vec_perm(srcP1a, srcP1b, perm);
// srcbis += srcStride; // srcbis += srcStride;
const vector unsigned char srcP2a = vec_ld(0, srcbis += srcStride);
const vector unsigned char srcP2b = vec_ld(16, srcbis);
const vector unsigned char srcP2 = vec_perm(srcP2a, srcP2b, perm);
const vec_u8_t srcP2a = vec_ld(0, srcbis += srcStride);
const vec_u8_t srcP2b = vec_ld(16, srcbis);
const vec_u8_t srcP2 = vec_perm(srcP2a, srcP2b, perm);
// srcbis += srcStride; // srcbis += srcStride;


vector signed short srcM2ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
vector signed short srcM2ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
vector signed short srcM1ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
vector signed short srcM1ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
vector signed short srcP0ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP0);
vector signed short srcP0ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP0);
vector signed short srcP1ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1);
vector signed short srcP1ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP1);
vector signed short srcP2ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2);
vector signed short srcP2ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP2);

vector signed short pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
vec_s16_t srcM2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
vec_s16_t srcM2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM2);
vec_s16_t srcM1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
vec_s16_t srcM1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
vec_s16_t srcP0ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
vec_s16_t srcP0ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
vec_s16_t srcP1ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
vec_s16_t srcP1ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP1);
vec_s16_t srcP2ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
vec_s16_t srcP2ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP2);

vec_s16_t pp1A, pp1B, pp2A, pp2B, pp3A, pp3B,
psumA, psumB, sumA, sumB, psumA, psumB, sumA, sumB,
srcP3ssA, srcP3ssB, srcP3ssA, srcP3ssB,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B; sum1A, sum1B, sum2A, sum2B, sum3A, sum3B;


vector unsigned char sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2,
srcP3a, srcP3b, srcP3;
vec_u8_t sum, dst1, dst2, vdst, fsum, rsum, fdst1, fdst2, srcP3a, srcP3b, srcP3;


POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_v_lowpass_num, 1);


@@ -398,10 +364,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
srcP3a = vec_ld(0, srcbis += srcStride); srcP3a = vec_ld(0, srcbis += srcStride);
srcP3b = vec_ld(16, srcbis); srcP3b = vec_ld(16, srcbis);
srcP3 = vec_perm(srcP3a, srcP3b, perm); srcP3 = vec_perm(srcP3a, srcP3b, perm);
srcP3ssA = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3ssB = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);
srcP3ssA = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
srcP3ssB = (vec_s16_t) vec_mergel(zero_u8v, srcP3);
// srcbis += srcStride; // srcbis += srcStride;


sum1A = vec_adds(srcP0ssA, srcP1ssA); sum1A = vec_adds(srcP0ssA, srcP1ssA);
@@ -425,8 +389,8 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
pp1A = vec_mladd(sum1A, v20ss, v16ss); pp1A = vec_mladd(sum1A, v20ss, v16ss);
pp1B = vec_mladd(sum1B, v20ss, v16ss); pp1B = vec_mladd(sum1B, v20ss, v16ss);


pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


pp3A = vec_add(sum3A, pp1A); pp3A = vec_add(sum3A, pp1A);
pp3B = vec_add(sum3B, pp1B); pp3B = vec_add(sum3B, pp1B);
@@ -461,58 +425,56 @@ static void PREFIX_h264_qpel16_v_lowpass_altivec(uint8_t * dst, uint8_t * src, i
static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) { static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp, uint8_t * src, int dstStride, int tmpStride, int srcStride) {
POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_DECLARE(PREFIX_h264_qpel16_hv_lowpass_num, 1);
register int i; register int i;
const vector signed int vzero = vec_splat_s32(0);
const vector unsigned char permM2 = vec_lvsl(-2, src);
const vector unsigned char permM1 = vec_lvsl(-1, src);
const vector unsigned char permP0 = vec_lvsl(+0, src);
const vector unsigned char permP1 = vec_lvsl(+1, src);
const vector unsigned char permP2 = vec_lvsl(+2, src);
const vector unsigned char permP3 = vec_lvsl(+3, src);
const vector signed short v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vector unsigned int v10ui = vec_splat_u32(10);
const vector signed short v5ss = vec_splat_s16(5);
const vector signed short v1ss = vec_splat_s16(1);
const vector signed int v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vector unsigned int v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));
LOAD_ZERO;
const vec_u8_t permM2 = vec_lvsl(-2, src);
const vec_u8_t permM1 = vec_lvsl(-1, src);
const vec_u8_t permP0 = vec_lvsl(+0, src);
const vec_u8_t permP1 = vec_lvsl(+1, src);
const vec_u8_t permP2 = vec_lvsl(+2, src);
const vec_u8_t permP3 = vec_lvsl(+3, src);
const vec_s16_t v20ss = vec_sl(vec_splat_s16(5),vec_splat_u16(2));
const vec_u32_t v10ui = vec_splat_u32(10);
const vec_s16_t v5ss = vec_splat_s16(5);
const vec_s16_t v1ss = vec_splat_s16(1);
const vec_s32_t v512si = vec_sl(vec_splat_s32(1),vec_splat_u32(9));
const vec_u32_t v16ui = vec_sl(vec_splat_u32(1),vec_splat_u32(4));


register int align = ((((unsigned long)src) - 2) % 16); register int align = ((((unsigned long)src) - 2) % 16);


const vector unsigned char neg1 = (const vector unsigned char)
vec_splat_s8(-1);
const vec_u8_t neg1 = (const vec_u8_t) vec_splat_s8(-1);


vector signed short srcP0A, srcP0B, srcP1A, srcP1B,
vec_s16_t srcP0A, srcP0B, srcP1A, srcP1B,
srcP2A, srcP2B, srcP3A, srcP3B, srcP2A, srcP2B, srcP3A, srcP3B,
srcM1A, srcM1B, srcM2A, srcM2B, srcM1A, srcM1B, srcM2A, srcM2B,
sum1A, sum1B, sum2A, sum2B, sum3A, sum3B, sum1A, sum1B, sum2A, sum2B, sum3A, sum3B,
pp1A, pp1B, pp2A, pp2B, psumA, psumB; pp1A, pp1B, pp2A, pp2B, psumA, psumB;


const vector unsigned char dstperm = vec_lvsr(0, dst);
const vec_u8_t dstperm = vec_lvsr(0, dst);


const vector unsigned char dstmask = vec_perm((const vector unsigned char)vzero, neg1, dstperm);
const vec_u8_t dstmask = vec_perm(zero_u8v, neg1, dstperm);


const vector unsigned char mperm = (const vector unsigned char)
const vec_u8_t mperm = (const vec_u8_t)
AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B, AVV(0x00, 0x08, 0x01, 0x09, 0x02, 0x0A, 0x03, 0x0B,
0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F); 0x04, 0x0C, 0x05, 0x0D, 0x06, 0x0E, 0x07, 0x0F);
int16_t *tmpbis = tmp; int16_t *tmpbis = tmp;


vector signed short tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
vec_s16_t tmpM1ssA, tmpM1ssB, tmpM2ssA, tmpM2ssB,
tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB, tmpP0ssA, tmpP0ssB, tmpP1ssA, tmpP1ssB,
tmpP2ssA, tmpP2ssB; tmpP2ssA, tmpP2ssB;


vector signed int pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
vec_s32_t pp1Ae, pp1Ao, pp1Be, pp1Bo, pp2Ae, pp2Ao, pp2Be, pp2Bo,
pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo, pp3Ae, pp3Ao, pp3Be, pp3Bo, pp1cAe, pp1cAo, pp1cBe, pp1cBo,
pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo, pp32Ae, pp32Ao, pp32Be, pp32Bo, sumAe, sumAo, sumBe, sumBo,
ssumAe, ssumAo, ssumBe, ssumBo; ssumAe, ssumAo, ssumBe, ssumBo;
vector unsigned char fsum, sumv, sum, dst1, dst2, vdst,
rsum, fdst1, fdst2;
vector signed short ssume, ssumo;
vec_u8_t fsum, sumv, sum, dst1, dst2, vdst, rsum, fdst1, fdst2;
vec_s16_t ssume, ssumo;


POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1); POWERPC_PERF_START_COUNT(PREFIX_h264_qpel16_hv_lowpass_num, 1);
src -= (2 * srcStride); src -= (2 * srcStride);
for (i = 0 ; i < 21 ; i ++) { for (i = 0 ; i < 21 ; i ++) {
vector unsigned char srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vector unsigned char srcR1 = vec_ld(-2, src);
vector unsigned char srcR2 = vec_ld(14, src);
vec_u8_t srcM2, srcM1, srcP0, srcP1, srcP2, srcP3;
vec_u8_t srcR1 = vec_ld(-2, src);
vec_u8_t srcR2 = vec_ld(14, src);


switch (align) { switch (align) {
default: { default: {
@@ -532,7 +494,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = srcR2; srcP3 = srcR2;
} break; } break;
case 12: { case 12: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -541,7 +503,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 13: { case 13: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = vec_perm(srcR1, srcR2, permP0); srcP0 = vec_perm(srcR1, srcR2, permP0);
@@ -550,7 +512,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 14: { case 14: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = vec_perm(srcR1, srcR2, permM1); srcM1 = vec_perm(srcR1, srcR2, permM1);
srcP0 = srcR2; srcP0 = srcR2;
@@ -559,7 +521,7 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
srcP3 = vec_perm(srcR2, srcR3, permP3); srcP3 = vec_perm(srcR2, srcR3, permP3);
} break; } break;
case 15: { case 15: {
vector unsigned char srcR3 = vec_ld(30, src);
vec_u8_t srcR3 = vec_ld(30, src);
srcM2 = vec_perm(srcR1, srcR2, permM2); srcM2 = vec_perm(srcR1, srcR2, permM2);
srcM1 = srcR2; srcM1 = srcR2;
srcP0 = vec_perm(srcR2, srcR3, permP0); srcP0 = vec_perm(srcR2, srcR3, permP0);
@@ -569,32 +531,20 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
} break; } break;
} }


srcP0A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP0);
srcP0B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP0);
srcP1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP1);
srcP1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP1);

srcP2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP2);
srcP2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP2);
srcP3A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcP3);
srcP3B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcP3);

srcM1A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM1);
srcM1B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM1);
srcM2A = (vector signed short)
vec_mergeh((vector unsigned char)vzero, srcM2);
srcM2B = (vector signed short)
vec_mergel((vector unsigned char)vzero, srcM2);
srcP0A = (vec_s16_t) vec_mergeh(zero_u8v, srcP0);
srcP0B = (vec_s16_t) vec_mergel(zero_u8v, srcP0);
srcP1A = (vec_s16_t) vec_mergeh(zero_u8v, srcP1);
srcP1B = (vec_s16_t) vec_mergel(zero_u8v, srcP1);

srcP2A = (vec_s16_t) vec_mergeh(zero_u8v, srcP2);
srcP2B = (vec_s16_t) vec_mergel(zero_u8v, srcP2);
srcP3A = (vec_s16_t) vec_mergeh(zero_u8v, srcP3);
srcP3B = (vec_s16_t) vec_mergel(zero_u8v, srcP3);

srcM1A = (vec_s16_t) vec_mergeh(zero_u8v, srcM1);
srcM1B = (vec_s16_t) vec_mergel(zero_u8v, srcM1);
srcM2A = (vec_s16_t) vec_mergeh(zero_u8v, srcM2);
srcM2B = (vec_s16_t) vec_mergel(zero_u8v, srcM2);


sum1A = vec_adds(srcP0A, srcP1A); sum1A = vec_adds(srcP0A, srcP1A);
sum1B = vec_adds(srcP0B, srcP1B); sum1B = vec_adds(srcP0B, srcP1B);
@@ -606,8 +556,8 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp1A = vec_mladd(sum1A, v20ss, sum3A); pp1A = vec_mladd(sum1A, v20ss, sum3A);
pp1B = vec_mladd(sum1B, v20ss, sum3B); pp1B = vec_mladd(sum1B, v20ss, sum3B);


pp2A = vec_mladd(sum2A, v5ss, (vector signed short)vzero);
pp2B = vec_mladd(sum2B, v5ss, (vector signed short)vzero);
pp2A = vec_mladd(sum2A, v5ss, zero_s16v);
pp2B = vec_mladd(sum2B, v5ss, zero_s16v);


psumA = vec_sub(pp1A, pp2A); psumA = vec_sub(pp1A, pp2A);
psumB = vec_sub(pp1B, pp2B); psumB = vec_sub(pp1B, pp2B);
@@ -636,15 +586,15 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
tmpbis += tmpStride; tmpbis += tmpStride;


for (i = 0 ; i < 16 ; i++) { for (i = 0 ; i < 16 ; i++) {
const vector signed short tmpP3ssA = vec_ld(0, tmpbis);
const vector signed short tmpP3ssB = vec_ld(16, tmpbis);
const vec_s16_t tmpP3ssA = vec_ld(0, tmpbis);
const vec_s16_t tmpP3ssB = vec_ld(16, tmpbis);


const vector signed short sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vector signed short sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vector signed short sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vector signed short sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
const vector signed short sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vector signed short sum3B = vec_adds(tmpM2ssB, tmpP3ssB);
const vec_s16_t sum1A = vec_adds(tmpP0ssA, tmpP1ssA);
const vec_s16_t sum1B = vec_adds(tmpP0ssB, tmpP1ssB);
const vec_s16_t sum2A = vec_adds(tmpM1ssA, tmpP2ssA);
const vec_s16_t sum2B = vec_adds(tmpM1ssB, tmpP2ssB);
const vec_s16_t sum3A = vec_adds(tmpM2ssA, tmpP3ssA);
const vec_s16_t sum3B = vec_adds(tmpM2ssB, tmpP3ssB);


tmpbis += tmpStride; tmpbis += tmpStride;


@@ -669,9 +619,9 @@ static void PREFIX_h264_qpel16_hv_lowpass_altivec(uint8_t * dst, int16_t * tmp,
pp2Be = vec_mule(sum2B, v5ss); pp2Be = vec_mule(sum2B, v5ss);
pp2Bo = vec_mulo(sum2B, v5ss); pp2Bo = vec_mulo(sum2B, v5ss);


pp3Ae = vec_sra((vector signed int)sum3A, v16ui);
pp3Ae = vec_sra((vec_s32_t)sum3A, v16ui);
pp3Ao = vec_mulo(sum3A, v1ss); pp3Ao = vec_mulo(sum3A, v1ss);
pp3Be = vec_sra((vector signed int)sum3B, v16ui);
pp3Be = vec_sra((vec_s32_t)sum3B, v16ui);
pp3Bo = vec_mulo(sum3B, v1ss); pp3Bo = vec_mulo(sum3B, v1ss);


pp1cAe = vec_add(pp1Ae, v512si); pp1cAe = vec_add(pp1Ae, v512si);


Loading…
Cancel
Save