See merge commit '39d6d3618d48625decaff7d9bdbb45b44ef2a805'.tags/n3.2
@@ -45,7 +45,7 @@ | |||
#define END_CHECK(end) "" | |||
#else | |||
#define END_CHECK(end) \ | |||
"cmp "end" , %%"REG_c" \n\t"\ | |||
"cmp "end" , %%"FF_REG_c" \n\t"\ | |||
"jge 1f \n\t" | |||
#endif | |||
@@ -92,11 +92,11 @@ | |||
"mov "tmpbyte" , "statep" \n\t"\ | |||
"test "lowword" , "lowword" \n\t"\ | |||
"jnz 2f \n\t"\ | |||
"mov "byte" , %%"REG_c" \n\t"\ | |||
"mov "byte" , %%"FF_REG_c" \n\t"\ | |||
END_CHECK(end)\ | |||
"add"OPSIZE" $2 , "byte" \n\t"\ | |||
"add"FF_OPSIZE" $2 , "byte" \n\t"\ | |||
"1: \n\t"\ | |||
"movzwl (%%"REG_c") , "tmp" \n\t"\ | |||
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\ | |||
"lea -1("low") , %%ecx \n\t"\ | |||
"xor "low" , %%ecx \n\t"\ | |||
"shr $15 , %%ecx \n\t"\ | |||
@@ -153,11 +153,11 @@ | |||
"mov "tmpbyte" , "statep" \n\t"\ | |||
"test "lowword" , "lowword" \n\t"\ | |||
" jnz 2f \n\t"\ | |||
"mov "byte" , %%"REG_c" \n\t"\ | |||
"mov "byte" , %%"FF_REG_c" \n\t"\ | |||
END_CHECK(end)\ | |||
"add"OPSIZE" $2 , "byte" \n\t"\ | |||
"add"FF_OPSIZE" $2 , "byte" \n\t"\ | |||
"1: \n\t"\ | |||
"movzwl (%%"REG_c") , "tmp" \n\t"\ | |||
"movzwl (%%"FF_REG_c") , "tmp" \n\t"\ | |||
"lea -1("low") , %%ecx \n\t"\ | |||
"xor "low" , %%ecx \n\t"\ | |||
"shr $15 , %%ecx \n\t"\ | |||
@@ -203,7 +203,7 @@ static av_always_inline int get_cabac_inline_x86(CABACContext *c, | |||
"i"(offsetof(CABACContext, bytestream_end)) | |||
TABLES_ARG | |||
,"1"(c->low), "2"(c->range) | |||
: "%"REG_c, "memory" | |||
: "%"FF_REG_c, "memory" | |||
); | |||
return bit & 1; | |||
} | |||
@@ -240,7 +240,7 @@ static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) | |||
"addl %%edx, %%eax \n\t" | |||
"cmp %c5(%2), %1 \n\t" | |||
"jge 1f \n\t" | |||
"add"OPSIZE" $2, %c4(%2) \n\t" | |||
"add"FF_OPSIZE" $2, %c4(%2) \n\t" | |||
#endif | |||
"1: \n\t" | |||
"movl %%eax, %c3(%2) \n\t" | |||
@@ -281,7 +281,7 @@ static av_always_inline int get_cabac_bypass_x86(CABACContext *c) | |||
"addl %%ecx, %%eax \n\t" | |||
"cmp %c5(%2), %1 \n\t" | |||
"jge 1f \n\t" | |||
"add"OPSIZE" $2, %c4(%2) \n\t" | |||
"add"FF_OPSIZE" $2, %c4(%2) \n\t" | |||
"1: \n\t" | |||
"movl %%eax, %c3(%2) \n\t" | |||
@@ -91,13 +91,13 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, | |||
"sub %10, %1 \n\t" | |||
"mov %2, %0 \n\t" | |||
"movl %7, %%ecx \n\t" | |||
"add %1, %%"REG_c" \n\t" | |||
"add %1, %%"FF_REG_c" \n\t" | |||
"movl %%ecx, (%0) \n\t" | |||
"test $1, %4 \n\t" | |||
" jnz 5f \n\t" | |||
"add"OPSIZE" $4, %2 \n\t" | |||
"add"FF_OPSIZE" $4, %2 \n\t" | |||
"4: \n\t" | |||
"add $1, %1 \n\t" | |||
@@ -105,7 +105,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, | |||
" jb 3b \n\t" | |||
"mov %2, %0 \n\t" | |||
"movl %7, %%ecx \n\t" | |||
"add %1, %%"REG_c" \n\t" | |||
"add %1, %%"FF_REG_c" \n\t" | |||
"movl %%ecx, (%0) \n\t" | |||
"5: \n\t" | |||
"add %9, %k0 \n\t" | |||
@@ -116,7 +116,7 @@ static int decode_significance_x86(CABACContext *c, int max_coeff, | |||
"i"(offsetof(CABACContext, bytestream)), | |||
"i"(offsetof(CABACContext, bytestream_end)) | |||
TABLES_ARG | |||
: "%"REG_c, "memory" | |||
: "%"FF_REG_c, "memory" | |||
); | |||
return coeff_count; | |||
} | |||
@@ -183,7 +183,7 @@ static int decode_significance_8x8_x86(CABACContext *c, | |||
"test $1, %4 \n\t" | |||
" jnz 5f \n\t" | |||
"add"OPSIZE" $4, %2 \n\t" | |||
"add"FF_OPSIZE" $4, %2 \n\t" | |||
"4: \n\t" | |||
"add $1, %6 \n\t" | |||
@@ -202,7 +202,7 @@ static int decode_significance_8x8_x86(CABACContext *c, | |||
"i"(offsetof(CABACContext, bytestream)), | |||
"i"(offsetof(CABACContext, bytestream_end)), | |||
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG | |||
: "%"REG_c, "memory" | |||
: "%"FF_REG_c, "memory" | |||
); | |||
return coeff_count; | |||
} | |||
@@ -32,7 +32,7 @@ av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels | |||
{ | |||
MOVQ_BFE(mm6); | |||
__asm__ volatile( | |||
"lea (%3, %3), %%"REG_a" \n\t" | |||
"lea (%3, %3), %%"FF_REG_a" \n\t" | |||
".p2align 3 \n\t" | |||
"1: \n\t" | |||
"movq (%1), %%mm0 \n\t" | |||
@@ -42,8 +42,8 @@ av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels | |||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
"movq %%mm4, (%2) \n\t" | |||
"movq %%mm5, (%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"movq (%1), %%mm0 \n\t" | |||
"movq 1(%1), %%mm1 \n\t" | |||
"movq (%1, %3), %%mm2 \n\t" | |||
@@ -51,20 +51,20 @@ av_unused static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels | |||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
"movq %%mm4, (%2) \n\t" | |||
"movq %%mm5, (%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"subl $4, %0 \n\t" | |||
"jnz 1b \n\t" | |||
:"+g"(h), "+S"(pixels), "+D"(block) | |||
:"r"((x86_reg)line_size) | |||
:REG_a, "memory"); | |||
:FF_REG_a, "memory"); | |||
} | |||
av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
{ | |||
MOVQ_BFE(mm6); | |||
__asm__ volatile( | |||
"lea (%3, %3), %%"REG_a" \n\t" | |||
"lea (%3, %3), %%"FF_REG_a" \n\t" | |||
".p2align 3 \n\t" | |||
"1: \n\t" | |||
"movq (%1), %%mm0 \n\t" | |||
@@ -81,8 +81,8 @@ av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixel | |||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
"movq %%mm4, 8(%2) \n\t" | |||
"movq %%mm5, 8(%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"movq (%1), %%mm0 \n\t" | |||
"movq 1(%1), %%mm1 \n\t" | |||
"movq (%1, %3), %%mm2 \n\t" | |||
@@ -97,42 +97,42 @@ av_unused static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixel | |||
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) | |||
"movq %%mm4, 8(%2) \n\t" | |||
"movq %%mm5, 8(%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"subl $4, %0 \n\t" | |||
"jnz 1b \n\t" | |||
:"+g"(h), "+S"(pixels), "+D"(block) | |||
:"r"((x86_reg)line_size) | |||
:REG_a, "memory"); | |||
:FF_REG_a, "memory"); | |||
} | |||
av_unused static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
{ | |||
MOVQ_BFE(mm6); | |||
__asm__ volatile( | |||
"lea (%3, %3), %%"REG_a" \n\t" | |||
"lea (%3, %3), %%"FF_REG_a" \n\t" | |||
"movq (%1), %%mm0 \n\t" | |||
".p2align 3 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %3), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"),%%mm2 \n\t" | |||
"movq (%1, %%"FF_REG_a"),%%mm2\n\t" | |||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |||
"movq %%mm4, (%2) \n\t" | |||
"movq %%mm5, (%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"movq (%1, %3), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"),%%mm0 \n\t" | |||
"movq (%1, %%"FF_REG_a"),%%mm0\n\t" | |||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |||
"movq %%mm4, (%2) \n\t" | |||
"movq %%mm5, (%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"subl $4, %0 \n\t" | |||
"jnz 1b \n\t" | |||
:"+g"(h), "+S"(pixels), "+D"(block) | |||
:"r"((x86_reg)line_size) | |||
:REG_a, "memory"); | |||
:FF_REG_a, "memory"); | |||
} | |||
av_unused static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) | |||
@@ -166,12 +166,12 @@ av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels | |||
{ | |||
MOVQ_BFE(mm6); | |||
__asm__ volatile( | |||
"lea (%3, %3), %%"REG_a" \n\t" | |||
"lea (%3, %3), %%"FF_REG_a" \n\t" | |||
"movq (%1), %%mm0 \n\t" | |||
".p2align 3 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %3), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" | |||
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) | |||
"movq (%2), %%mm3 \n\t" | |||
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) | |||
@@ -179,11 +179,11 @@ av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels | |||
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) | |||
"movq %%mm0, (%2) \n\t" | |||
"movq %%mm1, (%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"movq (%1, %3), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | |||
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) | |||
"movq (%2), %%mm3 \n\t" | |||
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) | |||
@@ -191,12 +191,12 @@ av_unused static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels | |||
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) | |||
"movq %%mm2, (%2) \n\t" | |||
"movq %%mm1, (%2, %3) \n\t" | |||
"add %%"REG_a", %1 \n\t" | |||
"add %%"REG_a", %2 \n\t" | |||
"add %%"FF_REG_a", %1 \n\t" | |||
"add %%"FF_REG_a", %2 \n\t" | |||
"subl $4, %0 \n\t" | |||
"jnz 1b \n\t" | |||
:"+g"(h), "+S"(pixels), "+D"(block) | |||
:"r"((x86_reg)line_size) | |||
:REG_a, "memory"); | |||
:FF_REG_a, "memory"); | |||
} |
@@ -283,15 +283,15 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, | |||
__asm__ volatile ( | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm4 \n\t" | |||
"add %3, %%"REG_a" \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm2 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm4 \n\t" | |||
"add %3, %%"FF_REG_a" \n\t" | |||
"psubusb %%mm0, %%mm2 \n\t" | |||
"psubusb %%mm4, %%mm0 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm3 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm5 \n\t" | |||
"psubusb %%mm1, %%mm3 \n\t" | |||
"psubusb %%mm5, %%mm1 \n\t" | |||
"por %%mm2, %%mm0 \n\t" | |||
@@ -306,7 +306,7 @@ static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, | |||
"paddw %%mm3, %%mm2 \n\t" | |||
"paddw %%mm2, %%mm0 \n\t" | |||
"paddw %%mm0, %%mm6 \n\t" | |||
"add %3, %%"REG_a" \n\t" | |||
"add %3, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
: "+a" (len) | |||
: "r" (blk1 - len), "r" (blk2 - len), "r" (stride)); | |||
@@ -319,18 +319,18 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, | |||
__asm__ volatile ( | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm3 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"punpckhbw %%mm7, %%mm2 \n\t" | |||
"punpckhbw %%mm7, %%mm3 \n\t" | |||
"paddw %%mm0, %%mm1 \n\t" | |||
"paddw %%mm2, %%mm3 \n\t" | |||
"movq (%3, %%"REG_a"), %%mm4 \n\t" | |||
"movq (%3, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq (%3, %%"FF_REG_a"), %%mm2 \n\t" | |||
"paddw %%mm5, %%mm1 \n\t" | |||
"paddw %%mm5, %%mm3 \n\t" | |||
"psrlw $1, %%mm1 \n\t" | |||
@@ -344,7 +344,7 @@ static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, | |||
"punpckhbw %%mm7, %%mm1 \n\t" | |||
"paddw %%mm1, %%mm0 \n\t" | |||
"paddw %%mm0, %%mm6 \n\t" | |||
"add %4, %%"REG_a" \n\t" | |||
"add %4, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
: "+a" (len) | |||
: "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len), | |||
@@ -356,8 +356,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, | |||
{ | |||
x86_reg len = -stride * h; | |||
__asm__ volatile ( | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0\n\t" | |||
"movq 1(%1, %%"FF_REG_a"), %%mm2\n\t" | |||
"movq %%mm0, %%mm1 \n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
@@ -368,8 +368,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, | |||
"paddw %%mm3, %%mm1 \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%2, %%"REG_a"), %%mm2 \n\t" | |||
"movq 1(%2, %%"REG_a"), %%mm4 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm2\n\t" | |||
"movq 1(%2, %%"FF_REG_a"), %%mm4\n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"movq %%mm4, %%mm5 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -383,8 +383,8 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, | |||
"paddw %%mm3, %%mm1 \n\t" | |||
"paddw %%mm5, %%mm0 \n\t" | |||
"paddw %%mm5, %%mm1 \n\t" | |||
"movq (%3, %%"REG_a"), %%mm4 \n\t" | |||
"movq (%3, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%3, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq (%3, %%"FF_REG_a"), %%mm5 \n\t" | |||
"psrlw $2, %%mm0 \n\t" | |||
"psrlw $2, %%mm1 \n\t" | |||
"packuswb %%mm1, %%mm0 \n\t" | |||
@@ -398,7 +398,7 @@ static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, | |||
"paddw %%mm4, %%mm6 \n\t" | |||
"movq %%mm2, %%mm0 \n\t" | |||
"movq %%mm3, %%mm1 \n\t" | |||
"add %4, %%"REG_a" \n\t" | |||
"add %4, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
: "+a" (len) | |||
: "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len), | |||
@@ -188,13 +188,13 @@ __asm__ volatile( | |||
"movd %2, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"mov %3, %%"REG_a" \n\t" | |||
"mov %3, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | |||
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |||
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |||
"pxor %%mm2, %%mm2 \n\t" | |||
@@ -209,8 +209,8 @@ __asm__ volatile( | |||
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |||
"pxor %%mm4, %%mm4 \n\t" | |||
"pxor %%mm5, %%mm5 \n\t" // FIXME slow | |||
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"psraw $3, %%mm0 \n\t" | |||
"psraw $3, %%mm1 \n\t" | |||
"psubw %%mm7, %%mm0 \n\t" | |||
@@ -223,13 +223,13 @@ __asm__ volatile( | |||
"psubw %%mm3, %%mm1 \n\t" | |||
"pandn %%mm0, %%mm4 \n\t" | |||
"pandn %%mm1, %%mm5 \n\t" | |||
"movq %%mm4, (%0, %%"REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"REG_a") \n\t" | |||
"movq %%mm4, (%0, %%"FF_REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | |||
"add $16, %%"REG_a" \n\t" | |||
"add $16, %%"FF_REG_a" \n\t" | |||
"js 1b \n\t" | |||
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |||
: "%"REG_a, "memory" | |||
: "%"FF_REG_a, "memory" | |||
); | |||
block[0]= block0; | |||
} | |||
@@ -251,13 +251,13 @@ __asm__ volatile( | |||
"movd %2, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"mov %3, %%"REG_a" \n\t" | |||
"mov %3, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | |||
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |||
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |||
"pxor %%mm2, %%mm2 \n\t" | |||
@@ -276,8 +276,8 @@ __asm__ volatile( | |||
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |||
"pxor %%mm4, %%mm4 \n\t" | |||
"pxor %%mm5, %%mm5 \n\t" // FIXME slow | |||
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"psraw $4, %%mm0 \n\t" | |||
"psraw $4, %%mm1 \n\t" | |||
"psubw %%mm7, %%mm0 \n\t" | |||
@@ -290,13 +290,13 @@ __asm__ volatile( | |||
"psubw %%mm3, %%mm1 \n\t" | |||
"pandn %%mm0, %%mm4 \n\t" | |||
"pandn %%mm1, %%mm5 \n\t" | |||
"movq %%mm4, (%0, %%"REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"REG_a") \n\t" | |||
"movq %%mm4, (%0, %%"FF_REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | |||
"add $16, %%"REG_a" \n\t" | |||
"add $16, %%"FF_REG_a" \n\t" | |||
"js 1b \n\t" | |||
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |||
: "%"REG_a, "memory" | |||
: "%"FF_REG_a, "memory" | |||
); | |||
} | |||
@@ -326,13 +326,13 @@ __asm__ volatile( | |||
"movd %2, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"mov %3, %%"REG_a" \n\t" | |||
"mov %3, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | |||
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |||
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |||
"pxor %%mm2, %%mm2 \n\t" | |||
@@ -347,8 +347,8 @@ __asm__ volatile( | |||
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q | |||
"pxor %%mm4, %%mm4 \n\t" | |||
"pxor %%mm5, %%mm5 \n\t" // FIXME slow | |||
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"psraw $4, %%mm0 \n\t" | |||
"psraw $4, %%mm1 \n\t" | |||
"pxor %%mm2, %%mm0 \n\t" | |||
@@ -357,13 +357,13 @@ __asm__ volatile( | |||
"psubw %%mm3, %%mm1 \n\t" | |||
"pandn %%mm0, %%mm4 \n\t" | |||
"pandn %%mm1, %%mm5 \n\t" | |||
"movq %%mm4, (%0, %%"REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"REG_a") \n\t" | |||
"movq %%mm4, (%0, %%"FF_REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | |||
"add $16, %%"REG_a" \n\t" | |||
"add $16, %%"FF_REG_a" \n\t" | |||
"jng 1b \n\t" | |||
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) | |||
: "%"REG_a, "memory" | |||
: "%"FF_REG_a, "memory" | |||
); | |||
block[0]= block0; | |||
//Note, we do not do mismatch control for intra as errors cannot accumulate | |||
@@ -390,13 +390,13 @@ __asm__ volatile( | |||
"movd %2, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"packssdw %%mm6, %%mm6 \n\t" | |||
"mov %3, %%"REG_a" \n\t" | |||
"mov %3, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 8(%0, %%"FF_REG_a"), %%mm1\n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq 8(%1, %%"FF_REG_a"), %%mm5\n\t" | |||
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] | |||
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] | |||
"pxor %%mm2, %%mm2 \n\t" | |||
@@ -415,8 +415,8 @@ __asm__ volatile( | |||
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q | |||
"pxor %%mm4, %%mm4 \n\t" | |||
"pxor %%mm5, %%mm5 \n\t" // FIXME slow | |||
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw (%0, %%"FF_REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 | |||
"pcmpeqw 8(%0, %%"FF_REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 | |||
"psrlw $5, %%mm0 \n\t" | |||
"psrlw $5, %%mm1 \n\t" | |||
"pxor %%mm2, %%mm0 \n\t" | |||
@@ -427,10 +427,10 @@ __asm__ volatile( | |||
"pandn %%mm1, %%mm5 \n\t" | |||
"pxor %%mm4, %%mm7 \n\t" | |||
"pxor %%mm5, %%mm7 \n\t" | |||
"movq %%mm4, (%0, %%"REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"REG_a") \n\t" | |||
"movq %%mm4, (%0, %%"FF_REG_a") \n\t" | |||
"movq %%mm5, 8(%0, %%"FF_REG_a")\n\t" | |||
"add $16, %%"REG_a" \n\t" | |||
"add $16, %%"FF_REG_a" \n\t" | |||
"jng 1b \n\t" | |||
"movd 124(%0, %3), %%mm0 \n\t" | |||
"movq %%mm7, %%mm6 \n\t" | |||
@@ -445,7 +445,7 @@ __asm__ volatile( | |||
"movd %%mm0, 124(%0, %3) \n\t" | |||
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) | |||
: "%"REG_a, "memory" | |||
: "%"FF_REG_a, "memory" | |||
); | |||
} | |||
@@ -150,32 +150,32 @@ static int RENAME(dct_quantize)(MpegEncContext *s, | |||
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ | |||
__asm__ volatile( | |||
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 | |||
"movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1 | |||
SPREADW(MM"3") | |||
"pxor "MM"7, "MM"7 \n\t" // 0 | |||
"pxor "MM"4, "MM"4 \n\t" // 0 | |||
MOVQ" (%2), "MM"5 \n\t" // qmat[0] | |||
"pxor "MM"6, "MM"6 \n\t" | |||
"psubw (%3), "MM"6 \n\t" // -bias[0] | |||
"mov $-128, %%"REG_a" \n\t" | |||
"mov $-128, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] | |||
MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i] | |||
SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) | |||
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] | |||
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 | |||
"por "MM"0, "MM"4 \n\t" | |||
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) | |||
MOVQ" "MM"0, (%5, %%"REG_a") \n\t" | |||
MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t" | |||
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 | |||
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" | |||
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 | |||
MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t" | |||
MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0 | |||
"pandn "MM"1, "MM"0 \n\t" | |||
PMAXW(MM"0", MM"3") | |||
"add $"MMREG_WIDTH", %%"REG_a" \n\t" | |||
"add $"MMREG_WIDTH", %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
PMAX(MM"3", MM"0") | |||
"movd "MM"3, %%"REG_a" \n\t" | |||
"movd "MM"3, %%"FF_REG_a" \n\t" | |||
"movzbl %%al, %%eax \n\t" // last_non_zero_p1 | |||
: "+a" (last_non_zero_p1) | |||
: "r" (block+64), "r" (qmat), "r" (bias), | |||
@@ -185,31 +185,31 @@ static int RENAME(dct_quantize)(MpegEncContext *s, | |||
); | |||
}else{ // FMT_H263 | |||
__asm__ volatile( | |||
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 | |||
"movd %%"FF_REG_a", "MM"3 \n\t" // last_non_zero_p1 | |||
SPREADW(MM"3") | |||
"pxor "MM"7, "MM"7 \n\t" // 0 | |||
"pxor "MM"4, "MM"4 \n\t" // 0 | |||
"mov $-128, %%"REG_a" \n\t" | |||
"mov $-128, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] | |||
MOVQ" (%1, %%"FF_REG_a"), "MM"0 \n\t" // block[i] | |||
SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) | |||
MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] | |||
MOVQ" (%3, %%"FF_REG_a"), "MM"6 \n\t" // bias[0] | |||
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] | |||
MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] | |||
MOVQ" (%2, %%"FF_REG_a"), "MM"5 \n\t" // qmat[i] | |||
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 | |||
"por "MM"0, "MM"4 \n\t" | |||
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) | |||
MOVQ" "MM"0, (%5, %%"REG_a") \n\t" | |||
MOVQ" "MM"0, (%5, %%"FF_REG_a") \n\t" | |||
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 | |||
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" | |||
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 | |||
MOVQ" (%4, %%"FF_REG_a"), "MM"1 \n\t" | |||
MOVQ" "MM"7, (%1, %%"FF_REG_a") \n\t" // 0 | |||
"pandn "MM"1, "MM"0 \n\t" | |||
PMAXW(MM"0", MM"3") | |||
"add $"MMREG_WIDTH", %%"REG_a" \n\t" | |||
"add $"MMREG_WIDTH", %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
PMAX(MM"3", MM"0") | |||
"movd "MM"3, %%"REG_a" \n\t" | |||
"movd "MM"3, %%"FF_REG_a" \n\t" | |||
"movzbl %%al, %%eax \n\t" // last_non_zero_p1 | |||
: "+a" (last_non_zero_p1) | |||
: "r" (block+64), "r" (qmat+64), "r" (bias+64), | |||
@@ -46,12 +46,12 @@ av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixel | |||
"punpckhbw %%mm7, %%mm5 \n\t" | |||
"paddusw %%mm0, %%mm4 \n\t" | |||
"paddusw %%mm1, %%mm5 \n\t" | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
"add %3, %1 \n\t" | |||
".p2align 3 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" | |||
"movq %%mm0, %%mm1 \n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
@@ -67,11 +67,11 @@ av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixel | |||
"psrlw $2, %%mm4 \n\t" | |||
"psrlw $2, %%mm5 \n\t" | |||
"packuswb %%mm5, %%mm4 \n\t" | |||
"movq %%mm4, (%2, %%"REG_a") \n\t" | |||
"add %3, %%"REG_a" \n\t" | |||
"movq %%mm4, (%2, %%"FF_REG_a") \n\t" | |||
"add %3, %%"FF_REG_a" \n\t" | |||
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |||
"movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |||
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"movq %%mm4, %%mm5 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -87,14 +87,14 @@ av_unused STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixel | |||
"psrlw $2, %%mm0 \n\t" | |||
"psrlw $2, %%mm1 \n\t" | |||
"packuswb %%mm1, %%mm0 \n\t" | |||
"movq %%mm0, (%2, %%"REG_a") \n\t" | |||
"add %3, %%"REG_a" \n\t" | |||
"movq %%mm0, (%2, %%"FF_REG_a") \n\t" | |||
"add %3, %%"FF_REG_a" \n\t" | |||
"subl $2, %0 \n\t" | |||
"jnz 1b \n\t" | |||
:"+g"(h), "+S"(pixels) | |||
:"D"(block), "r"((x86_reg)line_size) | |||
:REG_a, "memory"); | |||
:FF_REG_a, "memory"); | |||
} | |||
// avg_pixels | |||
@@ -115,12 +115,12 @@ av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixel | |||
"punpckhbw %%mm7, %%mm5 \n\t" | |||
"paddusw %%mm0, %%mm4 \n\t" | |||
"paddusw %%mm1, %%mm5 \n\t" | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
"add %3, %1 \n\t" | |||
".p2align 3 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 1(%1, %%"FF_REG_a"), %%mm2 \n\t" | |||
"movq %%mm0, %%mm1 \n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
@@ -135,16 +135,16 @@ av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixel | |||
"paddusw %%mm1, %%mm5 \n\t" | |||
"psrlw $2, %%mm4 \n\t" | |||
"psrlw $2, %%mm5 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm3 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | |||
"packuswb %%mm5, %%mm4 \n\t" | |||
"pcmpeqd %%mm2, %%mm2 \n\t" | |||
"paddb %%mm2, %%mm2 \n\t" | |||
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) | |||
"movq %%mm5, (%2, %%"REG_a") \n\t" | |||
"add %3, %%"REG_a" \n\t" | |||
"movq %%mm5, (%2, %%"FF_REG_a") \n\t" | |||
"add %3, %%"FF_REG_a" \n\t" | |||
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |||
"movq 1(%1, %%"REG_a"), %%mm4 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 | |||
"movq 1(%1, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"movq %%mm4, %%mm5 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -159,17 +159,17 @@ av_unused STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixel | |||
"paddusw %%mm5, %%mm1 \n\t" | |||
"psrlw $2, %%mm0 \n\t" | |||
"psrlw $2, %%mm1 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm3 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm3 \n\t" | |||
"packuswb %%mm1, %%mm0 \n\t" | |||
"pcmpeqd %%mm2, %%mm2 \n\t" | |||
"paddb %%mm2, %%mm2 \n\t" | |||
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) | |||
"movq %%mm1, (%2, %%"REG_a") \n\t" | |||
"add %3, %%"REG_a" \n\t" | |||
"movq %%mm1, (%2, %%"FF_REG_a") \n\t" | |||
"add %3, %%"FF_REG_a" \n\t" | |||
"subl $2, %0 \n\t" | |||
"jnz 1b \n\t" | |||
:"+g"(h), "+S"(pixels) | |||
:"D"(block), "r"((x86_reg)line_size) | |||
:REG_a, "memory"); | |||
:FF_REG_a, "memory"); | |||
} |
@@ -390,10 +390,10 @@ static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int w | |||
#if HAVE_7REGS | |||
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ | |||
""op" ("r",%%"REG_d"), %%"t0" \n\t"\ | |||
""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ | |||
""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ | |||
""op" 48("r",%%"REG_d"), %%"t3" \n\t" | |||
""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\ | |||
""op" 16("r",%%"FF_REG_d"), %%"t1" \n\t"\ | |||
""op" 32("r",%%"FF_REG_d"), %%"t2" \n\t"\ | |||
""op" 48("r",%%"FF_REG_d"), %%"t3" \n\t" | |||
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ | |||
snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) | |||
@@ -408,10 +408,10 @@ static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int w | |||
"psubw %%"s3", %%"t3" \n\t" | |||
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ | |||
"movdqa %%"s0", ("w",%%"REG_d") \n\t"\ | |||
"movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ | |||
"movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ | |||
"movdqa %%"s3", 48("w",%%"REG_d") \n\t" | |||
"movdqa %%"s0", ("w",%%"FF_REG_d") \n\t"\ | |||
"movdqa %%"s1", 16("w",%%"FF_REG_d") \n\t"\ | |||
"movdqa %%"s2", 32("w",%%"FF_REG_d") \n\t"\ | |||
"movdqa %%"s3", 48("w",%%"FF_REG_d") \n\t" | |||
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ | |||
"psraw $"n", %%"t0" \n\t"\ | |||
@@ -477,14 +477,14 @@ static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELE | |||
"psrlw $13, %%xmm5 \n\t" | |||
"paddw %%xmm7, %%xmm5 \n\t" | |||
snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") | |||
"movq (%2,%%"REG_d"), %%xmm1 \n\t" | |||
"movq 8(%2,%%"REG_d"), %%xmm3 \n\t" | |||
"movq (%2,%%"FF_REG_d"), %%xmm1 \n\t" | |||
"movq 8(%2,%%"FF_REG_d"), %%xmm3 \n\t" | |||
"paddw %%xmm7, %%xmm1 \n\t" | |||
"paddw %%xmm7, %%xmm3 \n\t" | |||
"pavgw %%xmm1, %%xmm0 \n\t" | |||
"pavgw %%xmm3, %%xmm2 \n\t" | |||
"movq 16(%2,%%"REG_d"), %%xmm1 \n\t" | |||
"movq 24(%2,%%"REG_d"), %%xmm3 \n\t" | |||
"movq 16(%2,%%"FF_REG_d"), %%xmm1 \n\t" | |||
"movq 24(%2,%%"FF_REG_d"), %%xmm3 \n\t" | |||
"paddw %%xmm7, %%xmm1 \n\t" | |||
"paddw %%xmm7, %%xmm3 \n\t" | |||
"pavgw %%xmm1, %%xmm4 \n\t" | |||
@@ -504,17 +504,17 @@ static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELE | |||
snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") | |||
"2: \n\t" | |||
"sub $64, %%"REG_d" \n\t" | |||
"sub $64, %%"FF_REG_d" \n\t" | |||
"jge 1b \n\t" | |||
:"+d"(i) | |||
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); | |||
} | |||
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ | |||
""op" ("r",%%"REG_d"), %%"t0" \n\t"\ | |||
""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ | |||
""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ | |||
""op" 24("r",%%"REG_d"), %%"t3" \n\t" | |||
""op" ("r",%%"FF_REG_d"), %%"t0" \n\t"\ | |||
""op" 8("r",%%"FF_REG_d"), %%"t1" \n\t"\ | |||
""op" 16("r",%%"FF_REG_d"), %%"t2" \n\t"\ | |||
""op" 24("r",%%"FF_REG_d"), %%"t3" \n\t" | |||
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ | |||
snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) | |||
@@ -523,10 +523,10 @@ static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELE | |||
snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) | |||
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ | |||
"movq %%"s0", ("w",%%"REG_d") \n\t"\ | |||
"movq %%"s1", 8("w",%%"REG_d") \n\t"\ | |||
"movq %%"s2", 16("w",%%"REG_d") \n\t"\ | |||
"movq %%"s3", 24("w",%%"REG_d") \n\t" | |||
"movq %%"s0", ("w",%%"FF_REG_d") \n\t"\ | |||
"movq %%"s1", 8("w",%%"FF_REG_d") \n\t"\ | |||
"movq %%"s2", 16("w",%%"FF_REG_d") \n\t"\ | |||
"movq %%"s3", 24("w",%%"FF_REG_d") \n\t" | |||
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ | |||
"movq %%"s0", %%"t0" \n\t"\ | |||
@@ -571,14 +571,14 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM | |||
"psrlw $13, %%mm5 \n\t" | |||
"paddw %%mm7, %%mm5 \n\t" | |||
snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") | |||
"movq (%2,%%"REG_d"), %%mm1 \n\t" | |||
"movq 8(%2,%%"REG_d"), %%mm3 \n\t" | |||
"movq (%2,%%"FF_REG_d"), %%mm1 \n\t" | |||
"movq 8(%2,%%"FF_REG_d"), %%mm3 \n\t" | |||
"paddw %%mm7, %%mm1 \n\t" | |||
"paddw %%mm7, %%mm3 \n\t" | |||
"pavgw %%mm1, %%mm0 \n\t" | |||
"pavgw %%mm3, %%mm2 \n\t" | |||
"movq 16(%2,%%"REG_d"), %%mm1 \n\t" | |||
"movq 24(%2,%%"REG_d"), %%mm3 \n\t" | |||
"movq 16(%2,%%"FF_REG_d"), %%mm1 \n\t" | |||
"movq 24(%2,%%"FF_REG_d"), %%mm3 \n\t" | |||
"paddw %%mm7, %%mm1 \n\t" | |||
"paddw %%mm7, %%mm3 \n\t" | |||
"pavgw %%mm1, %%mm4 \n\t" | |||
@@ -598,7 +598,7 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM | |||
snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") | |||
"2: \n\t" | |||
"sub $32, %%"REG_d" \n\t" | |||
"sub $32, %%"FF_REG_d" \n\t" | |||
"jge 1b \n\t" | |||
:"+d"(i) | |||
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); | |||
@@ -610,39 +610,39 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM | |||
IDWTELEM * * dst_array = sb->line + src_y;\ | |||
x86_reg tmp;\ | |||
__asm__ volatile(\ | |||
"mov %7, %%"REG_c" \n\t"\ | |||
"mov %7, %%"FF_REG_c" \n\t"\ | |||
"mov %6, %2 \n\t"\ | |||
"mov %4, %%"REG_S" \n\t"\ | |||
"mov %4, %%"FF_REG_S" \n\t"\ | |||
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ | |||
"pcmpeqd %%xmm3, %%xmm3 \n\t"\ | |||
"psllw $15, %%xmm3 \n\t"\ | |||
"psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ | |||
"1: \n\t"\ | |||
"mov %1, %%"REG_D" \n\t"\ | |||
"mov (%%"REG_D"), %%"REG_D" \n\t"\ | |||
"add %3, %%"REG_D" \n\t" | |||
"mov %1, %%"FF_REG_D" \n\t"\ | |||
"mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\ | |||
"add %3, %%"FF_REG_D" \n\t" | |||
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ | |||
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ | |||
"movq (%%"REG_d"), %%"out_reg1" \n\t"\ | |||
"movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ | |||
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\ | |||
"movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\ | |||
"movq (%%"FF_REG_d", %%"FF_REG_c"), %%"out_reg2" \n\t"\ | |||
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\ | |||
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\ | |||
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ | |||
"movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ | |||
"movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\ | |||
"movq "s_offset"+16(%%"FF_REG_S"), %%xmm4 \n\t"\ | |||
"punpcklbw %%xmm7, %%xmm0 \n\t"\ | |||
"punpcklbw %%xmm7, %%xmm4 \n\t"\ | |||
"pmullw %%xmm0, %%"out_reg1" \n\t"\ | |||
"pmullw %%xmm4, %%"out_reg2" \n\t" | |||
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ | |||
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ | |||
"movq (%%"REG_d"), %%"out_reg1" \n\t"\ | |||
"movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ | |||
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\ | |||
"movq (%%"FF_REG_d"), %%"out_reg1" \n\t"\ | |||
"movq 8(%%"FF_REG_d"), %%"out_reg2" \n\t"\ | |||
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\ | |||
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\ | |||
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ | |||
"movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ | |||
"movq "s_offset"(%%"FF_REG_S"), %%xmm0 \n\t"\ | |||
"movq "s_offset"+8(%%"FF_REG_S"), %%xmm4 \n\t"\ | |||
"punpcklbw %%xmm7, %%xmm0 \n\t"\ | |||
"punpcklbw %%xmm7, %%xmm4 \n\t"\ | |||
"pmullw %%xmm0, %%"out_reg1" \n\t"\ | |||
@@ -659,12 +659,12 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM | |||
"paddusw %%xmm6, %%xmm5 \n\t" | |||
#define snow_inner_add_yblock_sse2_end_common1\ | |||
"add $32, %%"REG_S" \n\t"\ | |||
"add %%"REG_c", %0 \n\t"\ | |||
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ | |||
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ | |||
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ | |||
"add %%"REG_c", (%%"REG_a") \n\t" | |||
"add $32, %%"FF_REG_S" \n\t"\ | |||
"add %%"FF_REG_c", %0 \n\t"\ | |||
"add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\ | |||
"add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\ | |||
"add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\ | |||
"add %%"FF_REG_c", (%%"FF_REG_a") \n\t" | |||
#define snow_inner_add_yblock_sse2_end_common2\ | |||
"jnz 1b \n\t"\ | |||
@@ -672,18 +672,18 @@ static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM | |||
:\ | |||
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ | |||
XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7", )\ | |||
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); | |||
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d""); | |||
#define snow_inner_add_yblock_sse2_end_8\ | |||
"sal $1, %%"REG_c" \n\t"\ | |||
"add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\ | |||
"sal $1, %%"FF_REG_c" \n\t"\ | |||
"add"FF_OPSIZE" $"FF_PTR_SIZE"*2, %1 \n\t"\ | |||
snow_inner_add_yblock_sse2_end_common1\ | |||
"sar $1, %%"REG_c" \n\t"\ | |||
"sar $1, %%"FF_REG_c" \n\t"\ | |||
"sub $2, %2 \n\t"\ | |||
snow_inner_add_yblock_sse2_end_common2 | |||
#define snow_inner_add_yblock_sse2_end_16\ | |||
"add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\ | |||
"add"FF_OPSIZE" $"FF_PTR_SIZE"*1, %1 \n\t"\ | |||
snow_inner_add_yblock_sse2_end_common1\ | |||
"dec %2 \n\t"\ | |||
snow_inner_add_yblock_sse2_end_common2 | |||
@@ -696,28 +696,28 @@ snow_inner_add_yblock_sse2_accum_8("2", "8") | |||
snow_inner_add_yblock_sse2_accum_8("1", "128") | |||
snow_inner_add_yblock_sse2_accum_8("0", "136") | |||
"mov %0, %%"REG_d" \n\t" | |||
"movdqa (%%"REG_D"), %%xmm0 \n\t" | |||
"mov %0, %%"FF_REG_d" \n\t" | |||
"movdqa (%%"FF_REG_D"), %%xmm0 \n\t" | |||
"movdqa %%xmm1, %%xmm2 \n\t" | |||
"punpckhwd %%xmm7, %%xmm1 \n\t" | |||
"punpcklwd %%xmm7, %%xmm2 \n\t" | |||
"paddd %%xmm2, %%xmm0 \n\t" | |||
"movdqa 16(%%"REG_D"), %%xmm2 \n\t" | |||
"movdqa 16(%%"FF_REG_D"), %%xmm2\n\t" | |||
"paddd %%xmm1, %%xmm2 \n\t" | |||
"paddd %%xmm3, %%xmm0 \n\t" | |||
"paddd %%xmm3, %%xmm2 \n\t" | |||
"mov %1, %%"REG_D" \n\t" | |||
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" | |||
"add %3, %%"REG_D" \n\t" | |||
"mov %1, %%"FF_REG_D" \n\t" | |||
"mov "FF_PTR_SIZE"(%%"FF_REG_D"), %%"FF_REG_D"; \n\t" | |||
"add %3, %%"FF_REG_D" \n\t" | |||
"movdqa (%%"REG_D"), %%xmm4 \n\t" | |||
"movdqa (%%"FF_REG_D"), %%xmm4 \n\t" | |||
"movdqa %%xmm5, %%xmm6 \n\t" | |||
"punpckhwd %%xmm7, %%xmm5 \n\t" | |||
"punpcklwd %%xmm7, %%xmm6 \n\t" | |||
"paddd %%xmm6, %%xmm4 \n\t" | |||
"movdqa 16(%%"REG_D"), %%xmm6 \n\t" | |||
"movdqa 16(%%"FF_REG_D"), %%xmm6\n\t" | |||
"paddd %%xmm5, %%xmm6 \n\t" | |||
"paddd %%xmm3, %%xmm4 \n\t" | |||
"paddd %%xmm3, %%xmm6 \n\t" | |||
@@ -726,13 +726,13 @@ snow_inner_add_yblock_sse2_accum_8("0", "136") | |||
"psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ | |||
"packssdw %%xmm2, %%xmm0 \n\t" | |||
"packuswb %%xmm7, %%xmm0 \n\t" | |||
"movq %%xmm0, (%%"REG_d") \n\t" | |||
"movq %%xmm0, (%%"FF_REG_d") \n\t" | |||
"psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ | |||
"psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ | |||
"packssdw %%xmm6, %%xmm4 \n\t" | |||
"packuswb %%xmm7, %%xmm4 \n\t" | |||
"movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" | |||
"movq %%xmm4, (%%"FF_REG_d",%%"FF_REG_c"); \n\t" | |||
snow_inner_add_yblock_sse2_end_8 | |||
} | |||
@@ -744,18 +744,18 @@ snow_inner_add_yblock_sse2_accum_16("2", "16") | |||
snow_inner_add_yblock_sse2_accum_16("1", "512") | |||
snow_inner_add_yblock_sse2_accum_16("0", "528") | |||
"mov %0, %%"REG_d" \n\t" | |||
"mov %0, %%"FF_REG_d" \n\t" | |||
"psrlw $4, %%xmm1 \n\t" | |||
"psrlw $4, %%xmm5 \n\t" | |||
"paddw (%%"REG_D"), %%xmm1 \n\t" | |||
"paddw 16(%%"REG_D"), %%xmm5 \n\t" | |||
"paddw (%%"FF_REG_D"), %%xmm1 \n\t" | |||
"paddw 16(%%"FF_REG_D"), %%xmm5 \n\t" | |||
"paddw %%xmm3, %%xmm1 \n\t" | |||
"paddw %%xmm3, %%xmm5 \n\t" | |||
"psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ | |||
"psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ | |||
"packuswb %%xmm5, %%xmm1 \n\t" | |||
"movdqu %%xmm1, (%%"REG_d") \n\t" | |||
"movdqu %%xmm1, (%%"FF_REG_d") \n\t" | |||
snow_inner_add_yblock_sse2_end_16 | |||
} | |||
@@ -764,30 +764,30 @@ snow_inner_add_yblock_sse2_end_16 | |||
IDWTELEM * * dst_array = sb->line + src_y;\ | |||
x86_reg tmp;\ | |||
__asm__ volatile(\ | |||
"mov %7, %%"REG_c" \n\t"\ | |||
"mov %7, %%"FF_REG_c" \n\t"\ | |||
"mov %6, %2 \n\t"\ | |||
"mov %4, %%"REG_S" \n\t"\ | |||
"mov %4, %%"FF_REG_S" \n\t"\ | |||
"pxor %%mm7, %%mm7 \n\t" /* 0 */\ | |||
"pcmpeqd %%mm3, %%mm3 \n\t"\ | |||
"psllw $15, %%mm3 \n\t"\ | |||
"psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ | |||
"1: \n\t"\ | |||
"mov %1, %%"REG_D" \n\t"\ | |||
"mov (%%"REG_D"), %%"REG_D" \n\t"\ | |||
"add %3, %%"REG_D" \n\t" | |||
"mov %1, %%"FF_REG_D" \n\t"\ | |||
"mov (%%"FF_REG_D"), %%"FF_REG_D" \n\t"\ | |||
"add %3, %%"FF_REG_D" \n\t" | |||
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ | |||
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ | |||
"movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ | |||
"movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ | |||
"mov "FF_PTR_SIZE"*"ptr_offset"(%%"FF_REG_a"), %%"FF_REG_d"; \n\t"\ | |||
"movd "d_offset"(%%"FF_REG_d"), %%"out_reg1" \n\t"\ | |||
"movd "d_offset"+4(%%"FF_REG_d"), %%"out_reg2" \n\t"\ | |||
"punpcklbw %%mm7, %%"out_reg1" \n\t"\ | |||
"punpcklbw %%mm7, %%"out_reg2" \n\t"\ | |||
"movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ | |||
"movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ | |||
"movd "s_offset"(%%"FF_REG_S"), %%mm0 \n\t"\ | |||
"movd "s_offset"+4(%%"FF_REG_S"), %%mm4 \n\t"\ | |||
"punpcklbw %%mm7, %%mm0 \n\t"\ | |||
"punpcklbw %%mm7, %%mm4 \n\t"\ | |||
"pmullw %%mm0, %%"out_reg1" \n\t"\ | |||
"pmullw %%mm4, %%"out_reg2" \n\t" | |||
"pmullw %%mm0, %%"out_reg1" \n\t"\ | |||
"pmullw %%mm4, %%"out_reg2" \n\t" | |||
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ | |||
snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ | |||
@@ -795,32 +795,32 @@ snow_inner_add_yblock_sse2_end_16 | |||
"paddusw %%mm6, %%mm5 \n\t" | |||
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ | |||
"mov %0, %%"REG_d" \n\t"\ | |||
"mov %0, %%"FF_REG_d" \n\t"\ | |||
"psrlw $4, %%mm1 \n\t"\ | |||
"psrlw $4, %%mm5 \n\t"\ | |||
"paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ | |||
"paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ | |||
"paddw "read_offset"(%%"FF_REG_D"), %%mm1 \n\t"\ | |||
"paddw "read_offset"+8(%%"FF_REG_D"), %%mm5 \n\t"\ | |||
"paddw %%mm3, %%mm1 \n\t"\ | |||
"paddw %%mm3, %%mm5 \n\t"\ | |||
"psraw $4, %%mm1 \n\t"\ | |||
"psraw $4, %%mm5 \n\t"\ | |||
"packuswb %%mm5, %%mm1 \n\t"\ | |||
"movq %%mm1, "write_offset"(%%"REG_d") \n\t" | |||
"movq %%mm1, "write_offset"(%%"FF_REG_d") \n\t" | |||
#define snow_inner_add_yblock_mmx_end(s_step)\ | |||
"add $"s_step", %%"REG_S" \n\t"\ | |||
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ | |||
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ | |||
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ | |||
"add %%"REG_c", (%%"REG_a") \n\t"\ | |||
"add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ | |||
"add %%"REG_c", %0 \n\t"\ | |||
"add $"s_step", %%"FF_REG_S" \n\t"\ | |||
"add %%"FF_REG_c", "FF_PTR_SIZE"*3(%%"FF_REG_a"); \n\t"\ | |||
"add %%"FF_REG_c", "FF_PTR_SIZE"*2(%%"FF_REG_a"); \n\t"\ | |||
"add %%"FF_REG_c", "FF_PTR_SIZE"*1(%%"FF_REG_a"); \n\t"\ | |||
"add %%"FF_REG_c", (%%"FF_REG_a") \n\t"\ | |||
"add"FF_OPSIZE " $"FF_PTR_SIZE"*1, %1 \n\t"\ | |||
"add %%"FF_REG_c", %0 \n\t"\ | |||
"dec %2 \n\t"\ | |||
"jnz 1b \n\t"\ | |||
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ | |||
:\ | |||
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ | |||
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); | |||
"%"FF_REG_c"","%"FF_REG_S"","%"FF_REG_D"","%"FF_REG_d""); | |||
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, | |||
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ | |||
@@ -84,7 +84,7 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ | |||
{\ | |||
rnd = 8-rnd;\ | |||
__asm__ volatile(\ | |||
"mov $8, %%"REG_c" \n\t"\ | |||
"mov $8, %%"FF_REG_c" \n\t"\ | |||
LOAD_ROUNDER_MMX("%5")\ | |||
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ | |||
"1: \n\t"\ | |||
@@ -119,13 +119,13 @@ static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ | |||
"movq %%mm3, (%1) \n\t"\ | |||
"add %6, %0 \n\t"\ | |||
"add %4, %1 \n\t"\ | |||
"dec %%"REG_c" \n\t"\ | |||
"dec %%"FF_REG_c" \n\t"\ | |||
"jnz 1b \n\t"\ | |||
: "+r"(src), "+r"(dst)\ | |||
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ | |||
"g"(stride-offset)\ | |||
NAMED_CONSTRAINTS_ADD(ff_pw_9)\ | |||
: "%"REG_c, "memory"\ | |||
: "%"FF_REG_c, "memory"\ | |||
);\ | |||
} | |||
@@ -32,22 +32,22 @@ static void line_noise_mmx(uint8_t *dst, const uint8_t *src, | |||
noise += shift; | |||
__asm__ volatile( | |||
"mov %3, %%"REG_a" \n\t" | |||
"mov %3, %%"FF_REG_a" \n\t" | |||
"pcmpeqb %%mm7, %%mm7 \n\t" | |||
"psllw $15, %%mm7 \n\t" | |||
"packsswb %%mm7, %%mm7 \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | |||
"pxor %%mm7, %%mm0 \n\t" | |||
"paddsb %%mm1, %%mm0 \n\t" | |||
"pxor %%mm7, %%mm0 \n\t" | |||
"movq %%mm0, (%2, %%"REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"movq %%mm0, (%2, %%"FF_REG_a") \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
:: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len) | |||
: "%"REG_a | |||
: "%"FF_REG_a | |||
); | |||
if (mmx_len != len) | |||
ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0); | |||
@@ -60,13 +60,13 @@ static void line_noise_avg_mmx(uint8_t *dst, const uint8_t *src, | |||
x86_reg mmx_len = len & (~7); | |||
__asm__ volatile( | |||
"mov %5, %%"REG_a" \n\t" | |||
"mov %5, %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%1, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"paddb (%2, %%"REG_a"), %%mm1 \n\t" | |||
"paddb (%3, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"paddb (%2, %%"FF_REG_a"), %%mm1\n\t" | |||
"paddb (%3, %%"FF_REG_a"), %%mm1\n\t" | |||
"movq %%mm0, %%mm2 \n\t" | |||
"movq %%mm1, %%mm3 \n\t" | |||
"punpcklbw %%mm0, %%mm0 \n\t" | |||
@@ -82,12 +82,12 @@ static void line_noise_avg_mmx(uint8_t *dst, const uint8_t *src, | |||
"psrlw $8, %%mm1 \n\t" | |||
"psrlw $8, %%mm3 \n\t" | |||
"packuswb %%mm3, %%mm1 \n\t" | |||
"movq %%mm1, (%4, %%"REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"movq %%mm1, (%4, %%"FF_REG_a") \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
:: "r" (src+mmx_len), "r" (shift[0]+mmx_len), "r" (shift[1]+mmx_len), "r" (shift[2]+mmx_len), | |||
"r" (dst+mmx_len), "g" (-mmx_len) | |||
: "%"REG_a | |||
: "%"FF_REG_a | |||
); | |||
if (mmx_len != len){ | |||
@@ -104,22 +104,22 @@ static void line_noise_mmxext(uint8_t *dst, const uint8_t *src, | |||
noise += shift; | |||
__asm__ volatile( | |||
"mov %3, %%"REG_a" \n\t" | |||
"mov %3, %%"FF_REG_a" \n\t" | |||
"pcmpeqb %%mm7, %%mm7 \n\t" | |||
"psllw $15, %%mm7 \n\t" | |||
"packsswb %%mm7, %%mm7 \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm1 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | |||
"pxor %%mm7, %%mm0 \n\t" | |||
"paddsb %%mm1, %%mm0 \n\t" | |||
"pxor %%mm7, %%mm0 \n\t" | |||
"movntq %%mm0, (%2, %%"REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"movntq %%mm0, (%2, %%"FF_REG_a") \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
:: "r" (src+mmx_len), "r" (noise+mmx_len), "r" (dst+mmx_len), "g" (-mmx_len) | |||
: "%"REG_a | |||
: "%"FF_REG_a | |||
); | |||
if (mmx_len != len) | |||
ff_line_noise_c(dst+mmx_len, src+mmx_len, noise+mmx_len, len-mmx_len, 0); | |||
@@ -28,46 +28,46 @@ typedef struct xmm_reg { uint64_t a, b; } xmm_reg; | |||
typedef struct ymm_reg { uint64_t a, b, c, d; } ymm_reg; | |||
#if ARCH_X86_64 | |||
# define OPSIZE "q" | |||
# define REG_a "rax" | |||
# define REG_b "rbx" | |||
# define REG_c "rcx" | |||
# define REG_d "rdx" | |||
# define REG_D "rdi" | |||
# define REG_S "rsi" | |||
# define PTR_SIZE "8" | |||
# define FF_OPSIZE "q" | |||
# define FF_REG_a "rax" | |||
# define FF_REG_b "rbx" | |||
# define FF_REG_c "rcx" | |||
# define FF_REG_d "rdx" | |||
# define FF_REG_D "rdi" | |||
# define FF_REG_S "rsi" | |||
# define FF_PTR_SIZE "8" | |||
typedef int64_t x86_reg; | |||
/* REG_SP is defined in Solaris sys headers, so use REG_sp */ | |||
# define REG_sp "rsp" | |||
# define REG_BP "rbp" | |||
# define REGBP rbp | |||
# define REGa rax | |||
# define REGb rbx | |||
# define REGc rcx | |||
# define REGd rdx | |||
# define REGSP rsp | |||
/* REG_SP is defined in Solaris sys headers, so use FF_REG_sp */ | |||
# define FF_REG_sp "rsp" | |||
# define FF_REG_BP "rbp" | |||
# define FF_REGBP rbp | |||
# define FF_REGa rax | |||
# define FF_REGb rbx | |||
# define FF_REGc rcx | |||
# define FF_REGd rdx | |||
# define FF_REGSP rsp | |||
#elif ARCH_X86_32 | |||
# define OPSIZE "l" | |||
# define REG_a "eax" | |||
# define REG_b "ebx" | |||
# define REG_c "ecx" | |||
# define REG_d "edx" | |||
# define REG_D "edi" | |||
# define REG_S "esi" | |||
# define PTR_SIZE "4" | |||
# define FF_OPSIZE "l" | |||
# define FF_REG_a "eax" | |||
# define FF_REG_b "ebx" | |||
# define FF_REG_c "ecx" | |||
# define FF_REG_d "edx" | |||
# define FF_REG_D "edi" | |||
# define FF_REG_S "esi" | |||
# define FF_PTR_SIZE "4" | |||
typedef int32_t x86_reg; | |||
# define REG_sp "esp" | |||
# define REG_BP "ebp" | |||
# define REGBP ebp | |||
# define REGa eax | |||
# define REGb ebx | |||
# define REGc ecx | |||
# define REGd edx | |||
# define REGSP esp | |||
# define FF_REG_sp "esp" | |||
# define FF_REG_BP "ebp" | |||
# define FF_REGBP ebp | |||
# define FF_REGa eax | |||
# define FF_REGb ebx | |||
# define FF_REGc ecx | |||
# define FF_REGd edx | |||
# define FF_REGSP esp | |||
#else | |||
typedef int x86_reg; | |||
#endif | |||
@@ -41,9 +41,9 @@ | |||
/* ebx saving is necessary for PIC. gcc seems unable to see it alone */ | |||
#define cpuid(index, eax, ebx, ecx, edx) \ | |||
__asm__ volatile ( \ | |||
"mov %%"REG_b", %%"REG_S" \n\t" \ | |||
"mov %%"FF_REG_b", %%"FF_REG_S" \n\t" \ | |||
"cpuid \n\t" \ | |||
"xchg %%"REG_b", %%"REG_S \ | |||
"xchg %%"FF_REG_b", %%"FF_REG_S \ | |||
: "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) \ | |||
: "0" (index), "2"(0)) | |||
@@ -55,9 +55,9 @@ av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, | |||
"jmp 9f \n\t" | |||
// Begin | |||
"0: \n\t" | |||
"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" | |||
"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" | |||
"movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t" | |||
"movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t" | |||
"movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t" | |||
"movd 1(%%"FF_REG_c", %%"FF_REG_S"), %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
"pshufw $0xFF, %%mm1, %%mm1 \n\t" | |||
@@ -65,14 +65,14 @@ av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, | |||
"pshufw $0xFF, %%mm0, %%mm0 \n\t" | |||
"2: \n\t" | |||
"psubw %%mm1, %%mm0 \n\t" | |||
"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" | |||
"movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t" | |||
"pmullw %%mm3, %%mm0 \n\t" | |||
"psllw $7, %%mm1 \n\t" | |||
"paddw %%mm1, %%mm0 \n\t" | |||
"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" | |||
"movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
// End | |||
"9: \n\t" | |||
"lea " LOCAL_MANGLE(0b) ", %0 \n\t" | |||
@@ -94,22 +94,22 @@ av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode, | |||
"jmp 9f \n\t" | |||
// Begin | |||
"0: \n\t" | |||
"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t" | |||
"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t" | |||
"movq (%%"FF_REG_d", %%"FF_REG_a"), %%mm3 \n\t" | |||
"movd (%%"FF_REG_c", %%"FF_REG_S"), %%mm0 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
"pshufw $0xFF, %%mm0, %%mm1 \n\t" | |||
"1: \n\t" | |||
"pshufw $0xFF, %%mm0, %%mm0 \n\t" | |||
"2: \n\t" | |||
"psubw %%mm1, %%mm0 \n\t" | |||
"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t" | |||
"movl 8(%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t" | |||
"pmullw %%mm3, %%mm0 \n\t" | |||
"psllw $7, %%mm1 \n\t" | |||
"paddw %%mm1, %%mm0 \n\t" | |||
"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t" | |||
"movq %%mm0, (%%"FF_REG_D", %%"FF_REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
// End | |||
"9: \n\t" | |||
"lea " LOCAL_MANGLE(0b) ", %0 \n\t" | |||
@@ -206,39 +206,39 @@ void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, | |||
__asm__ volatile( | |||
#if ARCH_X86_64 | |||
"mov -8(%%rsp), %%"REG_a" \n\t" | |||
"mov %%"REG_a", %5 \n\t" // retsave | |||
"mov -8(%%rsp), %%"FF_REG_a" \n\t" | |||
"mov %%"FF_REG_a", %5 \n\t" // retsave | |||
#else | |||
#if defined(PIC) | |||
"mov %%"REG_b", %5 \n\t" // ebxsave | |||
"mov %%"FF_REG_b", %5 \n\t" // ebxsave | |||
#endif | |||
#endif | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"mov %0, %%"REG_c" \n\t" | |||
"mov %1, %%"REG_D" \n\t" | |||
"mov %2, %%"REG_d" \n\t" | |||
"mov %3, %%"REG_b" \n\t" | |||
"xor %%"REG_a", %%"REG_a" \n\t" // i | |||
PREFETCH" (%%"REG_c") \n\t" | |||
PREFETCH" 32(%%"REG_c") \n\t" | |||
PREFETCH" 64(%%"REG_c") \n\t" | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"mov %0, %%"FF_REG_c" \n\t" | |||
"mov %1, %%"FF_REG_D" \n\t" | |||
"mov %2, %%"FF_REG_d" \n\t" | |||
"mov %3, %%"FF_REG_b" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i | |||
PREFETCH" (%%"FF_REG_c") \n\t" | |||
PREFETCH" 32(%%"FF_REG_c") \n\t" | |||
PREFETCH" 64(%%"FF_REG_c") \n\t" | |||
#if ARCH_X86_64 | |||
#define CALL_MMXEXT_FILTER_CODE \ | |||
"movl (%%"REG_b"), %%esi \n\t"\ | |||
"call *%4 \n\t"\ | |||
"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ | |||
"add %%"REG_S", %%"REG_c" \n\t"\ | |||
"add %%"REG_a", %%"REG_D" \n\t"\ | |||
"xor %%"REG_a", %%"REG_a" \n\t"\ | |||
"movl (%%"FF_REG_b"), %%esi \n\t"\ | |||
"call *%4 \n\t"\ | |||
"movl (%%"FF_REG_b", %%"FF_REG_a"), %%esi \n\t"\ | |||
"add %%"FF_REG_S", %%"FF_REG_c" \n\t"\ | |||
"add %%"FF_REG_a", %%"FF_REG_D" \n\t"\ | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ | |||
#else | |||
#define CALL_MMXEXT_FILTER_CODE \ | |||
"movl (%%"REG_b"), %%esi \n\t"\ | |||
"call *%4 \n\t"\ | |||
"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ | |||
"add %%"REG_a", %%"REG_D" \n\t"\ | |||
"xor %%"REG_a", %%"REG_a" \n\t"\ | |||
"movl (%%"FF_REG_b"), %%esi \n\t"\ | |||
"call *%4 \n\t"\ | |||
"addl (%%"FF_REG_b", %%"FF_REG_a"), %%"FF_REG_c" \n\t"\ | |||
"add %%"FF_REG_a", %%"FF_REG_D" \n\t"\ | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ | |||
#endif /* ARCH_X86_64 */ | |||
@@ -252,11 +252,11 @@ void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, | |||
CALL_MMXEXT_FILTER_CODE | |||
#if ARCH_X86_64 | |||
"mov %5, %%"REG_a" \n\t" | |||
"mov %%"REG_a", -8(%%rsp) \n\t" | |||
"mov %5, %%"FF_REG_a" \n\t" | |||
"mov %%"FF_REG_a", -8(%%rsp) \n\t" | |||
#else | |||
#if defined(PIC) | |||
"mov %5, %%"REG_b" \n\t" | |||
"mov %5, %%"FF_REG_b" \n\t" | |||
#endif | |||
#endif | |||
:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), | |||
@@ -268,9 +268,9 @@ void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst, | |||
,"m" (ebxsave) | |||
#endif | |||
#endif | |||
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |||
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D | |||
#if ARCH_X86_64 || !defined(PIC) | |||
,"%"REG_b | |||
,"%"FF_REG_b | |||
#endif | |||
); | |||
@@ -295,33 +295,33 @@ void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, | |||
#endif | |||
__asm__ volatile( | |||
#if ARCH_X86_64 | |||
"mov -8(%%rsp), %%"REG_a" \n\t" | |||
"mov %%"REG_a", %7 \n\t" // retsave | |||
"mov -8(%%rsp), %%"FF_REG_a" \n\t" | |||
"mov %%"FF_REG_a", %7 \n\t" // retsave | |||
#else | |||
#if defined(PIC) | |||
"mov %%"REG_b", %7 \n\t" // ebxsave | |||
"mov %%"FF_REG_b", %7 \n\t" // ebxsave | |||
#endif | |||
#endif | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"mov %0, %%"REG_c" \n\t" | |||
"mov %1, %%"REG_D" \n\t" | |||
"mov %2, %%"REG_d" \n\t" | |||
"mov %3, %%"REG_b" \n\t" | |||
"xor %%"REG_a", %%"REG_a" \n\t" // i | |||
PREFETCH" (%%"REG_c") \n\t" | |||
PREFETCH" 32(%%"REG_c") \n\t" | |||
PREFETCH" 64(%%"REG_c") \n\t" | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"mov %0, %%"FF_REG_c" \n\t" | |||
"mov %1, %%"FF_REG_D" \n\t" | |||
"mov %2, %%"FF_REG_d" \n\t" | |||
"mov %3, %%"FF_REG_b" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i | |||
PREFETCH" (%%"FF_REG_c") \n\t" | |||
PREFETCH" 32(%%"FF_REG_c") \n\t" | |||
PREFETCH" 64(%%"FF_REG_c") \n\t" | |||
CALL_MMXEXT_FILTER_CODE | |||
CALL_MMXEXT_FILTER_CODE | |||
CALL_MMXEXT_FILTER_CODE | |||
CALL_MMXEXT_FILTER_CODE | |||
"xor %%"REG_a", %%"REG_a" \n\t" // i | |||
"mov %5, %%"REG_c" \n\t" // src2 | |||
"mov %6, %%"REG_D" \n\t" // dst2 | |||
PREFETCH" (%%"REG_c") \n\t" | |||
PREFETCH" 32(%%"REG_c") \n\t" | |||
PREFETCH" 64(%%"REG_c") \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" // i | |||
"mov %5, %%"FF_REG_c" \n\t" // src2 | |||
"mov %6, %%"FF_REG_D" \n\t" // dst2 | |||
PREFETCH" (%%"FF_REG_c") \n\t" | |||
PREFETCH" 32(%%"FF_REG_c") \n\t" | |||
PREFETCH" 64(%%"FF_REG_c") \n\t" | |||
CALL_MMXEXT_FILTER_CODE | |||
CALL_MMXEXT_FILTER_CODE | |||
@@ -329,11 +329,11 @@ void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, | |||
CALL_MMXEXT_FILTER_CODE | |||
#if ARCH_X86_64 | |||
"mov %7, %%"REG_a" \n\t" | |||
"mov %%"REG_a", -8(%%rsp) \n\t" | |||
"mov %7, %%"FF_REG_a" \n\t" | |||
"mov %%"FF_REG_a", -8(%%rsp) \n\t" | |||
#else | |||
#if defined(PIC) | |||
"mov %7, %%"REG_b" \n\t" | |||
"mov %7, %%"FF_REG_b" \n\t" | |||
#endif | |||
#endif | |||
:: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos), | |||
@@ -345,9 +345,9 @@ void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2, | |||
,"m" (ebxsave) | |||
#endif | |||
#endif | |||
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D | |||
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_D | |||
#if ARCH_X86_64 || !defined(PIC) | |||
,"%"REG_b | |||
,"%"FF_REG_b | |||
#endif | |||
); | |||
@@ -1101,43 +1101,43 @@ static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int sr | |||
unsigned i; | |||
x86_reg mmx_size= 23 - src_size; | |||
__asm__ volatile ( | |||
"test %%"REG_a", %%"REG_a" \n\t" | |||
"test %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
"jns 2f \n\t" | |||
"movq "MANGLE(mask24r)", %%mm5 \n\t" | |||
"movq "MANGLE(mask24g)", %%mm6 \n\t" | |||
"movq "MANGLE(mask24b)", %%mm7 \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 32(%1, %%"REG_a") \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG | |||
"movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG | |||
"movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B | |||
PREFETCH" 32(%1, %%"FF_REG_a") \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG | |||
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t" // BGR BGR BG | |||
"movq 2(%1, %%"FF_REG_a"), %%mm2 \n\t" // R BGR BGR B | |||
"psllq $16, %%mm0 \n\t" // 00 BGR BGR | |||
"pand %%mm5, %%mm0 \n\t" | |||
"pand %%mm6, %%mm1 \n\t" | |||
"pand %%mm7, %%mm2 \n\t" | |||
"por %%mm0, %%mm1 \n\t" | |||
"por %%mm2, %%mm1 \n\t" | |||
"movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG | |||
MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG | |||
"movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B | |||
"movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR | |||
"movq 6(%1, %%"FF_REG_a"), %%mm0 \n\t" // BGR BGR BG | |||
MOVNTQ" %%mm1,(%2, %%"FF_REG_a") \n\t" // RGB RGB RG | |||
"movq 8(%1, %%"FF_REG_a"), %%mm1 \n\t" // R BGR BGR B | |||
"movq 10(%1, %%"FF_REG_a"), %%mm2 \n\t" // GR BGR BGR | |||
"pand %%mm7, %%mm0 \n\t" | |||
"pand %%mm5, %%mm1 \n\t" | |||
"pand %%mm6, %%mm2 \n\t" | |||
"por %%mm0, %%mm1 \n\t" | |||
"por %%mm2, %%mm1 \n\t" | |||
"movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B | |||
MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R | |||
"movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR | |||
"movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG | |||
"movq 14(%1, %%"FF_REG_a"), %%mm0 \n\t" // R BGR BGR B | |||
MOVNTQ" %%mm1, 8(%2, %%"FF_REG_a")\n\t" // B RGB RGB R | |||
"movq 16(%1, %%"FF_REG_a"), %%mm1 \n\t" // GR BGR BGR | |||
"movq 18(%1, %%"FF_REG_a"), %%mm2 \n\t" // BGR BGR BG | |||
"pand %%mm6, %%mm0 \n\t" | |||
"pand %%mm7, %%mm1 \n\t" | |||
"pand %%mm5, %%mm2 \n\t" | |||
"por %%mm0, %%mm1 \n\t" | |||
"por %%mm2, %%mm1 \n\t" | |||
MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t" | |||
"add $24, %%"REG_a" \n\t" | |||
MOVNTQ" %%mm1, 16(%2, %%"FF_REG_a") \n\t" | |||
"add $24, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
"2: \n\t" | |||
: "+a" (mmx_size) | |||
@@ -1173,20 +1173,20 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u | |||
for (y=0; y<height; y++) { | |||
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | |||
__asm__ volatile( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 32(%1, %%"REG_a", 2) \n\t" | |||
PREFETCH" 32(%2, %%"REG_a") \n\t" | |||
PREFETCH" 32(%3, %%"REG_a") \n\t" | |||
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) | |||
PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" | |||
PREFETCH" 32(%2, %%"FF_REG_a") \n\t" | |||
PREFETCH" 32(%3, %%"FF_REG_a") \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) | |||
"movq %%mm0, %%mm2 \n\t" // U(0) | |||
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) | |||
"movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) | |||
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |||
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |||
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) | |||
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) | |||
"movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) | |||
"movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) | |||
"movq %%mm3, %%mm4 \n\t" // Y(0) | |||
"movq %%mm5, %%mm6 \n\t" // Y(8) | |||
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0) | |||
@@ -1194,16 +1194,16 @@ static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *u | |||
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8) | |||
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12) | |||
MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm3, (%0, %%"FF_REG_a", 4) \n\t" | |||
MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" | |||
MOVNTQ" %%mm5, 16(%0, %%"FF_REG_a", 4) \n\t" | |||
MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"cmp %4, %%"REG_a" \n\t" | |||
" jb 1b \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
"cmp %4, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | |||
: "%"REG_a | |||
: "%"FF_REG_a | |||
); | |||
if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { | |||
usrc += chromStride; | |||
@@ -1238,20 +1238,20 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u | |||
for (y=0; y<height; y++) { | |||
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway) | |||
__asm__ volatile( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 32(%1, %%"REG_a", 2) \n\t" | |||
PREFETCH" 32(%2, %%"REG_a") \n\t" | |||
PREFETCH" 32(%3, %%"REG_a") \n\t" | |||
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0) | |||
PREFETCH" 32(%1, %%"FF_REG_a", 2) \n\t" | |||
PREFETCH" 32(%2, %%"FF_REG_a") \n\t" | |||
PREFETCH" 32(%3, %%"FF_REG_a") \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm0 \n\t" // U(0) | |||
"movq %%mm0, %%mm2 \n\t" // U(0) | |||
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0) | |||
"movq (%3, %%"FF_REG_a"), %%mm1 \n\t" // V(0) | |||
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |||
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8) | |||
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0) | |||
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8) | |||
"movq (%1, %%"FF_REG_a",2), %%mm3 \n\t" // Y(0) | |||
"movq 8(%1, %%"FF_REG_a",2), %%mm5 \n\t" // Y(8) | |||
"movq %%mm0, %%mm4 \n\t" // Y(0) | |||
"movq %%mm2, %%mm6 \n\t" // Y(8) | |||
"punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0) | |||
@@ -1259,16 +1259,16 @@ static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *u | |||
"punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8) | |||
"punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12) | |||
MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t" | |||
MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 4) \n\t" | |||
MOVNTQ" %%mm4, 8(%0, %%"FF_REG_a", 4) \n\t" | |||
MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 4) \n\t" | |||
MOVNTQ" %%mm6, 24(%0, %%"FF_REG_a", 4) \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"cmp %4, %%"REG_a" \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
"cmp %4, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth) | |||
: "%"REG_a | |||
: "%"FF_REG_a | |||
); | |||
if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) { | |||
usrc += chromStride; | |||
@@ -1326,14 +1326,14 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
const x86_reg chromWidth= width>>1; | |||
for (y=0; y<height; y+=2) { | |||
__asm__ volatile( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | |||
"pcmpeqw %%mm7, %%mm7 \n\t" | |||
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | |||
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |||
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | |||
PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | |||
"movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |||
"movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | |||
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0) | |||
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4) | |||
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0) | |||
@@ -1343,10 +1343,10 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |||
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |||
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" | |||
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | |||
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | |||
"movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8) | |||
"movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12) | |||
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8) | |||
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12) | |||
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8) | |||
@@ -1356,7 +1356,7 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |||
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |||
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" | |||
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |||
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |||
@@ -1367,28 +1367,28 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |||
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |||
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" | |||
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" | |||
MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" | |||
MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"cmp %4, %%"REG_a" \n\t" | |||
" jb 1b \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
"cmp %4, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |||
: "memory", "%"REG_a | |||
: "memory", "%"FF_REG_a | |||
); | |||
ydst += lumStride; | |||
src += srcStride; | |||
__asm__ volatile( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a"\n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | |||
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |||
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | |||
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | |||
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | |||
PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | |||
"movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |||
"movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | |||
"movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | |||
"movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | |||
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |||
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |||
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |||
@@ -1396,15 +1396,15 @@ static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |||
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |||
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"cmp %4, %%"REG_a" \n\t" | |||
"add $8, %%"FF_REG_a"\n\t" | |||
"cmp %4, %%"FF_REG_a"\n\t" | |||
" jb 1b \n\t" | |||
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |||
: "memory", "%"REG_a | |||
: "memory", "%"FF_REG_a | |||
); | |||
udst += chromStride; | |||
vdst += chromStride; | |||
@@ -1438,23 +1438,23 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid | |||
if (mmxSize) { | |||
__asm__ volatile( | |||
"mov %4, %%"REG_a" \n\t" | |||
"mov %4, %%"FF_REG_a" \n\t" | |||
"movq "MANGLE(mmx_ff)", %%mm0 \n\t" | |||
"movq (%0, %%"REG_a"), %%mm4 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq %%mm4, %%mm2 \n\t" | |||
"psllq $8, %%mm4 \n\t" | |||
"pand %%mm0, %%mm2 \n\t" | |||
"por %%mm2, %%mm4 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm5 \n\t" | |||
"movq %%mm5, %%mm3 \n\t" | |||
"psllq $8, %%mm5 \n\t" | |||
"pand %%mm0, %%mm3 \n\t" | |||
"por %%mm3, %%mm5 \n\t" | |||
"1: \n\t" | |||
"movq (%0, %%"REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"REG_a"), %%mm1 \n\t" | |||
"movq 1(%0, %%"REG_a"), %%mm2 \n\t" | |||
"movq 1(%1, %%"REG_a"), %%mm3 \n\t" | |||
"movq (%0, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm1 \n\t" | |||
"movq 1(%0, %%"FF_REG_a"), %%mm2 \n\t" | |||
"movq 1(%1, %%"FF_REG_a"), %%mm3 \n\t" | |||
PAVGB" %%mm0, %%mm5 \n\t" | |||
PAVGB" %%mm0, %%mm3 \n\t" | |||
PAVGB" %%mm0, %%mm5 \n\t" | |||
@@ -1469,19 +1469,19 @@ static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWid | |||
"punpckhbw %%mm3, %%mm7 \n\t" | |||
"punpcklbw %%mm2, %%mm4 \n\t" | |||
"punpckhbw %%mm2, %%mm6 \n\t" | |||
MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"movq -1(%0, %%"REG_a"), %%mm4 \n\t" | |||
"movq -1(%1, %%"REG_a"), %%mm5 \n\t" | |||
" js 1b \n\t" | |||
MOVNTQ" %%mm5, (%2, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm7, 8(%2, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm4, (%3, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm6, 8(%3, %%"FF_REG_a", 2) \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
"movq -1(%0, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq -1(%1, %%"FF_REG_a"), %%mm5 \n\t" | |||
" js 1b \n\t" | |||
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ), | |||
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2), | |||
"g" (-mmxSize) | |||
NAMED_CONSTRAINTS_ADD(mmx_ff) | |||
: "%"REG_a | |||
: "%"FF_REG_a | |||
); | |||
} else { | |||
mmxSize = 1; | |||
@@ -1532,14 +1532,14 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
const x86_reg chromWidth= width>>1; | |||
for (y=0; y<height; y+=2) { | |||
__asm__ volatile( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
"pcmpeqw %%mm7, %%mm7 \n\t" | |||
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00... | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | |||
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) | |||
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) | |||
PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | |||
"movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0) | |||
"movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4) | |||
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0) | |||
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4) | |||
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0) | |||
@@ -1549,10 +1549,10 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0) | |||
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0) | |||
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, (%1, %%"FF_REG_a", 2) \n\t" | |||
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) | |||
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) | |||
"movq 16(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8) | |||
"movq 24(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12) | |||
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8) | |||
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12) | |||
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8) | |||
@@ -1562,7 +1562,7 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8) | |||
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8) | |||
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm3, 8(%1, %%"FF_REG_a", 2) \n\t" | |||
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0) | |||
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8) | |||
@@ -1573,28 +1573,28 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0) | |||
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0) | |||
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t" | |||
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t" | |||
MOVNTQ" %%mm0, (%3, %%"FF_REG_a") \n\t" | |||
MOVNTQ" %%mm2, (%2, %%"FF_REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"cmp %4, %%"REG_a" \n\t" | |||
" jb 1b \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
"cmp %4, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |||
: "memory", "%"REG_a | |||
: "memory", "%"FF_REG_a | |||
); | |||
ydst += lumStride; | |||
src += srcStride; | |||
__asm__ volatile( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"REG_a", 4) \n\t" | |||
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |||
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | |||
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | |||
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"FF_REG_a", 4) \n\t" | |||
"movq (%0, %%"FF_REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0) | |||
"movq 8(%0, %%"FF_REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4) | |||
"movq 16(%0, %%"FF_REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8) | |||
"movq 24(%0, %%"FF_REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12) | |||
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0) | |||
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4) | |||
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8) | |||
@@ -1602,15 +1602,15 @@ static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t | |||
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0) | |||
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8) | |||
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm0, (%1, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, 8(%1, %%"FF_REG_a", 2) \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
"cmp %4, %%"REG_a" \n\t" | |||
" jb 1b \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
"cmp %4, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth) | |||
: "memory", "%"REG_a | |||
: "memory", "%"FF_REG_a | |||
); | |||
udst += chromStride; | |||
vdst += chromStride; | |||
@@ -1655,20 +1655,20 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
int i; | |||
for (i=0; i<2; i++) { | |||
__asm__ volatile( | |||
"mov %2, %%"REG_a" \n\t" | |||
"mov %2, %%"FF_REG_a"\n\t" | |||
"movq "BGR2Y_IDX"(%3), %%mm6 \n\t" | |||
"movq "MANGLE(ff_w1111)", %%mm5 \n\t" | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | |||
"lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"REG_d") \n\t" | |||
"movd (%0, %%"REG_d"), %%mm0 \n\t" | |||
"movd 3(%0, %%"REG_d"), %%mm1 \n\t" | |||
PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | |||
"movd (%0, %%"FF_REG_d"), %%mm0 \n\t" | |||
"movd 3(%0, %%"FF_REG_d"), %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"movd 6(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movd 9(%0, %%"REG_d"), %%mm3 \n\t" | |||
"movd 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movd 9(%0, %%"FF_REG_d"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
"punpcklbw %%mm7, %%mm3 \n\t" | |||
"pmaddwd %%mm6, %%mm0 \n\t" | |||
@@ -1686,12 +1686,12 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"packssdw %%mm2, %%mm0 \n\t" | |||
"psraw $7, %%mm0 \n\t" | |||
"movd 12(%0, %%"REG_d"), %%mm4 \n\t" | |||
"movd 15(%0, %%"REG_d"), %%mm1 \n\t" | |||
"movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | |||
"movd 15(%0, %%"FF_REG_d"), %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm4 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"movd 18(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movd 21(%0, %%"REG_d"), %%mm3 \n\t" | |||
"movd 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movd 21(%0, %%"FF_REG_d"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
"punpcklbw %%mm7, %%mm3 \n\t" | |||
"pmaddwd %%mm6, %%mm4 \n\t" | |||
@@ -1706,40 +1706,40 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"packssdw %%mm3, %%mm2 \n\t" | |||
"pmaddwd %%mm5, %%mm4 \n\t" | |||
"pmaddwd %%mm5, %%mm2 \n\t" | |||
"add $24, %%"REG_d" \n\t" | |||
"add $24, %%"FF_REG_d"\n\t" | |||
"packssdw %%mm2, %%mm4 \n\t" | |||
"psraw $7, %%mm4 \n\t" | |||
"packuswb %%mm4, %%mm0 \n\t" | |||
"paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t" | |||
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t" | |||
"add $8, %%"REG_a" \n\t" | |||
" js 1b \n\t" | |||
MOVNTQ" %%mm0, (%1, %%"FF_REG_a") \n\t" | |||
"add $8, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
: : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv) | |||
NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset) | |||
: "%"REG_a, "%"REG_d | |||
: "%"FF_REG_a, "%"FF_REG_d | |||
); | |||
ydst += lumStride; | |||
src += srcStride; | |||
} | |||
src -= srcStride*2; | |||
__asm__ volatile( | |||
"mov %4, %%"REG_a" \n\t" | |||
"mov %4, %%"FF_REG_a"\n\t" | |||
"movq "MANGLE(ff_w1111)", %%mm5 \n\t" | |||
"movq "BGR2U_IDX"(%5), %%mm6 \n\t" | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" | |||
"add %%"REG_d", %%"REG_d" \n\t" | |||
"lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_d" \n\t" | |||
"add %%"FF_REG_d", %%"FF_REG_d"\n\t" | |||
".p2align 4 \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%0, %%"REG_d") \n\t" | |||
PREFETCH" 64(%1, %%"REG_d") \n\t" | |||
PREFETCH" 64(%0, %%"FF_REG_d") \n\t" | |||
PREFETCH" 64(%1, %%"FF_REG_d") \n\t" | |||
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW | |||
"movq (%0, %%"REG_d"), %%mm0 \n\t" | |||
"movq (%1, %%"REG_d"), %%mm1 \n\t" | |||
"movq 6(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movq 6(%1, %%"REG_d"), %%mm3 \n\t" | |||
"movq (%0, %%"FF_REG_d"), %%mm0 \n\t" | |||
"movq (%1, %%"FF_REG_d"), %%mm1 \n\t" | |||
"movq 6(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movq 6(%1, %%"FF_REG_d"), %%mm3 \n\t" | |||
PAVGB" %%mm1, %%mm0 \n\t" | |||
PAVGB" %%mm3, %%mm2 \n\t" | |||
"movq %%mm0, %%mm1 \n\t" | |||
@@ -1751,10 +1751,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
#else | |||
"movd (%0, %%"REG_d"), %%mm0 \n\t" | |||
"movd (%1, %%"REG_d"), %%mm1 \n\t" | |||
"movd 3(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movd 3(%1, %%"REG_d"), %%mm3 \n\t" | |||
"movd (%0, %%"FF_REG_d"), %%mm0 \n\t" | |||
"movd (%1, %%"FF_REG_d"), %%mm1 \n\t" | |||
"movd 3(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movd 3(%1, %%"FF_REG_d"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm0 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -1762,10 +1762,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"paddw %%mm1, %%mm0 \n\t" | |||
"paddw %%mm3, %%mm2 \n\t" | |||
"paddw %%mm2, %%mm0 \n\t" | |||
"movd 6(%0, %%"REG_d"), %%mm4 \n\t" | |||
"movd 6(%1, %%"REG_d"), %%mm1 \n\t" | |||
"movd 9(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movd 9(%1, %%"REG_d"), %%mm3 \n\t" | |||
"movd 6(%0, %%"FF_REG_d"), %%mm4 \n\t" | |||
"movd 6(%1, %%"FF_REG_d"), %%mm1 \n\t" | |||
"movd 9(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movd 9(%1, %%"FF_REG_d"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm4 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -1795,10 +1795,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"psraw $7, %%mm0 \n\t" | |||
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW | |||
"movq 12(%0, %%"REG_d"), %%mm4 \n\t" | |||
"movq 12(%1, %%"REG_d"), %%mm1 \n\t" | |||
"movq 18(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movq 18(%1, %%"REG_d"), %%mm3 \n\t" | |||
"movq 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | |||
"movq 12(%1, %%"FF_REG_d"), %%mm1 \n\t" | |||
"movq 18(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movq 18(%1, %%"FF_REG_d"), %%mm3 \n\t" | |||
PAVGB" %%mm1, %%mm4 \n\t" | |||
PAVGB" %%mm3, %%mm2 \n\t" | |||
"movq %%mm4, %%mm1 \n\t" | |||
@@ -1810,10 +1810,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"punpcklbw %%mm7, %%mm4 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
#else | |||
"movd 12(%0, %%"REG_d"), %%mm4 \n\t" | |||
"movd 12(%1, %%"REG_d"), %%mm1 \n\t" | |||
"movd 15(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movd 15(%1, %%"REG_d"), %%mm3 \n\t" | |||
"movd 12(%0, %%"FF_REG_d"), %%mm4 \n\t" | |||
"movd 12(%1, %%"FF_REG_d"), %%mm1 \n\t" | |||
"movd 15(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movd 15(%1, %%"FF_REG_d"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm4 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -1821,10 +1821,10 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"paddw %%mm1, %%mm4 \n\t" | |||
"paddw %%mm3, %%mm2 \n\t" | |||
"paddw %%mm2, %%mm4 \n\t" | |||
"movd 18(%0, %%"REG_d"), %%mm5 \n\t" | |||
"movd 18(%1, %%"REG_d"), %%mm1 \n\t" | |||
"movd 21(%0, %%"REG_d"), %%mm2 \n\t" | |||
"movd 21(%1, %%"REG_d"), %%mm3 \n\t" | |||
"movd 18(%0, %%"FF_REG_d"), %%mm5 \n\t" | |||
"movd 18(%1, %%"FF_REG_d"), %%mm1 \n\t" | |||
"movd 21(%0, %%"FF_REG_d"), %%mm2 \n\t" | |||
"movd 21(%1, %%"FF_REG_d"), %%mm3 \n\t" | |||
"punpcklbw %%mm7, %%mm5 \n\t" | |||
"punpcklbw %%mm7, %%mm1 \n\t" | |||
"punpcklbw %%mm7, %%mm2 \n\t" | |||
@@ -1851,7 +1851,7 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"packssdw %%mm3, %%mm1 \n\t" | |||
"pmaddwd %%mm5, %%mm4 \n\t" | |||
"pmaddwd %%mm5, %%mm1 \n\t" | |||
"add $24, %%"REG_d" \n\t" | |||
"add $24, %%"FF_REG_d"\n\t" | |||
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2 | |||
"psraw $7, %%mm4 \n\t" | |||
@@ -1860,14 +1860,14 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_ | |||
"punpckhdq %%mm4, %%mm1 \n\t" | |||
"packsswb %%mm1, %%mm0 \n\t" | |||
"paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t" | |||
"movd %%mm0, (%2, %%"REG_a") \n\t" | |||
"punpckhdq %%mm0, %%mm0 \n\t" | |||
"movd %%mm0, (%3, %%"REG_a") \n\t" | |||
"add $4, %%"REG_a" \n\t" | |||
" js 1b \n\t" | |||
"movd %%mm0, (%2, %%"FF_REG_a") \n\t" | |||
"punpckhdq %%mm0, %%mm0 \n\t" | |||
"movd %%mm0, (%3, %%"FF_REG_a") \n\t" | |||
"add $4, %%"FF_REG_a" \n\t" | |||
" js 1b \n\t" | |||
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv) | |||
NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset) | |||
: "%"REG_a, "%"REG_d | |||
: "%"FF_REG_a, "%"FF_REG_d | |||
); | |||
udst += chromStride; | |||
@@ -1898,49 +1898,49 @@ static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, ui | |||
#if COMPILE_TEMPLATE_SSE2 | |||
if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) { | |||
__asm__( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%1, %%"REG_a") \n\t" | |||
PREFETCH" 64(%2, %%"REG_a") \n\t" | |||
"movdqa (%1, %%"REG_a"), %%xmm0 \n\t" | |||
"movdqa (%1, %%"REG_a"), %%xmm1 \n\t" | |||
"movdqa (%2, %%"REG_a"), %%xmm2 \n\t" | |||
PREFETCH" 64(%1, %%"FF_REG_a") \n\t" | |||
PREFETCH" 64(%2, %%"FF_REG_a") \n\t" | |||
"movdqa (%1, %%"FF_REG_a"), %%xmm0 \n\t" | |||
"movdqa (%1, %%"FF_REG_a"), %%xmm1 \n\t" | |||
"movdqa (%2, %%"FF_REG_a"), %%xmm2 \n\t" | |||
"punpcklbw %%xmm2, %%xmm0 \n\t" | |||
"punpckhbw %%xmm2, %%xmm1 \n\t" | |||
"movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t" | |||
"movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t" | |||
"add $16, %%"REG_a" \n\t" | |||
"cmp %3, %%"REG_a" \n\t" | |||
"movntdq %%xmm0, (%0, %%"FF_REG_a", 2) \n\t" | |||
"movntdq %%xmm1, 16(%0, %%"FF_REG_a", 2) \n\t" | |||
"add $16, %%"FF_REG_a" \n\t" | |||
"cmp %3, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | |||
: "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a | |||
: "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"FF_REG_a | |||
); | |||
} else | |||
#endif | |||
__asm__( | |||
"xor %%"REG_a", %%"REG_a" \n\t" | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t" | |||
"1: \n\t" | |||
PREFETCH" 64(%1, %%"REG_a") \n\t" | |||
PREFETCH" 64(%2, %%"REG_a") \n\t" | |||
"movq (%1, %%"REG_a"), %%mm0 \n\t" | |||
"movq 8(%1, %%"REG_a"), %%mm2 \n\t" | |||
PREFETCH" 64(%1, %%"FF_REG_a") \n\t" | |||
PREFETCH" 64(%2, %%"FF_REG_a") \n\t" | |||
"movq (%1, %%"FF_REG_a"), %%mm0 \n\t" | |||
"movq 8(%1, %%"FF_REG_a"), %%mm2 \n\t" | |||
"movq %%mm0, %%mm1 \n\t" | |||
"movq %%mm2, %%mm3 \n\t" | |||
"movq (%2, %%"REG_a"), %%mm4 \n\t" | |||
"movq 8(%2, %%"REG_a"), %%mm5 \n\t" | |||
"movq (%2, %%"FF_REG_a"), %%mm4 \n\t" | |||
"movq 8(%2, %%"FF_REG_a"), %%mm5 \n\t" | |||
"punpcklbw %%mm4, %%mm0 \n\t" | |||
"punpckhbw %%mm4, %%mm1 \n\t" | |||
"punpcklbw %%mm5, %%mm2 \n\t" | |||
"punpckhbw %%mm5, %%mm3 \n\t" | |||
MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t" | |||
MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t" | |||
"add $16, %%"REG_a" \n\t" | |||
"cmp %3, %%"REG_a" \n\t" | |||
" jb 1b \n\t" | |||
MOVNTQ" %%mm0, (%0, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm1, 8(%0, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm2, 16(%0, %%"FF_REG_a", 2) \n\t" | |||
MOVNTQ" %%mm3, 24(%0, %%"FF_REG_a", 2) \n\t" | |||
"add $16, %%"FF_REG_a" \n\t" | |||
"cmp %3, %%"FF_REG_a" \n\t" | |||
" jb 1b \n\t" | |||
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15) | |||
: "memory", "%"REG_a | |||
: "memory", "%"FF_REG_a | |||
); | |||
} | |||
@@ -220,16 +220,16 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, | |||
"movdqa %%xmm3, %%xmm4 \n\t" \ | |||
"movdqa %%xmm3, %%xmm7 \n\t" \ | |||
"movl %3, %%ecx \n\t" \ | |||
"mov %0, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"mov %0, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
".p2align 4 \n\t" /* FIXME Unroll? */\ | |||
"1: \n\t"\ | |||
"movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\ | |||
"movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\ | |||
"movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\ | |||
"add $16, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"test %%"REG_S", %%"REG_S" \n\t"\ | |||
"movddup 8(%%"FF_REG_d"), %%xmm0 \n\t" /* filterCoeff */\ | |||
"movdqa (%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm2 \n\t" /* srcData */\ | |||
"movdqa 16(%%"FF_REG_S", %%"FF_REG_c", 2), %%xmm5 \n\t" /* srcData */\ | |||
"add $16, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ | |||
"pmulhw %%xmm0, %%xmm2 \n\t"\ | |||
"pmulhw %%xmm0, %%xmm5 \n\t"\ | |||
"paddw %%xmm2, %%xmm3 \n\t"\ | |||
@@ -238,13 +238,13 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, | |||
"psraw $3, %%xmm3 \n\t"\ | |||
"psraw $3, %%xmm4 \n\t"\ | |||
"packuswb %%xmm4, %%xmm3 \n\t"\ | |||
"movntdq %%xmm3, (%1, %%"REG_c")\n\t"\ | |||
"add $16, %%"REG_c" \n\t"\ | |||
"cmp %2, %%"REG_c" \n\t"\ | |||
"movntdq %%xmm3, (%1, %%"FF_REG_c") \n\t"\ | |||
"add $16, %%"FF_REG_c" \n\t"\ | |||
"cmp %2, %%"FF_REG_c" \n\t"\ | |||
"movdqa %%xmm7, %%xmm3 \n\t" \ | |||
"movdqa %%xmm7, %%xmm4 \n\t" \ | |||
"mov %0, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"mov %0, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"jb 1b \n\t" | |||
if (offset) { | |||
@@ -259,7 +259,7 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, | |||
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), | |||
"m"(filterSize), "m"(((uint64_t *) dither)[0]) | |||
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) | |||
"%"REG_d, "%"REG_S, "%"REG_c | |||
"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c | |||
); | |||
} else { | |||
__asm__ volatile( | |||
@@ -269,7 +269,7 @@ static void yuv2yuvX_sse3(const int16_t *filter, int filterSize, | |||
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset), | |||
"m"(filterSize), "m"(((uint64_t *) dither)[0]) | |||
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,) | |||
"%"REG_d, "%"REG_S, "%"REG_c | |||
"%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c | |||
); | |||
} | |||
} | |||
@@ -88,16 +88,16 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, | |||
"movq %%mm3, %%mm6\n\t" | |||
"movq %%mm4, %%mm7\n\t" | |||
"movl %3, %%ecx\n\t" | |||
"mov %0, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
".p2align 4 \n\t" /* FIXME Unroll? */\ | |||
"1: \n\t"\ | |||
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |||
"movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\ | |||
"movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\ | |||
"add $16, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"test %%"REG_S", %%"REG_S" \n\t"\ | |||
"mov %0, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
".p2align 4 \n\t" /* FIXME Unroll? */\ | |||
"1: \n\t"\ | |||
"movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_c", 2), %%mm2 \n\t" /* srcData */\ | |||
"movq 8(%%"FF_REG_S", %%"FF_REG_c", 2), %%mm5 \n\t" /* srcData */\ | |||
"add $16, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ | |||
"pmulhw %%mm0, %%mm2 \n\t"\ | |||
"pmulhw %%mm0, %%mm5 \n\t"\ | |||
"paddw %%mm2, %%mm3 \n\t"\ | |||
@@ -106,62 +106,62 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, | |||
"psraw $3, %%mm3 \n\t"\ | |||
"psraw $3, %%mm4 \n\t"\ | |||
"packuswb %%mm4, %%mm3 \n\t" | |||
MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t" | |||
"add $8, %%"REG_c" \n\t"\ | |||
"cmp %2, %%"REG_c" \n\t"\ | |||
MOVNTQ2 " %%mm3, (%1, %%"FF_REG_c")\n\t" | |||
"add $8, %%"FF_REG_c" \n\t"\ | |||
"cmp %2, %%"FF_REG_c" \n\t"\ | |||
"movq %%mm6, %%mm3\n\t" | |||
"movq %%mm7, %%mm4\n\t" | |||
"mov %0, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"jb 1b \n\t"\ | |||
"mov %0, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"jb 1b \n\t"\ | |||
:: "g" (filter), | |||
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset) | |||
: "%"REG_d, "%"REG_S, "%"REG_c | |||
: "%"FF_REG_d, "%"FF_REG_S, "%"FF_REG_c | |||
); | |||
} | |||
#define YSCALEYUV2PACKEDX_UV \ | |||
__asm__ volatile(\ | |||
"xor %%"REG_a", %%"REG_a" \n\t"\ | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ | |||
".p2align 4 \n\t"\ | |||
"nop \n\t"\ | |||
"1: \n\t"\ | |||
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\ | |||
"movq %%mm3, %%mm4 \n\t"\ | |||
".p2align 4 \n\t"\ | |||
"2: \n\t"\ | |||
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |||
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\ | |||
"add %6, %%"REG_S" \n\t" \ | |||
"movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\ | |||
"add $16, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"movq 8(%%"FF_REG_d"), %%mm0 \n\t" /* filterCoeff */\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* UsrcData */\ | |||
"add %6, %%"FF_REG_S" \n\t" \ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm5 \n\t" /* VsrcData */\ | |||
"add $16, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"pmulhw %%mm0, %%mm2 \n\t"\ | |||
"pmulhw %%mm0, %%mm5 \n\t"\ | |||
"paddw %%mm2, %%mm3 \n\t"\ | |||
"paddw %%mm5, %%mm4 \n\t"\ | |||
"test %%"REG_S", %%"REG_S" \n\t"\ | |||
"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ | |||
" jnz 2b \n\t"\ | |||
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \ | |||
"lea "offset"(%0), %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"lea "offset"(%0), %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\ | |||
"movq "#dst1", "#dst2" \n\t"\ | |||
".p2align 4 \n\t"\ | |||
"2: \n\t"\ | |||
"movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\ | |||
"movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\ | |||
"movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\ | |||
"add $16, %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"movq 8(%%"FF_REG_d"), "#coeff" \n\t" /* filterCoeff */\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a", 2), "#src1" \n\t" /* Y1srcData */\ | |||
"movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), "#src2" \n\t" /* Y2srcData */\ | |||
"add $16, %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"pmulhw "#coeff", "#src1" \n\t"\ | |||
"pmulhw "#coeff", "#src2" \n\t"\ | |||
"paddw "#src1", "#dst1" \n\t"\ | |||
"paddw "#src2", "#dst2" \n\t"\ | |||
"test %%"REG_S", %%"REG_S" \n\t"\ | |||
"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ | |||
" jnz 2b \n\t"\ | |||
#define YSCALEYUV2PACKEDX \ | |||
@@ -173,41 +173,41 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, | |||
"m" (dummy), "m" (dummy), "m" (dummy),\ | |||
"r" (dest), "m" (dstW_reg), "m"(uv_off) \ | |||
NAMED_CONSTRAINTS_ADD(bF8,bFC) \ | |||
: "%"REG_a, "%"REG_d, "%"REG_S \ | |||
: "%"FF_REG_a, "%"FF_REG_d, "%"FF_REG_S \ | |||
); | |||
#define YSCALEYUV2PACKEDX_ACCURATE_UV \ | |||
__asm__ volatile(\ | |||
"xor %%"REG_a", %%"REG_a" \n\t"\ | |||
"xor %%"FF_REG_a", %%"FF_REG_a" \n\t"\ | |||
".p2align 4 \n\t"\ | |||
"nop \n\t"\ | |||
"1: \n\t"\ | |||
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"pxor %%mm4, %%mm4 \n\t"\ | |||
"pxor %%mm5, %%mm5 \n\t"\ | |||
"pxor %%mm6, %%mm6 \n\t"\ | |||
"pxor %%mm7, %%mm7 \n\t"\ | |||
".p2align 4 \n\t"\ | |||
"2: \n\t"\ | |||
"movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\ | |||
"add %6, %%"REG_S" \n\t" \ | |||
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\ | |||
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ | |||
"movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm0 \n\t" /* UsrcData */\ | |||
"add %6, %%"FF_REG_S" \n\t" \ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm2 \n\t" /* VsrcData */\ | |||
"mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm1 \n\t" /* UsrcData */\ | |||
"movq %%mm0, %%mm3 \n\t"\ | |||
"punpcklwd %%mm1, %%mm0 \n\t"\ | |||
"punpckhwd %%mm1, %%mm3 \n\t"\ | |||
"movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\ | |||
"movq "STR(APCK_COEF)"(%%"FF_REG_d"),%%mm1 \n\t" /* filterCoeff */\ | |||
"pmaddwd %%mm1, %%mm0 \n\t"\ | |||
"pmaddwd %%mm1, %%mm3 \n\t"\ | |||
"paddd %%mm0, %%mm4 \n\t"\ | |||
"paddd %%mm3, %%mm5 \n\t"\ | |||
"add %6, %%"REG_S" \n\t" \ | |||
"movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\ | |||
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ | |||
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ | |||
"test %%"REG_S", %%"REG_S" \n\t"\ | |||
"add %6, %%"FF_REG_S" \n\t" \ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a"), %%mm3 \n\t" /* VsrcData */\ | |||
"mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ | |||
"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ | |||
"movq %%mm2, %%mm0 \n\t"\ | |||
"punpcklwd %%mm3, %%mm2 \n\t"\ | |||
"punpckhwd %%mm3, %%mm0 \n\t"\ | |||
@@ -229,30 +229,30 @@ static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize, | |||
"movq %%mm6, "V_TEMP"(%0) \n\t"\ | |||
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \ | |||
"lea "offset"(%0), %%"REG_d" \n\t"\ | |||
"mov (%%"REG_d"), %%"REG_S" \n\t"\ | |||
"lea "offset"(%0), %%"FF_REG_d" \n\t"\ | |||
"mov (%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"pxor %%mm1, %%mm1 \n\t"\ | |||
"pxor %%mm5, %%mm5 \n\t"\ | |||
"pxor %%mm7, %%mm7 \n\t"\ | |||
"pxor %%mm6, %%mm6 \n\t"\ | |||
".p2align 4 \n\t"\ | |||
"2: \n\t"\ | |||
"movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ | |||
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ | |||
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\ | |||
"movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm0 \n\t" /* Y1srcData */\ | |||
"movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm2 \n\t" /* Y2srcData */\ | |||
"mov "STR(APCK_PTR2)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"movq (%%"FF_REG_S", %%"FF_REG_a", 2), %%mm4 \n\t" /* Y1srcData */\ | |||
"movq %%mm0, %%mm3 \n\t"\ | |||
"punpcklwd %%mm4, %%mm0 \n\t"\ | |||
"punpckhwd %%mm4, %%mm3 \n\t"\ | |||
"movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\ | |||
"movq "STR(APCK_COEF)"(%%"FF_REG_d"), %%mm4 \n\t" /* filterCoeff */\ | |||
"pmaddwd %%mm4, %%mm0 \n\t"\ | |||
"pmaddwd %%mm4, %%mm3 \n\t"\ | |||
"paddd %%mm0, %%mm1 \n\t"\ | |||
"paddd %%mm3, %%mm5 \n\t"\ | |||
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ | |||
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\ | |||
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\ | |||
"test %%"REG_S", %%"REG_S" \n\t"\ | |||
"movq 8(%%"FF_REG_S", %%"FF_REG_a", 2), %%mm3 \n\t" /* Y2srcData */\ | |||
"mov "STR(APCK_SIZE)"(%%"FF_REG_d"), %%"FF_REG_S" \n\t"\ | |||
"add $"STR(APCK_SIZE)", %%"FF_REG_d" \n\t"\ | |||
"test %%"FF_REG_S", %%"FF_REG_S" \n\t"\ | |||
"movq %%mm2, %%mm0 \n\t"\ | |||
"punpcklwd %%mm3, %%mm2 \n\t"\ | |||
"punpckhwd %%mm3, %%mm0 \n\t"\ | |||
@@ -359,13 +359,13 @@ static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter, | |||
"psraw $3, %%mm1 \n\t" | |||
"psraw $3, %%mm7 \n\t" | |||
"packuswb %%mm7, %%mm1 \n\t" | |||
WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) | |||
WRITEBGR32(%4, "%5", %%FF_REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6) | |||
YSCALEYUV2PACKEDX_END | |||
} else { | |||
YSCALEYUV2PACKEDX_ACCURATE | |||
YSCALEYUV2RGBX | |||
"pcmpeqd %%mm7, %%mm7 \n\t" | |||
WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
} | |||
@@ -388,13 +388,13 @@ static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter, | |||
"psraw $3, %%mm1 \n\t" | |||
"psraw $3, %%mm7 \n\t" | |||
"packuswb %%mm7, %%mm1 \n\t" | |||
WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |||
WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |||
YSCALEYUV2PACKEDX_END | |||
} else { | |||
YSCALEYUV2PACKEDX | |||
YSCALEYUV2RGBX | |||
"pcmpeqd %%mm7, %%mm7 \n\t" | |||
WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
WRITEBGR32(%4, "%5", %%FF_REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
} | |||
@@ -417,13 +417,13 @@ static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter, | |||
"psraw $3, %%mm1 \n\t" | |||
"psraw $3, %%mm7 \n\t" | |||
"packuswb %%mm7, %%mm1 \n\t" | |||
WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |||
WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |||
YSCALEYUV2PACKEDX_END | |||
} else { | |||
YSCALEYUV2PACKEDX | |||
YSCALEYUV2RGBX | |||
"pcmpeqd %%mm7, %%mm7 \n\t" | |||
WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
WRITEBGR32(%4, "%5", %%FF_REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
} | |||
@@ -476,7 +476,7 @@ static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter, | |||
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t" | |||
"paddusb "RED_DITHER"(%0), %%mm5\n\t" | |||
#endif | |||
WRITERGB16(%4, "%5", %%REGa) | |||
WRITERGB16(%4, "%5", %%FF_REGa) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
@@ -500,7 +500,7 @@ static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter, | |||
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%0), %%mm5 \n\t" | |||
#endif | |||
WRITERGB16(%4, "%5", %%REGa) | |||
WRITERGB16(%4, "%5", %%FF_REGa) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
@@ -553,7 +553,7 @@ static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter, | |||
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t" | |||
"paddusb "RED_DITHER"(%0), %%mm5\n\t" | |||
#endif | |||
WRITERGB15(%4, "%5", %%REGa) | |||
WRITERGB15(%4, "%5", %%FF_REGa) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
@@ -577,7 +577,7 @@ static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter, | |||
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%0), %%mm5 \n\t" | |||
#endif | |||
WRITERGB15(%4, "%5", %%REGa) | |||
WRITERGB15(%4, "%5", %%FF_REGa) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
@@ -705,14 +705,14 @@ static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter, | |||
YSCALEYUV2PACKEDX_ACCURATE | |||
YSCALEYUV2RGBX | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize | |||
"add %4, %%"REG_c" \n\t" | |||
WRITEBGR24(%%REGc, "%5", %%REGa) | |||
"lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c"\n\t" //FIXME optimize | |||
"add %4, %%"FF_REG_c" \n\t" | |||
WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa) | |||
:: "r" (&c->redDither), | |||
"m" (dummy), "m" (dummy), "m" (dummy), | |||
"r" (dest), "m" (dstW_reg), "m"(uv_off) | |||
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |||
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S | |||
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S | |||
); | |||
} | |||
@@ -729,15 +729,15 @@ static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter, | |||
YSCALEYUV2PACKEDX | |||
YSCALEYUV2RGBX | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize | |||
"add %4, %%"REG_c" \n\t" | |||
WRITEBGR24(%%REGc, "%5", %%REGa) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
"lea (%%"FF_REG_a", %%"FF_REG_a", 2), %%"FF_REG_c" \n\t" //FIXME optimize | |||
"add %4, %%"FF_REG_c" \n\t" | |||
WRITEBGR24(%%FF_REGc, "%5", %%FF_REGa) | |||
:: "r" (&c->redDither), | |||
"m" (dummy), "m" (dummy), "m" (dummy), | |||
"r" (dest), "m" (dstW_reg), "m"(uv_off) | |||
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |||
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S | |||
: "%"FF_REG_a, "%"FF_REG_c, "%"FF_REG_d, "%"FF_REG_S | |||
); | |||
} | |||
#endif /* HAVE_6REGS */ | |||
@@ -776,7 +776,7 @@ static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter, | |||
"psraw $3, %%mm4 \n\t" | |||
"psraw $3, %%mm1 \n\t" | |||
"psraw $3, %%mm7 \n\t" | |||
WRITEYUY2(%4, "%5", %%REGa) | |||
WRITEYUY2(%4, "%5", %%FF_REGa) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
@@ -797,7 +797,7 @@ static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter, | |||
"psraw $3, %%mm4 \n\t" | |||
"psraw $3, %%mm1 \n\t" | |||
"psraw $3, %%mm7 \n\t" | |||
WRITEYUY2(%4, "%5", %%REGa) | |||
WRITEYUY2(%4, "%5", %%FF_REGa) | |||
YSCALEYUV2PACKEDX_END | |||
} | |||
@@ -908,37 +908,37 @@ static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2], | |||
c->u_temp=(intptr_t)abuf0; | |||
c->v_temp=(intptr_t)abuf1; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%FF_REGBP, %5) | |||
"push %0 \n\t" | |||
"push %1 \n\t" | |||
"mov "U_TEMP"(%5), %0 \n\t" | |||
"mov "V_TEMP"(%5), %1 \n\t" | |||
YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1) | |||
YSCALEYUV2RGB_YA(%%FF_REGBP, %5, %0, %1) | |||
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ | |||
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/ | |||
"packuswb %%mm7, %%mm1 \n\t" | |||
"pop %1 \n\t" | |||
"pop %0 \n\t" | |||
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
#endif | |||
} else { | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%FF_REGBP, %5) | |||
"pcmpeqd %%mm7, %%mm7 \n\t" | |||
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
@@ -954,14 +954,14 @@ static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2], | |||
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |||
@@ -977,20 +977,20 @@ static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2], | |||
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |||
#ifdef DITHER1XBPP | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
#endif | |||
WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(bF8) | |||
@@ -1006,20 +1006,20 @@ static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2], | |||
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |||
#ifdef DITHER1XBPP | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
#endif | |||
WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(bF8,bFC) | |||
@@ -1075,13 +1075,13 @@ static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2], | |||
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2PACKED(%%REGBP, %5) | |||
WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2PACKED(%%FF_REGBP, %5) | |||
WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
@@ -1217,27 +1217,27 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, | |||
const int16_t *ubuf1 = ubuf[0]; | |||
if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%REGBP, %5) | |||
YSCALEYUV2RGB1_ALPHA(%%REGBP) | |||
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%FF_REGBP, %5) | |||
YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) | |||
WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
} else { | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%FF_REGBP, %5) | |||
"pcmpeqd %%mm7, %%mm7 \n\t" | |||
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
@@ -1246,27 +1246,27 @@ static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0, | |||
const int16_t *ubuf1 = ubuf[1]; | |||
if (CONFIG_SWSCALE_ALPHA && c->needAlpha) { | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%REGBP, %5) | |||
YSCALEYUV2RGB1_ALPHA(%%REGBP) | |||
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%FF_REGBP, %5) | |||
YSCALEYUV2RGB1_ALPHA(%%FF_REGBP) | |||
WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
} else { | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%FF_REGBP, %5) | |||
"pcmpeqd %%mm7, %%mm7 \n\t" | |||
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR32(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
@@ -1285,14 +1285,14 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, | |||
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |||
const int16_t *ubuf1 = ubuf[0]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |||
@@ -1300,14 +1300,14 @@ static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0, | |||
} else { | |||
const int16_t *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITEBGR24(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B) | |||
@@ -1326,20 +1326,20 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, | |||
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |||
const int16_t *ubuf1 = ubuf[0]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |||
#ifdef DITHER1XBPP | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
#endif | |||
WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(bF8) | |||
@@ -1347,20 +1347,20 @@ static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0, | |||
} else { | |||
const int16_t *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |||
#ifdef DITHER1XBPP | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
#endif | |||
WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITERGB15(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(bF8) | |||
@@ -1379,20 +1379,20 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, | |||
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |||
const int16_t *ubuf1 = ubuf[0]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |||
#ifdef DITHER1XBPP | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
#endif | |||
WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(bF8,bFC) | |||
@@ -1400,20 +1400,20 @@ static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0, | |||
} else { | |||
const int16_t *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%REGBP, %5) | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2RGB1b(%%FF_REGBP, %5) | |||
"pxor %%mm7, %%mm7 \n\t" | |||
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */ | |||
#ifdef DITHER1XBPP | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t" | |||
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
"paddusb "RED_DITHER"(%5), %%mm5 \n\t" | |||
#endif | |||
WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
WRITERGB16(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
NAMED_CONSTRAINTS_ADD(bF8,bFC) | |||
@@ -1469,26 +1469,26 @@ static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0, | |||
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster | |||
const int16_t *ubuf1 = ubuf[0]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2PACKED1(%%REGBP, %5) | |||
WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2PACKED1(%%FF_REGBP, %5) | |||
WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||
} else { | |||
const int16_t *ubuf1 = ubuf[1]; | |||
__asm__ volatile( | |||
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"REG_b" \n\t" | |||
"push %%"REG_BP" \n\t" | |||
YSCALEYUV2PACKED1b(%%REGBP, %5) | |||
WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP) | |||
"pop %%"REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t" | |||
"mov %%"FF_REG_b", "ESP_OFFSET"(%5) \n\t" | |||
"mov %4, %%"FF_REG_b" \n\t" | |||
"push %%"FF_REG_BP" \n\t" | |||
YSCALEYUV2PACKED1b(%%FF_REGBP, %5) | |||
WRITEYUY2(%%FF_REGb, DSTW_OFFSET"(%5)", %%FF_REGBP) | |||
"pop %%"FF_REG_BP" \n\t" | |||
"mov "ESP_OFFSET"(%5), %%"FF_REG_b" \n\t" | |||
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest), | |||
"a" (&c->redDither) | |||
); | |||