The second stride is always that of the internal buffer, MAX_PB_SIZE (times 2 to get the value in bytes). Signed-off-by: Michael Niedermayer <michaelni@gmx.at> (tags/n2.6)
| @@ -1407,11 +1407,11 @@ cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 8, 10, 16, dst, dststride, src, srcstride | |||
| %macro WEIGHTING_FUNCS 2 | |||
| %if WIN64 || ARCH_X86_32 | |||
| cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, srcstride, height, denom, wx, ox | |||
| cglobal hevc_put_hevc_uni_w%1_%2, 4, 5, 7, dst, dststride, src, height, denom, wx, ox | |||
| mov r4d, denomm | |||
| %define SHIFT r4d | |||
| %else | |||
| cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, height, denom, wx, ox | |||
| cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, height, denom, wx, ox | |||
| %define SHIFT denomd | |||
| %endif | |||
| lea SHIFT, [SHIFT+14-%2] ; shift = 14 - bitd + denom | |||
| @@ -1472,15 +1472,15 @@ cglobal hevc_put_hevc_uni_w%1_%2, 6, 6, 7, dst, dststride, src, srcstride, heigh | |||
| jnz .loop ; height loop | |||
| RET | |||
| cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, height, denom, wx0, wx1, ox0, ox1 | |||
| mov r6d, denomm | |||
| cglobal hevc_put_hevc_bi_w%1_%2, 4, 6, 10, dst, dststride, src, src2, height, denom, wx0, wx1, ox0, ox1 | |||
| movifnidn r5d, denomm | |||
| %if %1 <= 4 | |||
| pxor m1, m1 | |||
| %endif | |||
| movd m2, wx0m ; WX0 | |||
| lea r6d, [r6d+14-%2] ; shift = 14 - bitd + denom | |||
| lea r5d, [r5d+14-%2] ; shift = 14 - bitd + denom | |||
| movd m3, wx1m ; WX1 | |||
| movd m0, r6d ; shift | |||
| movd m0, r5d ; shift | |||
| %if %1 <= 4 | |||
| punpcklwd m2, m1 | |||
| punpcklwd m3, m1 | |||
| @@ -1488,19 +1488,24 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, | |||
| punpcklwd m2, m2 | |||
| punpcklwd m3, m3 | |||
| %endif | |||
| inc r6d | |||
| movd m5, r6d ; shift+1 | |||
| inc r5d | |||
| movd m5, r5d ; shift+1 | |||
| pshufd m2, m2, 0 | |||
| mov r6d, ox0m | |||
| mov r5d, ox0m | |||
| pshufd m3, m3, 0 | |||
| add r6d, ox1m | |||
| add r5d, ox1m | |||
| %if %2 != 8 | |||
| shl r6d, %2-8 ; ox << (bitd - 8) | |||
| shl r5d, %2-8 ; ox << (bitd - 8) | |||
| %endif | |||
| inc r6d | |||
| movd m4, r6d ; offset | |||
| inc r5d | |||
| movd m4, r5d ; offset | |||
| pshufd m4, m4, 0 | |||
| mov r6d, heightm | |||
| %if UNIX64 | |||
| %define h heightd | |||
| %else | |||
| mov r5d, heightm | |||
| %define h r5d | |||
| %endif | |||
| pslld m4, m0 | |||
| .loop | |||
| @@ -1540,7 +1545,7 @@ cglobal hevc_put_hevc_bi_w%1_%2, 5, 7, 10, dst, dststride, src, srcstride, src2, | |||
| add dstq, dststrideq ; dst += dststride | |||
| add srcq, 2*MAX_PB_SIZE ; src += srcstride | |||
| add src2q, 2*MAX_PB_SIZE ; src2 += srcstride | |||
| dec r6d ; cmp height | |||
| dec h ; cmp height | |||
| jnz .loop ; height loop | |||
| RET | |||
| %endmacro | |||
| @@ -74,8 +74,8 @@ void ff_hevc_put_hevc_bi_w_ ## name ## _ ## D ## _##opt(uint8_t *_dst, ptrdiff_t | |||
| PEL_PROTOTYPE(fname##64, bitd, opt) | |||
| #define WEIGHTING_PROTOTYPE(width, bitd, opt) \ | |||
| void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, int height, int denom, int _wx, int _ox); \ | |||
| void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1) | |||
| void ff_hevc_put_hevc_uni_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int height, int denom, int _wx, int _ox); \ | |||
| void ff_hevc_put_hevc_bi_w##width##_##bitd##_##opt(uint8_t *dst, ptrdiff_t dststride, int16_t *_src, int16_t *_src2, int height, int denom, int _wx0, int _wx1, int _ox0, int _ox1) | |||
| #define WEIGHTING_PROTOTYPES(bitd, opt) \ | |||
| WEIGHTING_PROTOTYPE(2, bitd, opt); \ | |||
| @@ -427,7 +427,7 @@ mc_rep_funcs(qpel_hv,12, 8, 16, sse4); | |||
| mc_rep_funcs(qpel_hv,12, 4, 12, sse4); | |||
| #define mc_rep_uni_w(bitd, step, W, opt) \ | |||
| void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride,\ | |||
| void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \ | |||
| int height, int denom, int _wx, int _ox) \ | |||
| { \ | |||
| int i; \ | |||
| @@ -436,7 +436,7 @@ void ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststri | |||
| for (i = 0; i < W; i += step) { \ | |||
| src= _src + i; \ | |||
| dst= _dst + (i * ((bitd + 7) / 8)); \ | |||
| ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride, \ | |||
| ff_hevc_put_hevc_uni_w##step##_##bitd##_##opt(dst, dststride, src, \ | |||
| height, denom, _wx, _ox); \ | |||
| } \ | |||
| } | |||
| @@ -463,7 +463,7 @@ mc_rep_uni_w(12, 8, 48, sse4); | |||
| mc_rep_uni_w(12, 8, 64, sse4); | |||
| #define mc_rep_bi_w(bitd, step, W, opt) \ | |||
| void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, ptrdiff_t _srcstride, \ | |||
| void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststride, int16_t *_src, \ | |||
| int16_t *_src2, int height, \ | |||
| int denom, int _wx0, int _wx1, int _ox0, int _ox1) \ | |||
| { \ | |||
| @@ -475,8 +475,8 @@ void ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t dststrid | |||
| src = _src + i; \ | |||
| src2 = _src2 + i; \ | |||
| dst = _dst + (i * ((bitd + 7) / 8)); \ | |||
| ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, _srcstride, src2, \ | |||
| height, denom, _wx0, _wx1, _ox0, _ox1); \ | |||
| ff_hevc_put_hevc_bi_w##step##_##bitd##_##opt(dst, dststride, src, src2, \ | |||
| height, denom, _wx0, _wx1, _ox0, _ox1); \ | |||
| } \ | |||
| } | |||
| @@ -510,7 +510,7 @@ void ff_hevc_put_hevc_uni_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t | |||
| { \ | |||
| LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \ | |||
| ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \ | |||
| ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, height, denom, _wx, _ox);\ | |||
| ff_hevc_put_hevc_uni_w##W##_##bitd##_##opt(_dst, _dststride, temp, height, denom, _wx, _ox);\ | |||
| } | |||
| #define mc_uni_w_funcs(name, bitd, opt) \ | |||
| @@ -569,8 +569,8 @@ void ff_hevc_put_hevc_bi_w_##name##W##_##bitd##_##opt(uint8_t *_dst, ptrdiff_t _ | |||
| { \ | |||
| LOCAL_ALIGNED_16(int16_t, temp, [71 * MAX_PB_SIZE]); \ | |||
| ff_hevc_put_hevc_##name##W##_##bitd##_##opt(temp, _src, _srcstride, height, mx, my, width); \ | |||
| ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, MAX_PB_SIZE, _src2, \ | |||
| height, denom, _wx0, _wx1, _ox0, _ox1); \ | |||
| ff_hevc_put_hevc_bi_w##W##_##bitd##_##opt(_dst, _dststride, temp, _src2, \ | |||
| height, denom, _wx0, _wx1, _ox0, _ox1); \ | |||
| } | |||
| #define mc_bi_w_funcs(name, bitd, opt) \ | |||