Use this in the VP8 and H.264 8-bit loop filter functions so that they can be used when the stack is not guaranteed to be aligned (e.g. 32-bit MSVC or ICC 10.x).
Signed-off-by: Luca Barbato <lu_zero@gentoo.org>
tags/n1.1
| @@ -398,14 +398,12 @@ DEBLOCK_LUMA | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | ; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| cglobal deblock_%1_luma_8, 5,5 | |||||
| cglobal deblock_%1_luma_8, 5,5,8,2*%2 | |||||
| lea r4, [r1*3] | lea r4, [r1*3] | ||||
| dec r2 ; alpha-1 | dec r2 ; alpha-1 | ||||
| neg r4 | neg r4 | ||||
| dec r3 ; beta-1 | dec r3 ; beta-1 | ||||
| add r4, r0 ; pix-3*stride | add r4, r0 ; pix-3*stride | ||||
| %assign pad 2*%2+12-(stack_offset&15) | |||||
| SUB esp, pad | |||||
| mova m0, [r4+r1] ; p1 | mova m0, [r4+r1] ; p1 | ||||
| mova m1, [r4+2*r1] ; p0 | mova m1, [r4+2*r1] ; p0 | ||||
| @@ -443,22 +441,19 @@ cglobal deblock_%1_luma_8, 5,5 | |||||
| DEBLOCK_P0_Q0 | DEBLOCK_P0_Q0 | ||||
| mova [r4+2*r1], m1 | mova [r4+2*r1], m1 | ||||
| mova [r0], m2 | mova [r0], m2 | ||||
| ADD esp, pad | |||||
| RET | RET | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | ; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| INIT_MMX cpuname | INIT_MMX cpuname | ||||
| cglobal deblock_h_luma_8, 0,5 | |||||
| cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 | |||||
| mov r0, r0mp | mov r0, r0mp | ||||
| mov r3, r1m | mov r3, r1m | ||||
| lea r4, [r3*3] | lea r4, [r3*3] | ||||
| sub r0, 4 | sub r0, 4 | ||||
| lea r1, [r0+r4] | lea r1, [r0+r4] | ||||
| %assign pad 0x78-(stack_offset&15) | |||||
| SUB esp, pad | |||||
| %define pix_tmp esp+12 | |||||
| %define pix_tmp esp+12*HAVE_ALIGNED_STACK | |||||
| ; transpose 6x16 -> tmp space | ; transpose 6x16 -> tmp space | ||||
| TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp | TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp | ||||
| @@ -500,7 +495,6 @@ cglobal deblock_h_luma_8, 0,5 | |||||
| movq m3, [pix_tmp+0x48] | movq m3, [pix_tmp+0x48] | ||||
| TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) | TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) | ||||
| ADD esp, pad | |||||
| RET | RET | ||||
| %endmacro ; DEBLOCK_LUMA | %endmacro ; DEBLOCK_LUMA | ||||
| @@ -631,7 +625,7 @@ DEBLOCK_LUMA v, 16 | |||||
| %define mpb_0 m14 | %define mpb_0 m14 | ||||
| %define mpb_1 m15 | %define mpb_1 m15 | ||||
| %else | %else | ||||
| %define spill(x) [esp+16*x+((stack_offset+4)&15)] | |||||
| %define spill(x) [esp+16*x] | |||||
| %define p2 [r4+r1] | %define p2 [r4+r1] | ||||
| %define q2 [r0+2*r1] | %define q2 [r0+2*r1] | ||||
| %define t4 spill(0) | %define t4 spill(0) | ||||
| @@ -646,10 +640,7 @@ DEBLOCK_LUMA v, 16 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) | ; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) | ||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| cglobal deblock_%1_luma_intra_8, 4,6,16 | |||||
| %if ARCH_X86_64 == 0 | |||||
| sub esp, 0x60 | |||||
| %endif | |||||
| cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50 | |||||
| lea r4, [r1*4] | lea r4, [r1*4] | ||||
| lea r5, [r1*3] ; 3*stride | lea r5, [r1*3] ; 3*stride | ||||
| dec r2d ; alpha-1 | dec r2d ; alpha-1 | ||||
| @@ -698,9 +689,6 @@ cglobal deblock_%1_luma_intra_8, 4,6,16 | |||||
| LUMA_INTRA_SWAP_PQ | LUMA_INTRA_SWAP_PQ | ||||
| LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] | LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] | ||||
| .end: | .end: | ||||
| %if ARCH_X86_64 == 0 | |||||
| add esp, 0x60 | |||||
| %endif | |||||
| RET | RET | ||||
| INIT_MMX cpuname | INIT_MMX cpuname | ||||
| @@ -737,12 +725,10 @@ cglobal deblock_h_luma_intra_8, 4,9 | |||||
| add rsp, 0x88 | add rsp, 0x88 | ||||
| RET | RET | ||||
| %else | %else | ||||
| cglobal deblock_h_luma_intra_8, 2,4 | |||||
| cglobal deblock_h_luma_intra_8, 2,4,8,0x80 | |||||
| lea r3, [r1*3] | lea r3, [r1*3] | ||||
| sub r0, 4 | sub r0, 4 | ||||
| lea r2, [r0+r3] | lea r2, [r0+r3] | ||||
| %assign pad 0x8c-(stack_offset&15) | |||||
| SUB rsp, pad | |||||
| %define pix_tmp rsp | %define pix_tmp rsp | ||||
| ; transpose 8x16 -> tmp space | ; transpose 8x16 -> tmp space | ||||
| @@ -773,7 +759,6 @@ cglobal deblock_h_luma_intra_8, 2,4 | |||||
| lea r0, [r0+r1*8] | lea r0, [r0+r1*8] | ||||
| lea r2, [r2+r1*8] | lea r2, [r2+r1*8] | ||||
| TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) | TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) | ||||
| ADD rsp, pad | |||||
| RET | RET | ||||
| %endif ; ARCH_X86_64 | %endif ; ARCH_X86_64 | ||||
| %endmacro ; DEBLOCK_LUMA_INTRA | %endmacro ; DEBLOCK_LUMA_INTRA | ||||
| @@ -275,18 +275,16 @@ void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, | |||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; | c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; | ||||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; | c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; | ||||
| #if HAVE_ALIGNED_STACK | |||||
| c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; | c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; | ||||
| c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; | c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; | ||||
| c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; | c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; | ||||
| c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; | c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; | ||||
| #endif /* HAVE_ALIGNED_STACK */ | |||||
| } | } | ||||
| if (EXTERNAL_SSSE3(mm_flags)) { | if (EXTERNAL_SSSE3(mm_flags)) { | ||||
| c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; | c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; | ||||
| c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; | c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; | ||||
| } | } | ||||
| if (EXTERNAL_AVX(mm_flags) && HAVE_ALIGNED_STACK) { | |||||
| if (EXTERNAL_AVX(mm_flags)) { | |||||
| c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; | c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; | ||||
| c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; | c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; | ||||
| c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; | c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; | ||||
| @@ -1631,28 +1631,31 @@ SIMPLE_LOOPFILTER h, 5 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro INNER_LOOPFILTER 2 | %macro INNER_LOOPFILTER 2 | ||||
| %define stack_size 0 | |||||
| %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | |||||
| %ifidn %1, v ; [3]=hev() result | |||||
| %define stack_size mmsize * -4 | |||||
| %else ; h ; extra storage space for transposes | |||||
| %define stack_size mmsize * -5 | |||||
| %endif | |||||
| %endif | |||||
| %if %2 == 8 ; chroma | %if %2 == 8 ; chroma | ||||
| cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, dst, dst8, stride, flimE, flimI, hevthr | |||||
| cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr | |||||
| %else ; luma | %else ; luma | ||||
| cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr | |||||
| cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr | |||||
| %endif | %endif | ||||
| %if cpuflag(ssse3) | %if cpuflag(ssse3) | ||||
| pxor m7, m7 | pxor m7, m7 | ||||
| %endif | %endif | ||||
| %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | |||||
| %ifidn %1, v ; [3]=hev() result | |||||
| %assign pad 16 + mmsize * 4 - gprsize - (stack_offset & 15) | |||||
| %else ; h ; extra storage space for transposes | |||||
| %assign pad 16 + mmsize * 5 - gprsize - (stack_offset & 15) | |||||
| %endif | |||||
| %ifndef m8 | |||||
| ; splat function arguments | ; splat function arguments | ||||
| SPLATB_REG m0, flimEq, m7 ; E | SPLATB_REG m0, flimEq, m7 ; E | ||||
| SPLATB_REG m1, flimIq, m7 ; I | SPLATB_REG m1, flimIq, m7 ; I | ||||
| SPLATB_REG m2, hevthrq, m7 ; hev_thresh | SPLATB_REG m2, hevthrq, m7 ; hev_thresh | ||||
| SUB rsp, pad | |||||
| %define m_flimE [rsp] | %define m_flimE [rsp] | ||||
| %define m_flimI [rsp+mmsize] | %define m_flimI [rsp+mmsize] | ||||
| %define m_hevthr [rsp+mmsize*2] | %define m_hevthr [rsp+mmsize*2] | ||||
| @@ -2082,12 +2085,10 @@ cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, dst, stride, flimE, flimI, hevthr | |||||
| dec cntrq | dec cntrq | ||||
| jg .next8px | jg .next8px | ||||
| %endif | %endif | ||||
| %endif | |||||
| %ifndef m8 ; sse2 on x86-32 or mmx/mmxext | |||||
| ADD rsp, pad | |||||
| %endif | |||||
| REP_RET | |||||
| %else ; mmsize == 16 | |||||
| RET | RET | ||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| %if ARCH_X86_32 | %if ARCH_X86_32 | ||||
| @@ -2122,31 +2123,34 @@ INNER_LOOPFILTER h, 8 | |||||
| ;----------------------------------------------------------------------------- | ;----------------------------------------------------------------------------- | ||||
| %macro MBEDGE_LOOPFILTER 2 | %macro MBEDGE_LOOPFILTER 2 | ||||
| %if %2 == 8 ; chroma | |||||
| cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, dst1, dst8, stride, flimE, flimI, hevthr | |||||
| %else ; luma | |||||
| cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevthr | |||||
| %endif | |||||
| %if cpuflag(ssse3) | |||||
| pxor m7, m7 | |||||
| %endif | |||||
| %define stack_size 0 | |||||
| %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | %ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr | ||||
| %if mmsize == 16 ; [3]=hev() result | %if mmsize == 16 ; [3]=hev() result | ||||
| ; [4]=filter tmp result | ; [4]=filter tmp result | ||||
| ; [5]/[6] = p2/q2 backup | ; [5]/[6] = p2/q2 backup | ||||
| ; [7]=lim_res sign result | ; [7]=lim_res sign result | ||||
| %assign pad 16 + mmsize * 7 - gprsize - (stack_offset & 15) | |||||
| %define stack_size mmsize * -7 | |||||
| %else ; 8 ; extra storage space for transposes | %else ; 8 ; extra storage space for transposes | ||||
| %assign pad 16 + mmsize * 8 - gprsize - (stack_offset & 15) | |||||
| %define stack_size mmsize * -8 | |||||
| %endif | |||||
| %endif | %endif | ||||
| %if %2 == 8 ; chroma | |||||
| cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr | |||||
| %else ; luma | |||||
| cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr | |||||
| %endif | |||||
| %if cpuflag(ssse3) | |||||
| pxor m7, m7 | |||||
| %endif | |||||
| %ifndef m8 | |||||
| ; splat function arguments | ; splat function arguments | ||||
| SPLATB_REG m0, flimEq, m7 ; E | SPLATB_REG m0, flimEq, m7 ; E | ||||
| SPLATB_REG m1, flimIq, m7 ; I | SPLATB_REG m1, flimIq, m7 ; I | ||||
| SPLATB_REG m2, hevthrq, m7 ; hev_thresh | SPLATB_REG m2, hevthrq, m7 ; hev_thresh | ||||
| SUB rsp, pad | |||||
| %define m_flimE [rsp] | %define m_flimE [rsp] | ||||
| %define m_flimI [rsp+mmsize] | %define m_flimI [rsp+mmsize] | ||||
| %define m_hevthr [rsp+mmsize*2] | %define m_hevthr [rsp+mmsize*2] | ||||
| @@ -2740,12 +2744,10 @@ cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, dst1, stride, flimE, flimI, hevt | |||||
| dec cntrq | dec cntrq | ||||
| jg .next8px | jg .next8px | ||||
| %endif | %endif | ||||
| %endif | |||||
| %ifndef m8 ; sse2 on x86-32 or mmx/mmxext | |||||
| ADD rsp, pad | |||||
| %endif | |||||
| REP_RET | |||||
| %else ; mmsize == 16 | |||||
| RET | RET | ||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| %if ARCH_X86_32 | %if ARCH_X86_32 | ||||
| @@ -390,13 +390,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; | c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; | ||||
| #if ARCH_X86_64 || HAVE_ALIGNED_STACK | |||||
| c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; | c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; | ||||
| c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; | c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; | ||||
| c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; | c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; | ||||
| c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; | c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; | ||||
| #endif | |||||
| } | } | ||||
| if (mm_flags & AV_CPU_FLAG_SSE2) { | if (mm_flags & AV_CPU_FLAG_SSE2) { | ||||
| @@ -404,13 +402,11 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; | c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; | ||||
| #if ARCH_X86_64 || HAVE_ALIGNED_STACK | |||||
| c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; | ||||
| c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; | ||||
| c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; | c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; | ||||
| c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; | c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; | ||||
| #endif | |||||
| } | } | ||||
| if (mm_flags & AV_CPU_FLAG_SSSE3) { | if (mm_flags & AV_CPU_FLAG_SSSE3) { | ||||
| @@ -424,7 +420,6 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; | c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; | ||||
| c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; | c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; | ||||
| #if ARCH_X86_64 || HAVE_ALIGNED_STACK | |||||
| c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; | c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; | ||||
| c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; | c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; | ||||
| c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; | c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; | ||||
| @@ -434,17 +429,14 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c) | |||||
| c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; | c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; | ||||
| c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; | c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; | ||||
| c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; | c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; | ||||
| #endif | |||||
| } | } | ||||
| if (mm_flags & AV_CPU_FLAG_SSE4) { | if (mm_flags & AV_CPU_FLAG_SSE4) { | ||||
| c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; | c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; | ||||
| c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; | c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; | ||||
| #if ARCH_X86_64 || HAVE_ALIGNED_STACK | |||||
| c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; | c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; | ||||
| c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; | c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; | ||||
| #endif | |||||
| } | } | ||||
| #endif /* HAVE_YASM */ | #endif /* HAVE_YASM */ | ||||
| } | } | ||||
| @@ -111,7 +111,12 @@ CPUNOP amdnop | |||||
| ; %1 = number of arguments. loads them from stack if needed. | ; %1 = number of arguments. loads them from stack if needed. | ||||
| ; %2 = number of registers used. pushes callee-saved regs if needed. | ; %2 = number of registers used. pushes callee-saved regs if needed. | ||||
| ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. | ; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed. | ||||
| ; %4 = list of names to define to registers | |||||
| ; %4 = (optional) stack size to be allocated. If the stack is not known to be | |||||
| ; sufficiently aligned (x86-32 with ICC 10.x or MSVC, or code using YMM | |||||
| ; registers), it will be manually aligned (to 16 or 32 bytes), and an | |||||
| ; extra register will be allocated to hold the original stack pointer | |||||
| ; (so that r0m etc. remain valid). To avoid using an extra register for | |||||
| ; the original stack pointer, request a negative stack size. | |||||
| ; %4+/%5+ = list of names to define to registers (%5+ when %4 is a stack size) | |||||
| ; PROLOGUE can also be invoked by adding the same options to cglobal | ; PROLOGUE can also be invoked by adding the same options to cglobal | ||||
| ; e.g. | ; e.g. | ||||
| @@ -147,11 +152,11 @@ CPUNOP amdnop | |||||
| %define r%1m %2d | %define r%1m %2d | ||||
| %define r%1mp %2 | %define r%1mp %2 | ||||
| %elif ARCH_X86_64 ; memory | %elif ARCH_X86_64 ; memory | ||||
| %define r%1m [rsp + stack_offset + %3] | |||||
| %define r%1mp qword r %+ %1 %+ m | |||||
| %define r%1m [rstk + stack_offset + %3] | |||||
| %define r%1mp qword r %+ %1m | |||||
| %else | %else | ||||
| %define r%1m [esp + stack_offset + %3] | |||||
| %define r%1mp dword r %+ %1 %+ m | |||||
| %define r%1m [rstk + stack_offset + %3] | |||||
| %define r%1mp dword r %+ %1m | |||||
| %endif | %endif | ||||
| %define r%1 %2 | %define r%1 %2 | ||||
| %endmacro | %endmacro | ||||
| @@ -212,12 +217,16 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 | |||||
| %macro PUSH 1 | %macro PUSH 1 | ||||
| push %1 | push %1 | ||||
| %assign stack_offset stack_offset+gprsize | |||||
| %ifidn rstk, rsp | |||||
| %assign stack_offset stack_offset+gprsize | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| %macro POP 1 | %macro POP 1 | ||||
| pop %1 | pop %1 | ||||
| %assign stack_offset stack_offset-gprsize | |||||
| %ifidn rstk, rsp | |||||
| %assign stack_offset stack_offset-gprsize | |||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| %macro PUSH_IF_USED 1-* | %macro PUSH_IF_USED 1-* | ||||
| @@ -249,14 +258,14 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 | |||||
| %macro SUB 2 | %macro SUB 2 | ||||
| sub %1, %2 | sub %1, %2 | ||||
| %ifidn %1, rsp | |||||
| %ifidn %1, rstk | |||||
| %assign stack_offset stack_offset+(%2) | %assign stack_offset stack_offset+(%2) | ||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| %macro ADD 2 | %macro ADD 2 | ||||
| add %1, %2 | add %1, %2 | ||||
| %ifidn %1, rsp | |||||
| %ifidn %1, rstk | |||||
| %assign stack_offset stack_offset-(%2) | %assign stack_offset stack_offset-(%2) | ||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| @@ -314,6 +323,73 @@ DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 | |||||
| %assign n_arg_names %0 | %assign n_arg_names %0 | ||||
| %endmacro | %endmacro | ||||
; ALLOC_STACK: reserve stack space for the function being assembled.
;   %1 = requested size in bytes. The macro expands to nothing unless %1 is a
;        non-zero number. stack_size always receives the absolute value; the
;        sign of %1 only matters on the realigning path below.
;   %2 = number of xmm registers used (defaults to 0). It is copied into
;        xmm_regs_used; every register above 6 adds 16 bytes to
;        stack_size_padded and causes WIN64_PUSH_XMM to be invoked.
; Assigns stack_size, stack_size_padded and xmm_regs_used. On the realigning
; path it also redefines rstk and defines rstkm.
| %macro ALLOC_STACK 1-2 0 ; stack_size, n_xmm_regs (for win64 only) | |||||
| %ifnum %1 | |||||
| %if %1 != 0 | |||||
; 16 for mmsize 8 or 16, 32 for mmsize 32
| %assign %%stack_alignment ((mmsize + 15) & ~15) | |||||
| %assign stack_size %1 | |||||
| %if stack_size < 0 | |||||
| %assign stack_size -stack_size | |||||
| %endif | |||||
| %assign xmm_regs_used %2 | |||||
| %if mmsize <= 16 && HAVE_ALIGNED_STACK | |||||
; No runtime realignment here: the padding is derived from stack_offset and
; gprsize at assembly time, and the SUB macro keeps stack_offset up to date.
| %assign stack_size_padded stack_size + %%stack_alignment - gprsize - (stack_offset & (%%stack_alignment - 1)) | |||||
| %if xmm_regs_used > 6 | |||||
| %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 | |||||
| %endif | |||||
| SUB rsp, stack_size_padded | |||||
| %else | |||||
; Realigning path: rstk is rebound to the highest-numbered register counted in
; regs_used, and plain sub/and are used, so stack_offset is left untouched.
| %assign reg_num (regs_used - 1) | |||||
| %xdefine rstk r %+ reg_num | |||||
| ; align stack, and save original stack location directly above | |||||
| ; it, i.e. in [rsp+stack_size_padded], so we can restore the | |||||
| ; stack in a single instruction (i.e. mov rsp, rstk or mov | |||||
| ; rsp, [rsp+stack_size_padded]) | |||||
| mov rstk, rsp | |||||
| %assign stack_size_padded stack_size | |||||
| %if xmm_regs_used > 6 | |||||
| %assign stack_size_padded stack_size_padded + (xmm_regs_used - 6) * 16 | |||||
| %endif | |||||
| %if %1 < 0 ; need to store rsp on stack | |||||
; one extra gprsize slot is reserved to hold the saved stack pointer
| sub rsp, gprsize+stack_size_padded | |||||
| and rsp, ~(%%stack_alignment-1) | |||||
| %xdefine rstkm [rsp+stack_size_padded] | |||||
| mov rstkm, rstk | |||||
| %else ; can keep rsp in rstk during whole function | |||||
| sub rsp, stack_size_padded | |||||
| and rsp, ~(%%stack_alignment-1) | |||||
| %xdefine rstkm rstk | |||||
| %endif | |||||
| %endif | |||||
| %if xmm_regs_used > 6 | |||||
| WIN64_PUSH_XMM | |||||
| %endif | |||||
| %endif | |||||
| %endif | |||||
| %endmacro | |||||
; SETUP_STACK_POINTER: adjust regs_used before registers are pushed.
;   %1 = the stack-size argument given to PROLOGUE; ignored unless it is a
;        non-zero number.
; Only acts when HAVE_ALIGNED_STACK is 0 or mmsize is 32:
;   - positive %1: regs_used is incremented by one (ALLOC_STACK then binds
;     rstk to r<regs_used-1> on its realigning path);
;   - negative %1: regs_used is left alone, and on x86-64 a warning is emitted
;     when regs_used == num_args and num_args <= 4 + UNIX64*2.
| %macro SETUP_STACK_POINTER 1 | |||||
| %ifnum %1 | |||||
| %if %1 != 0 && (HAVE_ALIGNED_STACK == 0 || mmsize == 32) | |||||
| %if %1 > 0 | |||||
| %assign regs_used (regs_used + 1) | |||||
| %elif ARCH_X86_64 && regs_used == num_args && num_args <= 4 + UNIX64 * 2 | |||||
| %warning "Stack pointer will overwrite register argument" | |||||
| %endif | |||||
| %endif | |||||
| %endif | |||||
| %endmacro | |||||
; DEFINE_ARGS_INTERNAL: forward the argument-name list to DEFINE_ARGS, skipping
; an optional numeric stack size. The PROLOGUE variants invoke it as
; "DEFINE_ARGS_INTERNAL %0, %4, %5".
;   %1 = number of parameters the caller (PROLOGUE) received
;   %2 = PROLOGUE's 4th parameter: either a stack size or the first name
;   %3 = the remaining parameters
; If %2 is a number it is dropped and only %3 is passed on. Otherwise %2 alone
; is passed when %1 == 4, and "%2, %3" when %1 > 4. Nothing is defined when
; %2 is not a number and %1 < 4.
| %macro DEFINE_ARGS_INTERNAL 3+ | |||||
| %ifnum %2 | |||||
| DEFINE_ARGS %3 | |||||
| %elif %1 == 4 | |||||
| DEFINE_ARGS %2 | |||||
| %elif %1 > 4 | |||||
| DEFINE_ARGS %2, %3 | |||||
| %endif | |||||
| %endmacro | |||||
| %if WIN64 ; Windows x64 ;================================================= | %if WIN64 ; Windows x64 ;================================================= | ||||
| DECLARE_REG 0, rcx | DECLARE_REG 0, rcx | ||||
| @@ -332,31 +408,37 @@ DECLARE_REG 12, R13, 104 | |||||
| DECLARE_REG 13, R14, 112 | DECLARE_REG 13, R14, 112 | ||||
| DECLARE_REG 14, R15, 120 | DECLARE_REG 14, R15, 120 | ||||
| %macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names... | |||||
| %macro PROLOGUE 2-5+ 0 ; #args, #regs, #xmm_regs, [stack_size,] arg_names... | |||||
| %assign num_args %1 | %assign num_args %1 | ||||
| %assign regs_used %2 | %assign regs_used %2 | ||||
| SETUP_STACK_POINTER %4 | |||||
| ASSERT regs_used >= num_args | ASSERT regs_used >= num_args | ||||
| ASSERT regs_used <= 15 | ASSERT regs_used <= 15 | ||||
| PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 | PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14 | ||||
| %if mmsize == 8 | |||||
| %assign xmm_regs_used 0 | |||||
| %else | |||||
| %assign xmm_regs_used 0 | |||||
| ALLOC_STACK %4, %3 | |||||
| %if mmsize != 8 && stack_size == 0 | |||||
| WIN64_SPILL_XMM %3 | WIN64_SPILL_XMM %3 | ||||
| %endif | %endif | ||||
| LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 | LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14 | ||||
| DEFINE_ARGS %4 | |||||
| DEFINE_ARGS_INTERNAL %0, %4, %5 | |||||
| %endmacro | |||||
; WIN64_PUSH_XMM: store xmm registers 6 .. xmm_regs_used-1 on the stack.
; Register i goes to [rsp + (i-6)*16 + stack_size], i.e. directly above the
; stack_size bytes, and movdqa requires that address to be 16-byte aligned.
; The loop counts down from xmm_regs_used-1. Both callers visible here only
; invoke it when xmm_regs_used > 6.
| %macro WIN64_PUSH_XMM 0 | |||||
| %assign %%i xmm_regs_used | |||||
| %rep (xmm_regs_used-6) | |||||
| %assign %%i %%i-1 | |||||
| movdqa [rsp + (%%i-6)*16 + stack_size], xmm %+ %%i | |||||
| %endrep | |||||
| %endmacro | |||||
| %macro WIN64_SPILL_XMM 1 | %macro WIN64_SPILL_XMM 1 | ||||
| %assign xmm_regs_used %1 | %assign xmm_regs_used %1 | ||||
| ASSERT xmm_regs_used <= 16 | ASSERT xmm_regs_used <= 16 | ||||
| %if xmm_regs_used > 6 | %if xmm_regs_used > 6 | ||||
| SUB rsp, (xmm_regs_used-6)*16+16 | |||||
| %assign %%i xmm_regs_used | |||||
| %rep (xmm_regs_used-6) | |||||
| %assign %%i %%i-1 | |||||
| movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i | |||||
| %endrep | |||||
| %assign stack_size_padded (xmm_regs_used-6)*16+16-gprsize-(stack_offset&15) | |||||
| SUB rsp, stack_size_padded | |||||
| WIN64_PUSH_XMM | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| @@ -365,19 +447,25 @@ DECLARE_REG 14, R15, 120 | |||||
| %assign %%i xmm_regs_used | %assign %%i xmm_regs_used | ||||
| %rep (xmm_regs_used-6) | %rep (xmm_regs_used-6) | ||||
| %assign %%i %%i-1 | %assign %%i %%i-1 | ||||
| movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)] | |||||
| movdqa xmm %+ %%i, [%1 + (%%i-6)*16+stack_size] | |||||
| %endrep | %endrep | ||||
| add %1, (xmm_regs_used-6)*16+16 | |||||
| %endif | |||||
| %if stack_size_padded > 0 | |||||
| %if stack_size > 0 && (mmsize == 32 || HAVE_ALIGNED_STACK == 0) | |||||
| mov rsp, rstkm | |||||
| %else | |||||
| add %1, stack_size_padded | |||||
| %endif | |||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||
| %macro WIN64_RESTORE_XMM 1 | %macro WIN64_RESTORE_XMM 1 | ||||
| WIN64_RESTORE_XMM_INTERNAL %1 | WIN64_RESTORE_XMM_INTERNAL %1 | ||||
| %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16 | |||||
| %assign stack_offset (stack_offset-stack_size_padded) | |||||
| %assign xmm_regs_used 0 | %assign xmm_regs_used 0 | ||||
| %endmacro | %endmacro | ||||
| %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 | |||||
| %define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32 || stack_size > 0 | |||||
| %macro RET 0 | %macro RET 0 | ||||
| WIN64_RESTORE_XMM_INTERNAL rsp | WIN64_RESTORE_XMM_INTERNAL rsp | ||||
| @@ -406,19 +494,28 @@ DECLARE_REG 12, R13, 56 | |||||
| DECLARE_REG 13, R14, 64 | DECLARE_REG 13, R14, 64 | ||||
| DECLARE_REG 14, R15, 72 | DECLARE_REG 14, R15, 72 | ||||
| %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | |||||
| %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... | |||||
| %assign num_args %1 | %assign num_args %1 | ||||
| %assign regs_used %2 | %assign regs_used %2 | ||||
| SETUP_STACK_POINTER %4 | |||||
| ASSERT regs_used >= num_args | ASSERT regs_used >= num_args | ||||
| ASSERT regs_used <= 15 | ASSERT regs_used <= 15 | ||||
| PUSH_IF_USED 9, 10, 11, 12, 13, 14 | PUSH_IF_USED 9, 10, 11, 12, 13, 14 | ||||
| ALLOC_STACK %4 | |||||
| LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 | LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14 | ||||
| DEFINE_ARGS %4 | |||||
| DEFINE_ARGS_INTERNAL %0, %4, %5 | |||||
| %endmacro | %endmacro | ||||
| %define has_epilogue regs_used > 9 || mmsize == 32 | |||||
| %define has_epilogue regs_used > 9 || mmsize == 32 || stack_size > 0 | |||||
| %macro RET 0 | %macro RET 0 | ||||
| %if stack_size_padded > 0 | |||||
| %if mmsize == 32 || HAVE_ALIGNED_STACK == 0 | |||||
| mov rsp, rstkm | |||||
| %else | |||||
| add rsp, stack_size_padded | |||||
| %endif | |||||
| %endif | |||||
| POP_IF_USED 14, 13, 12, 11, 10, 9 | POP_IF_USED 14, 13, 12, 11, 10, 9 | ||||
| %if mmsize == 32 | %if mmsize == 32 | ||||
| vzeroupper | vzeroupper | ||||
| @@ -439,7 +536,7 @@ DECLARE_REG 6, ebp, 28 | |||||
| %macro DECLARE_ARG 1-* | %macro DECLARE_ARG 1-* | ||||
| %rep %0 | %rep %0 | ||||
| %define r%1m [esp + stack_offset + 4*%1 + 4] | |||||
| %define r%1m [rstk + stack_offset + 4*%1 + 4] | |||||
| %define r%1mp dword r%1m | %define r%1mp dword r%1m | ||||
| %rotate 1 | %rotate 1 | ||||
| %endrep | %endrep | ||||
| @@ -447,24 +544,31 @@ DECLARE_REG 6, ebp, 28 | |||||
| DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | ||||
| %macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names... | |||||
| %macro PROLOGUE 2-5+ ; #args, #regs, #xmm_regs, [stack_size,] arg_names... | |||||
| %assign num_args %1 | %assign num_args %1 | ||||
| %assign regs_used %2 | %assign regs_used %2 | ||||
| %if num_args > 7 | |||||
| %assign num_args 7 | |||||
| %endif | |||||
| %if regs_used > 7 | %if regs_used > 7 | ||||
| %assign regs_used 7 | %assign regs_used 7 | ||||
| %endif | %endif | ||||
| SETUP_STACK_POINTER %4 | |||||
| ASSERT regs_used <= 7 | |||||
| ASSERT regs_used >= num_args | ASSERT regs_used >= num_args | ||||
| PUSH_IF_USED 3, 4, 5, 6 | PUSH_IF_USED 3, 4, 5, 6 | ||||
| ALLOC_STACK %4 | |||||
| LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 | LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6 | ||||
| DEFINE_ARGS %4 | |||||
| DEFINE_ARGS_INTERNAL %0, %4, %5 | |||||
| %endmacro | %endmacro | ||||
| %define has_epilogue regs_used > 3 || mmsize == 32 | |||||
| %define has_epilogue regs_used > 3 || mmsize == 32 || stack_size > 0 | |||||
| %macro RET 0 | %macro RET 0 | ||||
| %if stack_size_padded > 0 | |||||
| %if mmsize == 32 || HAVE_ALIGNED_STACK == 0 | |||||
| mov rsp, rstkm | |||||
| %else | |||||
| add rsp, stack_size_padded | |||||
| %endif | |||||
| %endif | |||||
| POP_IF_USED 6, 5, 4, 3 | POP_IF_USED 6, 5, 4, 3 | ||||
| %if mmsize == 32 | %if mmsize == 32 | ||||
| vzeroupper | vzeroupper | ||||
| @@ -479,6 +583,8 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | |||||
| %endmacro | %endmacro | ||||
; Empty definitions: both macros expand to nothing here.
; NOTE(review): presumably the fallbacks for targets other than WIN64; the
; enclosing %if is outside this hunk, so confirm against the full file.
| %macro WIN64_RESTORE_XMM 1 | %macro WIN64_RESTORE_XMM 1 | ||||
| %endmacro | %endmacro | ||||
| %macro WIN64_PUSH_XMM 0 | |||||
| %endmacro | |||||
| %endif | %endif | ||||
| %macro REP_RET 0 | %macro REP_RET 0 | ||||
| @@ -508,8 +614,12 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | |||||
| ; Applies any symbol mangling needed for C linkage, and sets up a define such that | ; Applies any symbol mangling needed for C linkage, and sets up a define such that | ||||
| ; subsequent uses of the function name automatically refer to the mangled version. | ; subsequent uses of the function name automatically refer to the mangled version. | ||||
| ; Appends cpuflags to the function name if cpuflags has been specified. | ; Appends cpuflags to the function name if cpuflags has been specified. | ||||
| %macro cglobal 1-2+ "" ; name, [PROLOGUE args] | |||||
| %macro cglobal 1-2+ ; name, [PROLOGUE args] | |||||
| %if %0 == 1 | |||||
| cglobal_internal %1 %+ SUFFIX | |||||
| %else | |||||
| cglobal_internal %1 %+ SUFFIX, %2 | cglobal_internal %1 %+ SUFFIX, %2 | ||||
| %endif | |||||
| %endmacro | %endmacro | ||||
| %macro cglobal_internal 1-2+ | %macro cglobal_internal 1-2+ | ||||
| %ifndef cglobaled_%1 | %ifndef cglobaled_%1 | ||||
| @@ -526,8 +636,11 @@ DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14 | |||||
| align function_align | align function_align | ||||
| %1: | %1: | ||||
| RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer | RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer | ||||
| %xdefine rstk rsp | |||||
| %assign stack_offset 0 | %assign stack_offset 0 | ||||
| %ifnidn %2, "" | |||||
| %assign stack_size 0 | |||||
| %assign stack_size_padded 0 | |||||
| %if %0 > 1 | |||||
| PROLOGUE %2 | PROLOGUE %2 | ||||
| %endif | %endif | ||||
| %endmacro | %endmacro | ||||