|  |  | @@ -58,6 +58,13 @@ VP9_IDCT_COEFFS  8423, 14053 | 
		
	
		
			
			|  |  |  | VP9_IDCT_COEFFS 13160,  9760 | 
		
	
		
			
			|  |  |  | VP9_IDCT_COEFFS  2404, 16207 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			; Constant tables for the 4-point inverse ADST (VP9_IADST4_1D).
			; Each "times 4 dw a, b" row is 8 words alternating a, b, so that pmaddwd
			; against punpcklwd-interleaved (x, y) word pairs gives a*x + b*y per dword.
			; The "m" in a label marks a negated coefficient.
			; NOTE(review): 5283/9929/13377/15212 look like the VP9 sinpi_{1..4}_9
			; coefficients in 14-bit fixed point - confirm against the C reference.
			; pw_13377x2 holds 2*13377 in every word; with pmulhrsw (rounded >> 15)
			; this amounts to a rounded multiply by 13377 / 2^14.
			|  |  |  | pw_5283_13377: times 4 dw 5283, 13377 | 
		
	
		
			
			|  |  |  | pw_9929_13377: times 4 dw 9929, 13377 | 
		
	
		
			
			|  |  |  | pw_15212_m13377: times 4 dw 15212, -13377 | 
		
	
		
			
			|  |  |  | pw_15212_9929: times 4 dw 15212, 9929 | 
		
	
		
			
			|  |  |  | pw_m5283_m15212: times 4 dw -5283, -15212 | 
		
	
		
			
			|  |  |  | pw_13377x2: times 8 dw 13377*2 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | pd_8192: times 4 dd 8192 | 
		
	
		
			
			|  |  |  | pw_2048: times 8 dw 2048 | 
		
	
		
			
			|  |  |  | pw_1024: times 8 dw 1024 | 
		
	
	
		
			
				|  |  | @@ -239,6 +246,68 @@ cglobal vp9_idct_idct_4x4_add, 4,4,0, dst, stride, block, eob | 
		
	
		
			
			|  |  |  | VP9_IDCT4_WRITEOUT | 
		
	
		
			
			|  |  |  | RET | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | ;------------------------------------------------------------------------------------------- | 
		
	
		
			
			|  |  |  | ; void vp9_iadst_iadst_4x4_add_<opt>(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob); | 
		
	
		
			
			|  |  |  | ;------------------------------------------------------------------------------------------- | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			; VP9_IADST4_1D: one 1-D pass of the 4-point inverse ADST on four lanes at once.
			;
			; In:  m0..m3 = in0..in3, four 16-bit words each (one word per lane).
			; Out: m0..m3 = out0..out3 after the final SWAP (see the note there).
			; Clobbers xmm0-xmm4. It is written with explicit xmm names because the
			; 32-bit pmaddwd products of four lanes need 128 bits, while the callers
			; visible here keep their data in 64-bit m registers (INIT_MMX).
			;
			; Per lane, as computed by the instructions below:
			;   out0 = ( 5283*in0 + 13377*in1 + 15212*in2 +  9929*in3 + 8192) >> 14
			;   out1 = ( 9929*in0 + 13377*in1 -  5283*in2 - 15212*in3 + 8192) >> 14
			;   out2 = pmulhrsw(in0 - in2 + in3, 2*13377)   ; rounded *13377 >> 14
			;   out3 = (15212*in0 - 13377*in1 +  9929*in2 -  5283*in3 + 8192) >> 14
			; out0/out1/out3 are saturated to int16 by packssdw.
			|  |  |  | %macro VP9_IADST4_1D 0 | 
		
	
		
			
			; Copy the four inputs into the low halves of xmm0..xmm3.
			|  |  |  | movq2dq           xmm0, m0 | 
		
	
		
			
			|  |  |  | movq2dq           xmm1, m1 | 
		
	
		
			
			|  |  |  | movq2dq           xmm2, m2 | 
		
	
		
			
			|  |  |  | movq2dq           xmm3, m3 | 
		
	
		
			
			; m3 = in0 + in3 (16-bit); xmm3 above still holds the unmodified in3.
			|  |  |  | paddw               m3, m0 | 
		
	
		
			
			; Interleave to word pairs: xmm0 = (in0, in1), xmm2 = (in2, in3) per lane.
			|  |  |  | punpcklwd         xmm0, xmm1 | 
		
	
		
			
			|  |  |  | punpcklwd         xmm2, xmm3 | 
		
	
		
			
			; 32-bit partial sums from the pairs:
			;   xmm1 =  5283*in0 + 13377*in1
			;   xmm4 =  9929*in0 + 13377*in1
			;   xmm0 = 15212*in0 - 13377*in1
			;   xmm3 = 15212*in2 +  9929*in3
			;   xmm2 = -5283*in2 - 15212*in3
			; NOTE(review): the 3-operand pmaddwd forms rely on the x86inc instruction
			; wrappers when not assembling for AVX - confirm.
			|  |  |  | pmaddwd           xmm1, xmm0, [pw_5283_13377] | 
		
	
		
			
			|  |  |  | pmaddwd           xmm4, xmm0, [pw_9929_13377] | 
		
	
		
			
			|  |  |  | pmaddwd           xmm0, [pw_15212_m13377] | 
		
	
		
			
			|  |  |  | pmaddwd           xmm3, xmm2, [pw_15212_9929] | 
		
	
		
			
			|  |  |  | pmaddwd           xmm2, [pw_m5283_m15212] | 
		
	
		
			
			; m3 = in0 + in3 - in2, the 16-bit input of out2.
			|  |  |  | psubw               m3, m2 | 
		
	
		
			
			; xmm0 takes the not-yet-rounded xmm2 here; the single rounding term for
			; out3 arrives with xmm3 below, so this must stay before the pd_8192 adds.
			|  |  |  | paddd             xmm0, xmm2 | 
		
	
		
			
			; Fold the rounding constant (1 << 13) into the two (in2, in3) partial sums.
			|  |  |  | paddd             xmm3, [pd_8192] | 
		
	
		
			
			|  |  |  | paddd             xmm2, [pd_8192] | 
		
	
		
			
			; Combine the partial sums: xmm1 -> out0, xmm0 -> out3, xmm4 -> out1.
			|  |  |  | paddd             xmm1, xmm3 | 
		
	
		
			
			|  |  |  | paddd             xmm0, xmm3 | 
		
	
		
			
			|  |  |  | paddd             xmm4, xmm2 | 
		
	
		
			
			; Drop the 14 fractional bits (arithmetic shift; rounding was added above).
			|  |  |  | psrad             xmm1, 14 | 
		
	
		
			
			|  |  |  | psrad             xmm0, 14 | 
		
	
		
			
			|  |  |  | psrad             xmm4, 14 | 
		
	
		
			
			|  |  |  | pmulhrsw            m3, [pw_13377x2]        ; out2 | 
		
	
		
			
			; Narrow the 32-bit results back to 16-bit words with signed saturation.
			|  |  |  | packssdw          xmm0, xmm0 | 
		
	
		
			
			|  |  |  | packssdw          xmm1, xmm1 | 
		
	
		
			
			|  |  |  | packssdw          xmm4, xmm4 | 
		
	
		
			
			|  |  |  | movdq2q             m0, xmm0                ; out3 | 
		
	
		
			
			|  |  |  | movdq2q             m1, xmm1                ; out0 | 
		
	
		
			
			|  |  |  | movdq2q             m2, xmm4                ; out1 | 
		
	
		
			
			; At this point m0 = out3, m1 = out0, m2 = out1, m3 = out2.
			; NOTE(review): SWAP is defined outside this view (x86inc); it presumably
			; renames registers at assembly time, without emitting instructions, so
			; that m0..m3 end up naming out0..out3 - confirm.
			|  |  |  | SWAP                 0, 1, 2, 3 | 
		
	
		
			
			|  |  |  | %endmacro | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			; IADST4_FN: emits one 4x4 inverse-transform-and-add function.
			;   %1, %3 : transform names spliced into the symbol stem vp9_%1_%3_4x4_add
			;   %2     : selects the first  1-D pass macro, VP9_%2_1D (before the transpose)
			;   %4     : selects the second 1-D pass macro, VP9_%4_1D (after the transpose)
			;   %5     : instruction-set name handed to INIT_MMX
			; The body reads the 4x4 int16 coefficient block, runs pass 1, transposes,
			; runs pass 2, zeroes the coefficient block and hands over to
			; VP9_IDCT4_WRITEOUT.
			|  |  |  | %macro IADST4_FN 5 | 
		
	
		
			
			|  |  |  | INIT_MMX %5 | 
		
	
		
			
			; NOTE(review): per the x86inc cglobal convention the numbers are presumably
			; (args loaded, GPRs used, vector registers used) - confirm. eob is named in
			; the argument list but never referenced in this body.
			|  |  |  | cglobal vp9_%1_%3_4x4_add, 3, 3, 8, dst, stride, block, eob | 
		
	
		
			
			; Load the block as four 8-byte rows (4 words each) into m0..m3.
			|  |  |  | mova                m0, [blockq+ 0] | 
		
	
		
			
			|  |  |  | mova                m1, [blockq+ 8] | 
		
	
		
			
			|  |  |  | mova                m2, [blockq+16] | 
		
	
		
			
			|  |  |  | mova                m3, [blockq+24] | 
		
	
		
			
			; m6/m7 are not read by VP9_IADST4_1D (it uses memory operands);
			; presumably they feed VP9_IDCT4_1D, defined outside this view - TODO confirm.
			|  |  |  | mova                m6, [pw_11585x2] | 
		
	
		
			
			|  |  |  | mova                m7, [pd_8192]       ; rounding | 
		
	
		
			
			; First 1-D pass on m0..m3.
			|  |  |  | VP9_%2_1D | 
		
	
		
			
			; NOTE(review): TRANSPOSE4x4W is defined elsewhere; presumably it transposes
			; the 4x4 word matrix held in m0..m3 using m4 as scratch - confirm.
			|  |  |  | TRANSPOSE4x4W  0, 1, 2, 3, 4 | 
		
	
		
			
			; Second 1-D pass on the transposed data.
			|  |  |  | VP9_%4_1D | 
		
	
		
			
			|  |  |  | pxor                m4, m4  ; used for the block reset, and VP9_STORE_2X | 
		
	
		
			
			; Zero all 32 bytes of the coefficient block.
			|  |  |  | mova       [blockq+ 0], m4 | 
		
	
		
			
			|  |  |  | mova       [blockq+ 8], m4 | 
		
	
		
			
			|  |  |  | mova       [blockq+16], m4 | 
		
	
		
			
			|  |  |  | mova       [blockq+24], m4 | 
		
	
		
			
			; NOTE(review): VP9_IDCT4_WRITEOUT is defined outside this view; presumably it
			; applies the final rounding and adds the residual in m0..m3 to dst - confirm.
			|  |  |  | VP9_IDCT4_WRITEOUT | 
		
	
		
			
			|  |  |  | RET | 
		
	
		
			
			|  |  |  | %endmacro | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			; Instantiate the three 4x4 variants that involve an ADST pass, all for ssse3.
			; Symbol stems produced: vp9_idct_iadst_4x4_add, vp9_iadst_idct_4x4_add and
			; vp9_iadst_iadst_4x4_add (any prefix/suffix is up to cglobal).
			; Arguments 2 and 4 pick the 1-D macro run before / after the transpose.
			|  |  |  | IADST4_FN idct,  IDCT4,  iadst, IADST4, ssse3 | 
		
	
		
			
			|  |  |  | IADST4_FN iadst, IADST4, idct,  IDCT4,  ssse3 | 
		
	
		
			
			|  |  |  | IADST4_FN iadst, IADST4, iadst, IADST4, ssse3 | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | %if ARCH_X86_64 ; TODO: 32-bit? (32-bit limited to 8 xmm reg, we use more) | 
		
	
		
			
			|  |  |  | 
 | 
		
	
		
			
			|  |  |  | ;------------------------------------------------------------------------------------------- | 
		
	
	
		
			
				|  |  | 
 |