| 
							- /* Modified version of https://github.com/to-miz/sse_mathfun_extension for VCV Rack.
 - 
 - The following changes were made.
 - - Make functions inline.
 - - Change global constants to function-scope.
 - 
 - This derived source file is released under the zlib license.
 - */
 - 
 - /*
 - sse_mathfun_extension.h - zlib license
 - Written by Tolga Mizrak 2016
 - Extension of sse_mathfun.h, which is written by Julien Pommier
 - 
 - Based on the corresponding algorithms of the cephes math library
 - 
 - This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want
 - to maintain a modified version of the original library. This way switching to a newer version of the
 - library won't be a hassle.
 - 
 - Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet.
 - As such, currently you need to #define USE_SSE2 to compile.
 - 
 - With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain
 - borders (-PI/2, PI/2 for tan and 0, 1 for cot). See the results on the deviations for these
 - functions on my machine:
 - checking tan on [-0.25*Pi, 0.25*Pi]
 - max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x):
 - 5.96046e-08
 -    ->> precision OK for the tan_ps <<-
 - 
 - checking tan on [-0.49*Pi, 0.49*Pi]
 - max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x):
 - 9.53674e-07
 -    ->> precision OK for the tan_ps <<-
 - 
 - checking cot on [0.2*Pi, 0.7*Pi]
 - max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x):
 - 1.19209e-07
 -    ->> precision OK for the cot_ps <<-
 - 
 - checking cot on [0.01*Pi, 0.99*Pi]
 - max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x):
 - 9.53674e-07
 -    ->> precision OK for the cot_ps <<-
 - 
 - With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and
 - atan2_ps max deviation is < 2.5e-7
 - */
 - 
 - /* Copyright (C) 2016 Tolga Mizrak
 - 
 -   This software is provided 'as-is', without any express or implied
 -   warranty.  In no event will the authors be held liable for any damages
 -   arising from the use of this software.
 - 
 -   Permission is granted to anyone to use this software for any purpose,
 -   including commercial applications, and to alter it and redistribute it
 -   freely, subject to the following restrictions:
 - 
 -   1. The origin of this software must not be misrepresented; you must not
 - 	 claim that you wrote the original software. If you use this software
 - 	 in a product, an acknowledgment in the product documentation would be
 - 	 appreciated but is not required.
 -   2. Altered source versions must be plainly marked as such, and must not be
 - 	 misrepresented as being the original software.
 -   3. This notice may not be removed or altered from any source distribution.
 - 
 -   (this is the zlib license)
 - */
 - #pragma once
 - #include "sse_mathfun.h"
 - 
 - 
 - inline __m128 sse_mathfun_tancot_ps(__m128 x, int cotFlag) {
 - 	__m128 p0 = _mm_set_ps1(9.38540185543E-3);
 - 	__m128 p1 = _mm_set_ps1(3.11992232697E-3);
 - 	__m128 p2 = _mm_set_ps1(2.44301354525E-2);
 - 	__m128 p3 = _mm_set_ps1(5.34112807005E-2);
 - 	__m128 p4 = _mm_set_ps1(1.33387994085E-1);
 - 	__m128 p5 = _mm_set_ps1(3.33331568548E-1);
 - 
 - 	__m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;
 - 
 - 	__m128i emm2;
 - 	sign_bit = x;
 - 	/* take the absolute value */
 - 	__m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 - 	__m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
 - 	x = _mm_and_ps(x, inv_sign_mask);
 - 	/* extract the sign bit (upper one) */
 - 	sign_bit = _mm_and_ps(sign_bit, sign_mask);
 - 
 - 	/* scale by 4/Pi */
 - 	__m128 cephes_FOPI = _mm_set_ps1(1.27323954473516);
 - 	y = _mm_mul_ps(x, cephes_FOPI);
 - 
 - 	/* store the integer part of y in mm0 */
 - 	emm2 = _mm_cvttps_epi32(y);
 - 	/* j=(j+1) & (~1) (see the cephes sources) */
 - 	emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1));
 - 	emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1));
 - 	y = _mm_cvtepi32_ps(emm2);
 - 
 - 	emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2));
 - 	emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
 - 
 - 	__m128 poly_mask = _mm_castsi128_ps(emm2);
 - 	/* The magic pass: "Extended precision modular arithmetic"
 - 	   x = ((x - y * DP1) - y * DP2) - y * DP3; */
 - 	__m128 minus_cephes_DP1 = _mm_set_ps1(-0.78515625);
 - 	__m128 minus_cephes_DP2 = _mm_set_ps1(-2.4187564849853515625e-4);
 - 	__m128 minus_cephes_DP3 = _mm_set_ps1(-3.77489497744594108e-8);
 - 	xmm1 = minus_cephes_DP1;
 - 	xmm2 = minus_cephes_DP2;
 - 	xmm3 = minus_cephes_DP3;
 - 	xmm1 = _mm_mul_ps(y, xmm1);
 - 	xmm2 = _mm_mul_ps(y, xmm2);
 - 	xmm3 = _mm_mul_ps(y, xmm3);
 - 	__m128 z = _mm_add_ps(x, xmm1);
 - 	z = _mm_add_ps(z, xmm2);
 - 	z = _mm_add_ps(z, xmm3);
 - 
 - 	__m128 zz = _mm_mul_ps(z, z);
 - 
 - 	y = p0;
 - 	y = _mm_mul_ps(y, zz);
 - 	y = _mm_add_ps(y, p1);
 - 	y = _mm_mul_ps(y, zz);
 - 	y = _mm_add_ps(y, p2);
 - 	y = _mm_mul_ps(y, zz);
 - 	y = _mm_add_ps(y, p3);
 - 	y = _mm_mul_ps(y, zz);
 - 	y = _mm_add_ps(y, p4);
 - 	y = _mm_mul_ps(y, zz);
 - 	y = _mm_add_ps(y, p5);
 - 	y = _mm_mul_ps(y, zz);
 - 	y = _mm_mul_ps(y, z);
 - 	y = _mm_add_ps(y, z);
 - 
 - 	__m128 y2;
 - 	if (cotFlag) {
 - 		y2 = _mm_xor_ps(y, sign_mask);
 - 		/* y = _mm_rcp_ps(y); */
 - 		/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
 - 		y = _mm_div_ps(_mm_set_ps1(1.f), y);
 - 	}
 - 	else {
 - 		/* y2 = _mm_rcp_ps(y); */
 - 		/* using _mm_rcp_ps here loses on way too much precision, better to do a div */
 - 		y2 = _mm_div_ps(_mm_set_ps1(1.f), y);
 - 		y2 = _mm_xor_ps(y2, sign_mask);
 - 	}
 - 
 - 	/* select the correct result from the two polynoms */
 - 	xmm3 = poly_mask;
 - 	y = _mm_and_ps(xmm3, y);
 - 	y2 = _mm_andnot_ps(xmm3, y2);
 - 	y = _mm_or_ps(y, y2);
 - 
 - 	/* update the sign */
 - 	y = _mm_xor_ps(y, sign_bit);
 - 
 - 	return y;
 - }
 - 
 - inline __m128 sse_mathfun_tan_ps(__m128 x) {
 - 	return sse_mathfun_tancot_ps(x, 0);
 - }
 - 
 - inline __m128 sse_mathfun_cot_ps(__m128 x) {
 - 	return sse_mathfun_tancot_ps(x, 1);
 - }
 - 
 - 
 - inline __m128 sse_mathfun_atan_ps(__m128 x) {
 - 	__m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 - 	__m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));
 - 
 - 	__m128 atanrange_hi = _mm_set_ps1(2.414213562373095);
 - 	__m128 atanrange_lo = _mm_set_ps1(0.4142135623730950);
 - 	__m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
 - 	__m128 cephes_PIO4F = _mm_set_ps1(0.7853981633974483096);
 - 
 - 	__m128 atancof_p0 = _mm_set_ps1(8.05374449538e-2);
 - 	__m128 atancof_p1 = _mm_set_ps1(1.38776856032E-1);
 - 	__m128 atancof_p2 = _mm_set_ps1(1.99777106478E-1);
 - 	__m128 atancof_p3 = _mm_set_ps1(3.33329491539E-1);
 - 
 - 	__m128 sign_bit, y;
 - 
 - 	sign_bit = x;
 - 	/* take the absolute value */
 - 	x = _mm_and_ps(x, inv_sign_mask);
 - 	/* extract the sign bit (upper one) */
 - 	sign_bit = _mm_and_ps(sign_bit, sign_mask);
 - 
 - 	/* range reduction, init x and y depending on range */
 - 	/* x > 2.414213562373095 */
 - 	__m128 cmp0 = _mm_cmpgt_ps(x, atanrange_hi);
 - 	/* x > 0.4142135623730950 */
 - 	__m128 cmp1 = _mm_cmpgt_ps(x, atanrange_lo);
 - 
 - 	/* x > 0.4142135623730950 && !(x > 2.414213562373095) */
 - 	__m128 cmp2 = _mm_andnot_ps(cmp0, cmp1);
 - 
 - 	/* -(1.0/x) */
 - 	__m128 y0 = _mm_and_ps(cmp0, cephes_PIO2F);
 - 	__m128 x0 = _mm_div_ps(_mm_set_ps1(1.f), x);
 - 	x0 = _mm_xor_ps(x0, sign_mask);
 - 
 - 	__m128 y1 = _mm_and_ps(cmp2, cephes_PIO4F);
 - 	/* (x-1.0)/(x+1.0) */
 - 	__m128 x1_o = _mm_sub_ps(x, _mm_set_ps1(1.f));
 - 	__m128 x1_u = _mm_add_ps(x, _mm_set_ps1(1.f));
 - 	__m128 x1 = _mm_div_ps(x1_o, x1_u);
 - 
 - 	__m128 x2 = _mm_and_ps(cmp2, x1);
 - 	x0 = _mm_and_ps(cmp0, x0);
 - 	x2 = _mm_or_ps(x2, x0);
 - 	cmp1 = _mm_or_ps(cmp0, cmp2);
 - 	x2 = _mm_and_ps(cmp1, x2);
 - 	x = _mm_andnot_ps(cmp1, x);
 - 	x = _mm_or_ps(x2, x);
 - 
 - 	y = _mm_or_ps(y0, y1);
 - 
 - 	__m128 zz = _mm_mul_ps(x, x);
 - 	__m128 acc = atancof_p0;
 - 	acc = _mm_mul_ps(acc, zz);
 - 	acc = _mm_sub_ps(acc, atancof_p1);
 - 	acc = _mm_mul_ps(acc, zz);
 - 	acc = _mm_add_ps(acc, atancof_p2);
 - 	acc = _mm_mul_ps(acc, zz);
 - 	acc = _mm_sub_ps(acc, atancof_p3);
 - 	acc = _mm_mul_ps(acc, zz);
 - 	acc = _mm_mul_ps(acc, x);
 - 	acc = _mm_add_ps(acc, x);
 - 	y = _mm_add_ps(y, acc);
 - 
 - 	/* update the sign */
 - 	y = _mm_xor_ps(y, sign_bit);
 - 
 - 	return y;
 - }
 - 
 - inline __m128 sse_mathfun_atan2_ps(__m128 y, __m128 x) {
 - 	__m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
 - 	__m128 x_eq_0 = _mm_cmpeq_ps(x, _mm_setzero_ps());
 - 	__m128 x_gt_0 = _mm_cmpgt_ps(x, _mm_setzero_ps());
 - 	__m128 x_le_0 = _mm_cmple_ps(x, _mm_setzero_ps());
 - 	__m128 y_eq_0 = _mm_cmpeq_ps(y, _mm_setzero_ps());
 - 	__m128 x_lt_0 = _mm_cmplt_ps(x, _mm_setzero_ps());
 - 	__m128 y_lt_0 = _mm_cmplt_ps(y, _mm_setzero_ps());
 - 	__m128 cephes_PIF = _mm_set_ps1(3.141592653589793238);
 - 	__m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192);
 - 
 - 	__m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0);
 - 	__m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0);
 - 	zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case);
 - 
 - 	__m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0);
 - 	__m128 pio2_mask_sign = _mm_and_ps(y_lt_0, sign_mask);
 - 	__m128 pio2_result = cephes_PIO2F;
 - 	pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign);
 - 	pio2_result = _mm_and_ps(pio2_mask, pio2_result);
 - 
 - 	__m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0);
 - 	__m128 pi = cephes_PIF;
 - 	__m128 pi_result = _mm_and_ps(pi_mask, pi);
 - 
 - 	__m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0);
 - 	swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, sign_mask);
 - 
 - 	__m128 offset0 = _mm_setzero_ps();
 - 	__m128 offset1 = cephes_PIF;
 - 	offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset);
 - 
 - 	__m128 offset = _mm_andnot_ps(x_lt_0, offset0);
 - 	offset = _mm_and_ps(x_lt_0, offset1);
 - 
 - 	__m128 arg = _mm_div_ps(y, x);
 - 	__m128 atan_result = sse_mathfun_atan_ps(arg);
 - 	atan_result = _mm_add_ps(atan_result, offset);
 - 
 - 	/* select between zero_result, pio2_result and atan_result */
 - 
 - 	__m128 result = _mm_andnot_ps(zero_mask, pio2_result);
 - 	atan_result = _mm_andnot_ps(pio2_mask, atan_result);
 - 	atan_result = _mm_andnot_ps(pio2_mask, atan_result);
 - 	result = _mm_or_ps(result, atan_result);
 - 	result = _mm_or_ps(result, pi_result);
 - 
 - 	return result;
 - }
 
 
  |