From cdc0bde8b72baa4740ff16ea317033a9f4df9953 Mon Sep 17 00:00:00 2001 From: Andrew Belt Date: Mon, 5 Aug 2019 07:44:08 -0400 Subject: [PATCH] Port sse_mathfun_extension to simd folder. --- include/simd/functions.hpp | 22 +- include/simd/sse_mathfun.h | 6 +- include/simd/sse_mathfun_extension.h | 295 +++++++++++++++++++++++++++ 3 files changed, 317 insertions(+), 6 deletions(-) create mode 100644 include/simd/sse_mathfun_extension.h diff --git a/include/simd/functions.hpp b/include/simd/functions.hpp index c68830d0..63c9e298 100644 --- a/include/simd/functions.hpp +++ b/include/simd/functions.hpp @@ -1,8 +1,8 @@ #pragma once #include -#include +#include +#include #include -#include namespace rack { @@ -75,6 +75,24 @@ inline float_4 cos(float_4 x) { return float_4(sse_mathfun_cos_ps(x.v)); } +using std::tan; + +inline float_4 tan(float_4 x) { + return float_4(sse_mathfun_tan_ps(x.v)); +} + +using std::atan; + +inline float_4 atan(float_4 x) { + return float_4(sse_mathfun_atan_ps(x.v)); +} + +using std::atan2; + +inline float_4 atan2(float_4 x, float_4 y) { + return float_4(sse_mathfun_atan2_ps(x.v, y.v)); +} + using std::floor; inline float_4 floor(float_4 a) { diff --git a/include/simd/sse_mathfun.h b/include/simd/sse_mathfun.h index 452229e1..6276d489 100644 --- a/include/simd/sse_mathfun.h +++ b/include/simd/sse_mathfun.h @@ -1,4 +1,3 @@ -#pragma once /* Modified version of http://gruntthepeon.free.fr/ssemath/ for VCV Rack. The following changes were made. @@ -6,7 +5,7 @@ The following changes were made. - Make all functions inline since this is a header file. - Remove non-SSE2 code, since Rack assumes SSE2 CPUs. - Move `const static` variables to function variables for clarity. See https://stackoverflow.com/a/52139901/272642 for explanation of why the performance is not worse. -- Change header file to since we're using SSE2 intrinsics. +- Change header file to since we're using SSE2 intrinsics. - Prefix functions with `sse_mathfun_`. - Add floor, ceil, fmod. @@ -43,8 +42,7 @@ This derived source file is released under the zlib license. (this is the zlib license) */ - - +#pragma once #include diff --git a/include/simd/sse_mathfun_extension.h b/include/simd/sse_mathfun_extension.h new file mode 100644 index 00000000..84305261 --- /dev/null +++ b/include/simd/sse_mathfun_extension.h @@ -0,0 +1,295 @@ +/* Modified version of https://github.com/to-miz/sse_mathfun_extension for VCV Rack. + +The following changes were made. +- Make functions inline. +- Change global constants to function-scope. + +This derived source file is released under the zlib license. +*/ + +/* +sse_mathfun_extension.h - zlib license +Written by Tolga Mizrak 2016 +Extension of sse_mathfun.h, which is written by Julien Pommier + +Based on the corresponding algorithms of the cephes math library + +This is written as an extension to sse_mathfun.h instead of modifying it, just because I didn't want +to maintain a modified version of the original library. This way switching to a newer version of the +library won't be a hassle. + +Note that non SSE2 implementations of tan_ps, atan_ps, cot_ps and atan2_ps are not implemented yet. +As such, currently you need to #define USE_SSE2 to compile. + +With tan_ps, cot_ps you get good precision on input ranges that are further away from the domain +borders (-PI/2, PI/2 for tan and 0, 1 for cot). See the results on the deviations for these +functions on my machine: +checking tan on [-0.25*Pi, 0.25*Pi] +max deviation from tanf(x): 1.19209e-07 at 0.250000006957*Pi, max deviation from cephes_tan(x): +5.96046e-08 + ->> precision OK for the tan_ps <<- + +checking tan on [-0.49*Pi, 0.49*Pi] +max deviation from tanf(x): 3.8147e-06 at -0.490000009841*Pi, max deviation from cephes_tan(x): +9.53674e-07 + ->> precision OK for the tan_ps <<- + +checking cot on [0.2*Pi, 0.7*Pi] +max deviation from cotf(x): 1.19209e-07 at 0.204303119606*Pi, max deviation from cephes_cot(x): +1.19209e-07 + ->> precision OK for the cot_ps <<- + +checking cot on [0.01*Pi, 0.99*Pi] +max deviation from cotf(x): 3.8147e-06 at 0.987876517942*Pi, max deviation from cephes_cot(x): +9.53674e-07 + ->> precision OK for the cot_ps <<- + +With atan_ps and atan2_ps you get pretty good precision, atan_ps max deviation is < 2e-7 and +atan2_ps max deviation is < 2.5e-7 +*/ + +/* Copyright (C) 2016 Tolga Mizrak + + This software is provided 'as-is', without any express or implied + warranty. In no event will the authors be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this software must not be misrepresented; you must not + claim that you wrote the original software. If you use this software + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + 3. This notice may not be removed or altered from any source distribution. + + (this is the zlib license) +*/ +#pragma once +#include "sse_mathfun.h" + + +inline __m128 sse_mathfun_tancot_ps(__m128 x, int cotFlag) { + __m128 p0 = _mm_set_ps1(9.38540185543E-3); + __m128 p1 = _mm_set_ps1(3.11992232697E-3); + __m128 p2 = _mm_set_ps1(2.44301354525E-2); + __m128 p3 = _mm_set_ps1(5.34112807005E-2); + __m128 p4 = _mm_set_ps1(1.33387994085E-1); + __m128 p5 = _mm_set_ps1(3.33331568548E-1); + + __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y; + + __m128i emm2; + sign_bit = x; + /* take the absolute value */ + __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000)); + x = _mm_and_ps(x, inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, sign_mask); + + /* scale by 4/Pi */ + __m128 cephes_FOPI = _mm_set_ps1(1.27323954473516); + y = _mm_mul_ps(x, cephes_FOPI); + + /* store the integer part of y in mm0 */ + emm2 = _mm_cvttps_epi32(y); + /* j=(j+1) & (~1) (see the cephes sources) */ + emm2 = _mm_add_epi32(emm2, _mm_set1_epi32(1)); + emm2 = _mm_and_si128(emm2, _mm_set1_epi32(~1)); + y = _mm_cvtepi32_ps(emm2); + + emm2 = _mm_and_si128(emm2, _mm_set1_epi32(2)); + emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128()); + + __m128 poly_mask = _mm_castsi128_ps(emm2); + /* The magic pass: "Extended precision modular arithmetic" + x = ((x - y * DP1) - y * DP2) - y * DP3; */ + __m128 minus_cephes_DP1 = _mm_set_ps1(-0.78515625); + __m128 minus_cephes_DP2 = _mm_set_ps1(-2.4187564849853515625e-4); + __m128 minus_cephes_DP3 = _mm_set_ps1(-3.77489497744594108e-8); + xmm1 = minus_cephes_DP1; + xmm2 = minus_cephes_DP2; + xmm3 = minus_cephes_DP3; + xmm1 = _mm_mul_ps(y, xmm1); + xmm2 = _mm_mul_ps(y, xmm2); + xmm3 = _mm_mul_ps(y, xmm3); + __m128 z = _mm_add_ps(x, xmm1); + z = _mm_add_ps(z, xmm2); + z = _mm_add_ps(z, xmm3); + + __m128 zz = _mm_mul_ps(z, z); + + y = p0; + y = _mm_mul_ps(y, zz); + y = _mm_add_ps(y, p1); + y = _mm_mul_ps(y, zz); + y = _mm_add_ps(y, p2); + y = _mm_mul_ps(y, zz); + y = _mm_add_ps(y, p3); + y = _mm_mul_ps(y, zz); + y = _mm_add_ps(y, p4); + y = _mm_mul_ps(y, zz); + y = _mm_add_ps(y, p5); + y = _mm_mul_ps(y, zz); + y = _mm_mul_ps(y, z); + y = _mm_add_ps(y, z); + + __m128 y2; + if (cotFlag) { + y2 = _mm_xor_ps(y, sign_mask); + /* y = _mm_rcp_ps(y); */ + /* using _mm_rcp_ps here loses on way too much precision, better to do a div */ + y = _mm_div_ps(_mm_set_ps1(1.f), y); + } + else { + /* y2 = _mm_rcp_ps(y); */ + /* using _mm_rcp_ps here loses on way too much precision, better to do a div */ + y2 = _mm_div_ps(_mm_set_ps1(1.f), y); + y2 = _mm_xor_ps(y2, sign_mask); + } + + /* select the correct result from the two polynoms */ + xmm3 = poly_mask; + y = _mm_and_ps(xmm3, y); + y2 = _mm_andnot_ps(xmm3, y2); + y = _mm_or_ps(y, y2); + + /* update the sign */ + y = _mm_xor_ps(y, sign_bit); + + return y; +} + +inline __m128 sse_mathfun_tan_ps(__m128 x) { + return sse_mathfun_tancot_ps(x, 0); +} + +inline __m128 sse_mathfun_cot_ps(__m128 x) { + return sse_mathfun_tancot_ps(x, 1); +} + + +inline __m128 sse_mathfun_atan_ps(__m128 x) { + __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + __m128 inv_sign_mask = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000)); + + __m128 atanrange_hi = _mm_set_ps1(2.414213562373095); + __m128 atanrange_lo = _mm_set_ps1(0.4142135623730950); + __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192); + __m128 cephes_PIO4F = _mm_set_ps1(0.7853981633974483096); + + __m128 atancof_p0 = _mm_set_ps1(8.05374449538e-2); + __m128 atancof_p1 = _mm_set_ps1(1.38776856032E-1); + __m128 atancof_p2 = _mm_set_ps1(1.99777106478E-1); + __m128 atancof_p3 = _mm_set_ps1(3.33329491539E-1); + + __m128 sign_bit, y; + + sign_bit = x; + /* take the absolute value */ + x = _mm_and_ps(x, inv_sign_mask); + /* extract the sign bit (upper one) */ + sign_bit = _mm_and_ps(sign_bit, sign_mask); + + /* range reduction, init x and y depending on range */ + /* x > 2.414213562373095 */ + __m128 cmp0 = _mm_cmpgt_ps(x, atanrange_hi); + /* x > 0.4142135623730950 */ + __m128 cmp1 = _mm_cmpgt_ps(x, atanrange_lo); + + /* x > 0.4142135623730950 && !(x > 2.414213562373095) */ + __m128 cmp2 = _mm_andnot_ps(cmp0, cmp1); + + /* -(1.0/x) */ + __m128 y0 = _mm_and_ps(cmp0, cephes_PIO2F); + __m128 x0 = _mm_div_ps(_mm_set_ps1(1.f), x); + x0 = _mm_xor_ps(x0, sign_mask); + + __m128 y1 = _mm_and_ps(cmp2, cephes_PIO4F); + /* (x-1.0)/(x+1.0) */ + __m128 x1_o = _mm_sub_ps(x, _mm_set_ps1(1.f)); + __m128 x1_u = _mm_add_ps(x, _mm_set_ps1(1.f)); + __m128 x1 = _mm_div_ps(x1_o, x1_u); + + __m128 x2 = _mm_and_ps(cmp2, x1); + x0 = _mm_and_ps(cmp0, x0); + x2 = _mm_or_ps(x2, x0); + cmp1 = _mm_or_ps(cmp0, cmp2); + x2 = _mm_and_ps(cmp1, x2); + x = _mm_andnot_ps(cmp1, x); + x = _mm_or_ps(x2, x); + + y = _mm_or_ps(y0, y1); + + __m128 zz = _mm_mul_ps(x, x); + __m128 acc = atancof_p0; + acc = _mm_mul_ps(acc, zz); + acc = _mm_sub_ps(acc, atancof_p1); + acc = _mm_mul_ps(acc, zz); + acc = _mm_add_ps(acc, atancof_p2); + acc = _mm_mul_ps(acc, zz); + acc = _mm_sub_ps(acc, atancof_p3); + acc = _mm_mul_ps(acc, zz); + acc = _mm_mul_ps(acc, x); + acc = _mm_add_ps(acc, x); + y = _mm_add_ps(y, acc); + + /* update the sign */ + y = _mm_xor_ps(y, sign_bit); + + return y; +} + +inline __m128 sse_mathfun_atan2_ps(__m128 y, __m128 x) { + __m128 sign_mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); + __m128 x_eq_0 = _mm_cmpeq_ps(x, _mm_setzero_ps()); + __m128 x_gt_0 = _mm_cmpgt_ps(x, _mm_setzero_ps()); + __m128 x_le_0 = _mm_cmple_ps(x, _mm_setzero_ps()); + __m128 y_eq_0 = _mm_cmpeq_ps(y, _mm_setzero_ps()); + __m128 x_lt_0 = _mm_cmplt_ps(x, _mm_setzero_ps()); + __m128 y_lt_0 = _mm_cmplt_ps(y, _mm_setzero_ps()); + __m128 cephes_PIF = _mm_set_ps1(3.141592653589793238); + __m128 cephes_PIO2F = _mm_set_ps1(1.5707963267948966192); + + __m128 zero_mask = _mm_and_ps(x_eq_0, y_eq_0); + __m128 zero_mask_other_case = _mm_and_ps(y_eq_0, x_gt_0); + zero_mask = _mm_or_ps(zero_mask, zero_mask_other_case); + + __m128 pio2_mask = _mm_andnot_ps(y_eq_0, x_eq_0); + __m128 pio2_mask_sign = _mm_and_ps(y_lt_0, sign_mask); + __m128 pio2_result = cephes_PIO2F; + pio2_result = _mm_xor_ps(pio2_result, pio2_mask_sign); + pio2_result = _mm_and_ps(pio2_mask, pio2_result); + + __m128 pi_mask = _mm_and_ps(y_eq_0, x_le_0); + __m128 pi = cephes_PIF; + __m128 pi_result = _mm_and_ps(pi_mask, pi); + + __m128 swap_sign_mask_offset = _mm_and_ps(x_lt_0, y_lt_0); + swap_sign_mask_offset = _mm_and_ps(swap_sign_mask_offset, sign_mask); + + __m128 offset0 = _mm_setzero_ps(); + __m128 offset1 = cephes_PIF; + offset1 = _mm_xor_ps(offset1, swap_sign_mask_offset); + + __m128 offset = _mm_andnot_ps(x_lt_0, offset0); + offset = _mm_and_ps(x_lt_0, offset1); + + __m128 arg = _mm_div_ps(y, x); + __m128 atan_result = sse_mathfun_atan_ps(arg); + atan_result = _mm_add_ps(atan_result, offset); + + /* select between zero_result, pio2_result and atan_result */ + + __m128 result = _mm_andnot_ps(zero_mask, pio2_result); + atan_result = _mm_andnot_ps(pio2_mask, atan_result); + atan_result = _mm_andnot_ps(pio2_mask, atan_result); + result = _mm_or_ps(result, atan_result); + result = _mm_or_ps(result, pi_result); + + return result; +}