Browse Source

arm64: Enable simd implementations (#398)

GCC defines __ARM_NEON__ for asimd on ARMv7 and __ARM_NEON for simd on
ARMv8. Therefore, also check for __ARM_NEON so that the existing NEON code
paths are compiled on ARMv8 (arm64) as well.

Signed-off-by: Timo Wischer <twischer@de.adit-jv.com>
tags/v1.9.13
twischer-adit Filipe Coelho <falktx@falktx.com> 6 years ago
parent
commit
04bdd8a635
3 changed files with 22 additions and 18 deletions
  1. +3
    -3
      common/JackAudioPort.cpp
  2. +14
    -14
      common/memops.c
  3. +5
    -1
      example-clients/simdtests.cpp

+ 3
- 3
common/JackAudioPort.cpp View File

@@ -28,7 +28,7 @@ Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
#include <Accelerate/Accelerate.h> #include <Accelerate/Accelerate.h>
#elif defined (__SSE__) && !defined (__sun__) #elif defined (__SSE__) && !defined (__sun__)
#include <xmmintrin.h> #include <xmmintrin.h>
#elif defined (__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif


@@ -56,7 +56,7 @@ static inline void MixAudioBuffer(jack_default_audio_sample_t* mixbuffer, jack_d
mixbuffer += 4; mixbuffer += 4;
buffer += 4; buffer += 4;
frames_group--; frames_group--;
#elif defined (__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
float32x4_t vec = vaddq_f32(vld1q_f32(mixbuffer), vld1q_f32(buffer)); float32x4_t vec = vaddq_f32(vld1q_f32(mixbuffer), vld1q_f32(buffer));
vst1q_f32(mixbuffer, vec); vst1q_f32(mixbuffer, vec);


@@ -125,7 +125,7 @@ static void AudioBufferMixdown(void* mixbuffer, void** src_buffers, int src_coun
for (jack_nframes_t i = 0; i != remaining_frames; ++i) { for (jack_nframes_t i = 0; i != remaining_frames; ++i) {
target[i] = source[i]; target[i] = source[i];
} }
#elif defined (__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
jack_nframes_t frames_group = nframes / 4; jack_nframes_t frames_group = nframes / 4;
jack_nframes_t remaining_frames = nframes % 4; jack_nframes_t remaining_frames = nframes % 4;




+ 14
- 14
common/memops.c View File

@@ -42,7 +42,7 @@
#endif #endif
#endif #endif


#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif


@@ -198,7 +198,7 @@ static inline __m128i float_24_sse(__m128 s)
#endif #endif




#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)


static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max) static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{ {
@@ -281,7 +281,7 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign


void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{ {
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
nsamples = nsamples & 3; nsamples = nsamples & 3;


@@ -380,7 +380,7 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne
src++; src++;
} }


#elif defined(__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
nsamples = nsamples & 3; nsamples = nsamples & 3;


@@ -417,7 +417,7 @@ void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigne


void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
{ {
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING); float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
while (unrolled--) { while (unrolled--) {
@@ -506,7 +506,7 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne
dst += 4; dst += 4;
} }
nsamples = nsamples & 3; nsamples = nsamples & 3;
#elif defined(__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING); float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
while (unrolled--) { while (unrolled--) {
@@ -548,7 +548,7 @@ void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigne


void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{ {
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
while (unrolled--) { while (unrolled--) {
int i; int i;
@@ -619,7 +619,7 @@ void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned l
nsamples -= 4; nsamples -= 4;
src += 4; src += 4;
} }
#elif defined(__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
while (unrolled--) { while (unrolled--) {
int i; int i;
@@ -655,7 +655,7 @@ void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned
{ {
const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING; const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;


#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
// we shift 8 to the right by dividing by 256.0 -> no sign extra handling // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
const float32x4_t vscaling = vdupq_n_f32(scaling/256.0); const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
int32_t x[4]; int32_t x[4];
@@ -740,7 +740,7 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l
dst += 4; dst += 4;
nsamples -= 4; nsamples -= 4;
} }
#elif defined(__ARM_NEON__)
#elif defined (__ARM_NEON__) || defined (__ARM_NEON)
// we shift 8 to the right by dividing by 256.0 -> no sign extra handling // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
const float32x4_t vscaling = vdupq_n_f32(scaling/256.0); const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
int32_t x[4]; int32_t x[4];
@@ -787,7 +787,7 @@ void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned l


void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{ {
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
nsamples = nsamples & 3; nsamples = nsamples & 3;


@@ -838,7 +838,7 @@ void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned


void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
{ {
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
nsamples = nsamples & 3; nsamples = nsamples & 3;


@@ -1017,7 +1017,7 @@ void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned
{ {
short z; short z;
const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING; const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
const float32x4_t vscaling = vdupq_n_f32(scaling); const float32x4_t vscaling = vdupq_n_f32(scaling);
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
while (unrolled--) { while (unrolled--) {
@@ -1069,7 +1069,7 @@ void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned l
{ {
/* ALERT: signed sign-extension portability !!! */ /* ALERT: signed sign-extension portability !!! */
const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING; const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
const float32x4_t vscaling = vdupq_n_f32(scaling); const float32x4_t vscaling = vdupq_n_f32(scaling);
unsigned long unrolled = nsamples / 4; unsigned long unrolled = nsamples / 4;
while (unrolled--) { while (unrolled--) {


+ 5
- 1
example-clients/simdtests.cpp View File

@@ -40,7 +40,7 @@
#endif #endif
#endif #endif


#ifdef __ARM_NEON__
#if defined (__ARM_NEON__) || defined (__ARM_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif


@@ -63,6 +63,10 @@ namespace origerated {
#undef __ARM_NEON__ #undef __ARM_NEON__
#endif #endif


#ifdef __ARM_NEON
#undef __ARM_NEON
#endif

#include "../common/memops.c" #include "../common/memops.c"
} }




Loading…
Cancel
Save