- /*
- Copyright (C) 2000 Paul Davis
- This program is free software; you can redistribute it and/or modify
- it under the terms of the GNU General Public License as published by
- the Free Software Foundation; either version 2 of the License, or
- (at your option) any later version.
- This program is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- GNU General Public License for more details.
- You should have received a copy of the GNU General Public License
- along with this program; if not, write to the Free Software
- Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
- #define _ISOC9X_SOURCE 1
- #define _ISOC99_SOURCE 1
- #define __USE_ISOC9X 1
- #define __USE_ISOC99 1
- #include <stdio.h>
- #include <string.h>
- #include <math.h>
- #include <memory.h>
- #include <stdlib.h>
- #include <stdint.h>
- #include <limits.h>
- #ifdef __linux__
- #include <endian.h>
- #endif
- #include "memops.h"
- #if defined (__SSE2__) && !defined (__sun__)
- #include <emmintrin.h>
- #ifdef __SSE4_1__
- #include <smmintrin.h>
- #endif
- #endif
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- #include <arm_neon.h>
- #endif
- /* Notes about these *_SCALING values.
- the MAX_<N>BIT values are floating point. when multiplied by
- a full-scale normalized floating point sample value (-1.0..+1.0)
- they should give the maximum value representable with an integer
- sample type of N bits. Note that this is asymmetric. Sample ranges
- for signed integer, 2's complement values are -(2^(N-1)) to +(2^(N-1)-1)
- Complications
- -------------
- If we use +2^(N-1) for the scaling factors, we run into a problem:
- if we start with a normalized float value of -1.0, scaling
- to 24 bits would give -8388608 (-2^23), which is ideal.
- But with +1.0, we get +8388608, which is technically out of range.
- We never multiply a full range normalized value by this constant,
- but we could multiply it by a positive value that is close enough to +1.0
- to produce a value > +(2^(N-1)-1).
- There is no way around this paradox without wasting CPU cycles to determine
- which scaling factor to use (i.e. determine if it's negative or not,
- use the right factor).
- So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor.
- */
- #define SAMPLE_32BIT_SCALING 2147483647.0
- #define SAMPLE_24BIT_SCALING 8388607.0f
- #define SAMPLE_16BIT_SCALING 32767.0f
- /* these are just values to use if the floating point value was out of range
- advice from Fons Adriaensen: make the limits symmetrical
- */
- #define SAMPLE_32BIT_MAX 2147483647
- #define SAMPLE_32BIT_MIN -2147483647
- #define SAMPLE_32BIT_MAX_D 2147483647.0
- #define SAMPLE_32BIT_MIN_D -2147483647.0
- #define SAMPLE_24BIT_MAX 8388607
- #define SAMPLE_24BIT_MIN -8388607
- #define SAMPLE_24BIT_MAX_F 8388607.0f
- #define SAMPLE_24BIT_MIN_F -8388607.0f
- #define SAMPLE_16BIT_MAX 32767
- #define SAMPLE_16BIT_MIN -32767
- #define SAMPLE_16BIT_MAX_F 32767.0f
- #define SAMPLE_16BIT_MIN_F -32767.0f
- /* these mark the outer edges of the range considered "within" range
- for a floating point sample value. values outside (and on the boundaries)
- of this range will be clipped before conversion; values within this
- range will be scaled to appropriate values for the target sample
- type.
- */
- #define NORMALIZED_FLOAT_MIN -1.0f
- /* define this in case we end up on a platform that is missing
- the real lrintf functions
- */
- #define f_round(f) lrintf(f)
- #define d_round(f) lrint(f)
- #define float_16(s, d)\
- if ((s) <= NORMALIZED_FLOAT_MIN) {\
- (d) = SAMPLE_16BIT_MIN;\
- } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
- (d) = SAMPLE_16BIT_MAX;\
- } else {\
- (d) = f_round ((s) * SAMPLE_16BIT_SCALING);\
- }
- /* call this when "s" has already been scaled (e.g. when dithering)
- */
- #define float_16_scaled(s, d)\
- if ((s) <= SAMPLE_16BIT_MIN_F) {\
- (d) = SAMPLE_16BIT_MIN_F;\
- } else if ((s) >= SAMPLE_16BIT_MAX_F) { \
- (d) = SAMPLE_16BIT_MAX;\
- } else {\
- (d) = f_round ((s));\
- }
- #define float_24u32(s, d) \
- if ((s) <= NORMALIZED_FLOAT_MIN) {\
- (d) = SAMPLE_24BIT_MIN << 8;\
- } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
- (d) = SAMPLE_24BIT_MAX << 8;\
- } else {\
- (d) = f_round ((s) * SAMPLE_24BIT_SCALING) << 8;\
- }
- #define float_24l32(s, d) \
- if ((s) <= NORMALIZED_FLOAT_MIN) {\
- (d) = SAMPLE_24BIT_MIN; \
- } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
- (d) = SAMPLE_24BIT_MAX; \
- } else {\
- (d) = f_round ((s) * SAMPLE_24BIT_SCALING); \
- }
- #define float_32(s, d) \
- do { \
- double clipped = fmin(NORMALIZED_FLOAT_MAX, \
- fmax((double)(s), NORMALIZED_FLOAT_MIN)); \
- double scaled = clipped * SAMPLE_32BIT_MAX_D; \
- (d) = d_round(scaled); \
- } \
- while (0)
- /* call this when "s" has already been scaled (e.g. when dithering)
- */
- #define float_24u32_scaled(s, d)\
- if ((s) <= SAMPLE_24BIT_MIN_F) {\
- (d) = SAMPLE_24BIT_MIN << 8;\
- } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
- (d) = SAMPLE_24BIT_MAX << 8; \
- } else {\
- (d) = f_round ((s)) << 8; \
- }
- #define float_24(s, d) \
- if ((s) <= NORMALIZED_FLOAT_MIN) {\
- (d) = SAMPLE_24BIT_MIN;\
- } else if ((s) >= NORMALIZED_FLOAT_MAX) {\
- (d) = SAMPLE_24BIT_MAX;\
- } else {\
- (d) = f_round ((s) * SAMPLE_24BIT_SCALING);\
- }
- /* call this when "s" has already been scaled (e.g. when dithering)
- */
- #define float_24_scaled(s, d)\
- if ((s) <= SAMPLE_24BIT_MIN_F) {\
- (d) = SAMPLE_24BIT_MIN;\
- } else if ((s) >= SAMPLE_24BIT_MAX_F) { \
- (d) = SAMPLE_24BIT_MAX; \
- } else {\
- (d) = f_round ((s)); \
- }
#if defined (__SSE2__) && !defined (__sun__)

/* Returns a vector of four 1.0f values built from integer ops only:
   all-ones >> 25 leaves 0x7f in each lane, << 23 turns that into
   0x3f800000, the bit pattern of 1.0f.
 */
static inline __m128 gen_one(void)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i all_bits = _mm_cmpeq_epi32(zero, zero);
	const __m128i exponent = _mm_srli_epi32(all_bits, 25);

	return _mm_castsi128_ps(_mm_slli_epi32(exponent, 23));
}

/* Limits each of the four floats in s to the range [min, max]. */
static inline __m128 clip(__m128 s, __m128 min, __m128 max)
{
	const __m128 floored = _mm_max_ps(s, min);

	return _mm_min_ps(max, floored);
}

/* Limits each of the two doubles in s to the range [min, max]. */
static inline __m128d clip_double(__m128d s, __m128d min, __m128d max)
{
	const __m128d floored = _mm_max_pd(s, min);

	return _mm_min_pd(max, floored);
}

/* Converts four normalized floats to 24 bit integer values: clip to
   [-1.0, +1.0], scale by SAMPLE_24BIT_SCALING, then convert to int32
   with _mm_cvtps_epi32.
 */
static inline __m128i float_24_sse(__m128 s)
{
	const __m128 one = gen_one(); /* NORMALIZED_FLOAT_MAX */
	const __m128 minus_one = _mm_sub_ps(_mm_setzero_ps(), one);
	const __m128 factor = _mm_set1_ps(SAMPLE_24BIT_SCALING);
	const __m128 in_range = clip(s, minus_one, one);

	return _mm_cvtps_epi32(_mm_mul_ps(in_range, factor));
}
#endif
#if defined (__ARM_NEON__) || defined (__ARM_NEON)

/* Limits each of the four floats in s to the range [min, max]. */
static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max)
{
	const float32x4_t floored = vmaxq_f32(s, min);

	return vminq_f32(max, floored);
}

/* Converts four normalized floats to 24 bit integer values: clip to
   [NORMALIZED_FLOAT_MIN, NORMALIZED_FLOAT_MAX], scale by
   SAMPLE_24BIT_SCALING, convert with vcvtq_s32_f32.
 */
static inline int32x4_t float_24_neon(float32x4_t s)
{
	const float32x4_t hi = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lo = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
	const float32x4_t factor = vdupq_n_f32(SAMPLE_24BIT_SCALING);
	const float32x4_t in_range = clip(s, lo, hi);

	return vcvtq_s32_f32(vmulq_f32(in_range, factor));
}

/* Converts four normalized floats to 16 bit integer values: clip, scale by
   SAMPLE_16BIT_SCALING, convert to int32 and narrow to int16.
 */
static inline int16x4_t float_16_neon(float32x4_t s)
{
	const float32x4_t hi = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lo = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
	const float32x4_t factor = vdupq_n_f32(SAMPLE_16BIT_SCALING);
	const float32x4_t in_range = clip(s, lo, hi);
	const int32x4_t wide = vcvtq_s32_f32(vmulq_f32(in_range, factor));

	return vmovn_s32(wide);
}
#endif
/* Linear congruential noise generator (from the music-dsp list).
 * Less random than rand(), but good enough and much cheaper.
 * State lives in the file-scope "seed"; every call advances it and
 * returns the new value. Arithmetic is unsigned, so it wraps.
 */
static unsigned int seed = 22222;

static inline unsigned int fast_rand() {
	const unsigned int next = (seed * 196314165) + 907633515;

	seed = next;
	return next;
}
- /* functions for native float sample data */
- void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) {
- while (nsamples--) {
- *dst = *((float *) src);
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) {
- while (nsamples--) {
- *((float *) dst) = *src;
- dst += dst_skip;
- src++;
- }
- }
- /* NOTES on function naming:
- foo_bar_d<TYPE>_s<TYPE>
- the "d<TYPE>" component defines the destination type for the operation
- the "s<TYPE>" component defines the source type for the operation
- TYPE can be one of:
- S - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value
- Ss - like S but reverse endian from the host CPU
- 32 - sample is a signed 32 bit integer value
- 32u24 - sample is a signed 32 bit integer value, but data is in upper 24 bits only
- 32u24s - like 32u24 but reverse endian from the host CPU
- 32l24 - sample is a signed 32 bit integer value, but data is in lower 24 bits only
- 32l24s - like 32l24 but reverse endian from the host CPU
- 24 - sample is a signed 24 bit integer value
- 24s - like 24 but reverse endian from the host CPU
- 16 - sample is a signed 16 bit integer value
- 16s - like 16 but reverse endian from the host CPU
- For obvious reasons, the reverse endian versions only show as source types.
- This covers all known sample formats at 16 bits or larger.
- */
- /* functions for native integer sample data */
- void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- while (nsamples--) {
- int32_t z;
- float_32(*src, z);
- dst[0]=(char)(z>>24);
- dst[1]=(char)(z>>16);
- dst[2]=(char)(z>>8);
- dst[3]=(char)(z);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(z);
- dst[1]=(char)(z>>8);
- dst[2]=(char)(z>>16);
- dst[3]=(char)(z>>24);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- while (nsamples--) {
- float_32(*src, *(int32_t *)dst);
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- float32x4_t samples = vld1q_f32(src);
- int32x4_t converted = float_24_neon(samples);
- int32x4_t shifted = vshlq_n_s32(converted, 8);
- shifted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(shifted)));
- switch(dst_skip) {
- case 4:
- vst1q_s32((int32_t*)dst, shifted);
- break;
- default:
- vst1q_lane_s32((int32_t*)(dst), shifted, 0);
- vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
- vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
- vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
- break;
- }
- dst += 4*dst_skip;
- src+= 4;
- }
- #endif
- int32_t z;
- while (nsamples--) {
- float_24u32 (*src, z);
- dst[0]=(char)(z>>24);
- dst[1]=(char)(z>>16);
- dst[2]=(char)(z>>8);
- dst[3]=(char)(z);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(z);
- dst[1]=(char)(z>>8);
- dst[2]=(char)(z>>16);
- dst[3]=(char)(z>>24);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__SSE2__) && !defined (__sun__)
- __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
- __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
- __m128 factor = int_max;
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- __m128 in = _mm_load_ps(src);
- __m128 scaled = _mm_mul_ps(in, factor);
- __m128 clipped = clip(scaled, int_min, int_max);
- __m128i y = _mm_cvttps_epi32(clipped);
- __m128i shifted = _mm_slli_epi32(y, 8);
- #ifdef __SSE4_1__
- *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
- *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
- *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
- *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
- #else
- __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
- __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
- __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));
- _mm_store_ss((float*)dst, (__m128)shifted);
- _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
- _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
- _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
- #endif
- dst += 4*dst_skip;
- src+= 4;
- }
- while (nsamples--) {
- __m128 in = _mm_load_ss(src);
- __m128 scaled = _mm_mul_ss(in, factor);
- __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));
- int y = _mm_cvttss_si32(clipped);
- *((int *) dst) = y<<8;
- dst += dst_skip;
- src++;
- }
- #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- float32x4_t samples = vld1q_f32(src);
- int32x4_t converted = float_24_neon(samples);
- int32x4_t shifted = vshlq_n_s32(converted, 8);
- switch(dst_skip) {
- case 4:
- vst1q_s32((int32_t*)dst, shifted);
- break;
- default:
- vst1q_lane_s32((int32_t*)(dst), shifted, 0);
- vst1q_lane_s32((int32_t*)(dst+dst_skip), shifted, 1);
- vst1q_lane_s32((int32_t*)(dst+2*dst_skip), shifted, 2);
- vst1q_lane_s32((int32_t*)(dst+3*dst_skip), shifted, 3);
- break;
- }
- dst += 4*dst_skip;
- src+= 4;
- }
- #endif
- #if !defined (__SSE2__)
- while (nsamples--) {
- float_24u32 (*src, *((int32_t*) dst));
- dst += dst_skip;
- src++;
- }
- #endif
- }
- void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- int32x4_t src128;
- switch(src_skip)
- {
- case 4:
- src128 = vld1q_s32((int32_t*)src);
- break;
- case 8:
- src128 = vld2q_s32((int32_t*)src).val[0];
- break;
- default:
- src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
- src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
- src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
- src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
- break;
- }
- src128 = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(src128)));
- int32x4_t shifted = vshrq_n_s32(src128, 8);
- float32x4_t as_float = vcvtq_f32_s32(shifted);
- float32x4_t divided = vmulq_f32(as_float, factor);
- vst1q_f32(dst, divided);
- src += 4*src_skip;
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- /* ALERT: signed sign-extension portability !!! */
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
- while (nsamples--) {
- int x;
- x = (unsigned char)(src[0]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[3]);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- x = (unsigned char)(src[3]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[0]);
- #endif
- *dst = (x >> 8) * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- #if defined (__SSE2__) && !defined (__sun__)
- unsigned long unrolled = nsamples / 4;
- static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
- __m128 factor = _mm_set1_ps(inv_sample_max_24bit);
- while (unrolled--)
- {
- int i1 = *((int *) src);
- src+= src_skip;
- int i2 = *((int *) src);
- src+= src_skip;
- int i3 = *((int *) src);
- src+= src_skip;
- int i4 = *((int *) src);
- src+= src_skip;
- __m128i src = _mm_set_epi32(i4, i3, i2, i1);
- __m128i shifted = _mm_srai_epi32(src, 8);
- __m128 as_float = _mm_cvtepi32_ps(shifted);
- __m128 divided = _mm_mul_ps(as_float, factor);
- _mm_storeu_ps(dst, divided);
- dst += 4;
- }
- nsamples = nsamples & 3;
- #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
- while (unrolled--) {
- int32x4_t src128;
- switch(src_skip) {
- case 4:
- src128 = vld1q_s32((int32_t*)src);
- break;
- case 8:
- src128 = vld2q_s32((int32_t*)src).val[0];
- break;
- default:
- src128 = vld1q_lane_s32((int32_t*)src, src128, 0);
- src128 = vld1q_lane_s32((int32_t*)(src+src_skip), src128, 1);
- src128 = vld1q_lane_s32((int32_t*)(src+2*src_skip), src128, 2);
- src128 = vld1q_lane_s32((int32_t*)(src+3*src_skip), src128, 3);
- break;
- }
- int32x4_t shifted = vshrq_n_s32(src128, 8);
- float32x4_t as_float = vcvtq_f32_s32(shifted);
- float32x4_t divided = vmulq_f32(as_float, factor);
- vst1q_f32(dst, divided);
- src += 4*src_skip;
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- /* ALERT: signed sign-extension portability !!! */
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
- while (nsamples--) {
- *dst = (*((int *) src) >> 8) * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_d32l24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- float32x4_t samples = vld1q_f32(src);
- int32x4_t converted = float_24_neon(samples);
- converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
- switch(dst_skip) {
- case 4:
- vst1q_s32((int32_t*)dst, converted);
- break;
- default:
- vst1q_lane_s32((int32_t*)(dst), converted, 0);
- vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1);
- vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
- vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
- break;
- }
- dst += 4*dst_skip;
- src+= 4;
- }
- #endif
- int32_t z;
- while (nsamples--) {
- float_24l32 (*src, z);
- dst[0]=(char)(z>>24);
- dst[1]=(char)(z>>16);
- dst[2]=(char)(z>>8);
- dst[3]=(char)(z);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(z);
- dst[1]=(char)(z>>8);
- dst[2]=(char)(z>>16);
- dst[3]=(char)(z>>24);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_d32l24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__SSE2__) && !defined (__sun__)
- __m128 int_max = _mm_set1_ps(SAMPLE_24BIT_MAX_F);
- __m128 int_min = _mm_sub_ps(_mm_setzero_ps(), int_max);
- __m128 factor = int_max;
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- __m128 in = _mm_load_ps(src);
- __m128 scaled = _mm_mul_ps(in, factor);
- __m128 clipped = clip(scaled, int_min, int_max);
- __m128i shifted = _mm_cvttps_epi32(clipped);
- #ifdef __SSE4_1__
- *(int32_t*)dst = _mm_extract_epi32(shifted, 0);
- *(int32_t*)(dst+dst_skip) = _mm_extract_epi32(shifted, 1);
- *(int32_t*)(dst+2*dst_skip) = _mm_extract_epi32(shifted, 2);
- *(int32_t*)(dst+3*dst_skip) = _mm_extract_epi32(shifted, 3);
- #else
- __m128i shuffled1 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(0, 3, 2, 1));
- __m128i shuffled2 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(1, 0, 3, 2));
- __m128i shuffled3 = _mm_shuffle_epi32(shifted, _MM_SHUFFLE(2, 1, 0, 3));
- _mm_store_ss((float*)dst, (__m128)shifted);
- _mm_store_ss((float*)(dst+dst_skip), (__m128)shuffled1);
- _mm_store_ss((float*)(dst+2*dst_skip), (__m128)shuffled2);
- _mm_store_ss((float*)(dst+3*dst_skip), (__m128)shuffled3);
- #endif
- dst += 4*dst_skip;
- src+= 4;
- }
- while (nsamples--) {
- __m128 in = _mm_load_ss(src);
- __m128 scaled = _mm_mul_ss(in, factor);
- __m128 clipped = _mm_min_ss(int_max, _mm_max_ss(scaled, int_min));
- int y = _mm_cvttss_si32(clipped);
- *((int *) dst) = y<<8;
- dst += dst_skip;
- src++;
- }
- #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- float32x4_t samples = vld1q_f32(src);
- int32x4_t converted = float_24_neon(samples);
- switch(dst_skip) {
- case 4:
- vst1q_s32((int32_t*)dst, converted);
- break;
- default:
- vst1q_lane_s32((int32_t*)(dst), converted, 0);
- vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1);
- vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2);
- vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3);
- break;
- }
- dst += 4*dst_skip;
- src+= 4;
- }
- #endif
- #if !defined (__SSE2__)
- while (nsamples--) {
- float_24l32 (*src, *((int32_t*) dst));
- dst += dst_skip;
- src++;
- }
- #endif
- }
- void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_32BIT_SCALING;
- while (nsamples--) {
- int32_t x;
- x = (unsigned char)(src[0]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[3]);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- x = (unsigned char)(src[3]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[0]);
- #endif
- double extended = x * scaling;
- *dst = (float)extended;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- uint32x4_t src128;
- switch(src_skip)
- {
- case 4:
- src128 = vld1q_u32((uint32_t*)src);
- break;
- case 8:
- src128 = vld2q_u32((uint32_t*)src).val[0];
- break;
- default:
- src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
- src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
- src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
- src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
- break;
- }
- src128 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(src128)));
- uint32x4_t toupper = vshlq_n_u32(src128, 8);
- int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
- float32x4_t as_float = vcvtq_f32_s32(shifted);
- float32x4_t divided = vmulq_f32(as_float, factor);
- vst1q_f32(dst, divided);
- src += 4*src_skip;
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- /* ALERT: signed sign-extension portability !!! */
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
- while (nsamples--) {
- int32_t x;
- x = (unsigned char)(src[0]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[3]);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- x = (unsigned char)(src[3]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[0]);
- #endif
- *dst = (x >> 0) * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- const double scaling = 1.0 / SAMPLE_32BIT_SCALING;
- while (nsamples--) {
- int32_t val=(*((int32_t*)src));
- double extended = val * scaling;
- *dst = (float)extended;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_s32l24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- #if defined (__SSE2__) && !defined (__sun__)
- unsigned long unrolled = nsamples / 4;
- static float inv_sample_max_24bit = 1.0 / SAMPLE_24BIT_SCALING;
- __m128 factor = _mm_set1_ps(inv_sample_max_24bit);
- while (unrolled--)
- {
- int i1 = *((int *) src);
- src+= src_skip;
- int i2 = *((int *) src);
- src+= src_skip;
- int i3 = *((int *) src);
- src+= src_skip;
- int i4 = *((int *) src);
- src+= src_skip;
- __m128i shifted = _mm_set_epi32(i4, i3, i2, i1);
- __m128 as_float = _mm_cvtepi32_ps(shifted);
- __m128 divided = _mm_mul_ps(as_float, factor);
- _mm_storeu_ps(dst, divided);
- dst += 4;
- }
- nsamples = nsamples & 3;
- #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- float32x4_t factor = vdupq_n_f32(1.0 / SAMPLE_24BIT_SCALING);
- while (unrolled--) {
- uint32x4_t src128;
- switch(src_skip) {
- case 4:
- src128 = vld1q_u32((uint32_t*)src);
- break;
- case 8:
- src128 = vld2q_u32((uint32_t*)src).val[0];
- break;
- default:
- src128 = vld1q_lane_u32((uint32_t*)src, src128, 0);
- src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1);
- src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2);
- src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3);
- break;
- }
- // Sign extension by moving to upper as unsigned, then down
- uint32x4_t toupper = vshlq_n_u32(src128, 8);
- int32x4_t shifted = vshrq_n_s32((int32x4_t)toupper, 8);
- float32x4_t as_float = vcvtq_f32_s32(shifted);
- float32x4_t divided = vmulq_f32(as_float, factor);
- vst1q_f32(dst, divided);
- src += 4*src_skip;
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- /* ALERT: signed sign-extension portability !!! */
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
- while (nsamples--) {
- uint32_t val=(*((uint32_t*)src));
- if (val & 0x800000u) val|=0xFF000000u;
- *dst = (*((int32_t *) &val)) * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_d24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- int i;
- int32_t z[4];
- float32x4_t samples = vld1q_f32(src);
- int32x4_t converted = float_24_neon(samples);
- converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted)));
- vst1q_s32(z, converted);
- for (i = 0; i != 4; ++i) {
- memcpy (dst, ((char*)(z+i))+1, 3);
- dst += dst_skip;
- }
- src += 4;
- }
- nsamples = nsamples & 3;
- #endif
- int32_t z;
- while (nsamples--) {
- float_24 (*src, z);
- dst[0]=(char)(z>>16);
- dst[1]=(char)(z>>8);
- dst[2]=(char)(z);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(z);
- dst[1]=(char)(z>>8);
- dst[2]=(char)(z>>16);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_d24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__SSE2__) && !defined (__sun__)
- while (nsamples >= 4) {
- int i;
- int32_t z[4];
- __m128 samples = _mm_loadu_ps(src);
- __m128i converted = float_24_sse(samples);
- #ifdef __SSE4_1__
- z[0] = _mm_extract_epi32(converted, 0);
- z[1] = _mm_extract_epi32(converted, 1);
- z[2] = _mm_extract_epi32(converted, 2);
- z[3] = _mm_extract_epi32(converted, 3);
- #else
- __m128i shuffled1 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(0, 3, 2, 1));
- __m128i shuffled2 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(1, 0, 3, 2));
- __m128i shuffled3 = _mm_shuffle_epi32(converted, _MM_SHUFFLE(2, 1, 0, 3));
- _mm_store_ss((float*)z, (__m128)converted);
- _mm_store_ss((float*)z+1, (__m128)shuffled1);
- _mm_store_ss((float*)z+2, (__m128)shuffled2);
- _mm_store_ss((float*)z+3, (__m128)shuffled3);
- #endif
- for (i = 0; i != 4; ++i) {
- memcpy (dst, z+i, 3);
- dst += dst_skip;
- }
- nsamples -= 4;
- src += 4;
- }
- #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- int i;
- int32_t z[4];
- float32x4_t samples = vld1q_f32(src);
- int32x4_t converted = float_24_neon(samples);
- vst1q_s32(z, converted);
- for (i = 0; i != 4; ++i) {
- memcpy (dst, z+i, 3);
- dst += dst_skip;
- }
- src += 4;
- }
- nsamples = nsamples & 3;
- #endif
- int32_t z;
- while (nsamples--) {
- float_24 (*src, z);
- memcpy (dst, &z, 3);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- memcpy (dst, (char *)&z + 1, 3);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_dS_s24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_24BIT_SCALING;
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
- const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
- int32_t x[4];
- memset(x, 0, sizeof(x));
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
- // right aligned / inverse sequence below -> *256
- memcpy(((char*)&x[0])+1, src, 3);
- memcpy(((char*)&x[1])+1, src+src_skip, 3);
- memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
- memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
- #else
- memcpy(&x[0], src, 3);
- memcpy(&x[1], src+src_skip, 3);
- memcpy(&x[2], src+2*src_skip, 3);
- memcpy(&x[3], src+3*src_skip, 3);
- #endif
- src += 4 * src_skip;
- int32x4_t source = vld1q_s32(x);
- source = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(source)));
- float32x4_t converted = vcvtq_f32_s32(source);
- float32x4_t scaled = vmulq_f32(converted, vscaling);
- vst1q_f32(dst, scaled);
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- /* ALERT: signed sign-extension portability !!! */
- while (nsamples--) {
- int x;
- x = (unsigned char)(src[0]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[2]);
- /* correct sign bit and the rest of the top byte */
- if (src[0] & 0x80) {
- x |= 0xff << 24;
- }
- #elif __BYTE_ORDER == __BIG_ENDIAN
- x = (unsigned char)(src[2]);
- x <<= 8;
- x |= (unsigned char)(src[1]);
- x <<= 8;
- x |= (unsigned char)(src[0]);
- /* correct sign bit and the rest of the top byte */
- if (src[2] & 0x80) {
- x |= 0xff << 24;
- }
- #endif
- *dst = x * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_s24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- const jack_default_audio_sample_t scaling = 1.f/SAMPLE_24BIT_SCALING;
- #if defined (__SSE2__) && !defined (__sun__)
- const __m128 scaling_block = _mm_set_ps1(scaling);
- while (nsamples >= 4) {
- int x0, x1, x2, x3;
- memcpy((char*)&x0 + 1, src, 3);
- memcpy((char*)&x1 + 1, src+src_skip, 3);
- memcpy((char*)&x2 + 1, src+2*src_skip, 3);
- memcpy((char*)&x3 + 1, src+3*src_skip, 3);
- src += 4 * src_skip;
- const __m128i block_i = _mm_set_epi32(x3, x2, x1, x0);
- const __m128i shifted = _mm_srai_epi32(block_i, 8);
- const __m128 converted = _mm_cvtepi32_ps (shifted);
- const __m128 scaled = _mm_mul_ps(converted, scaling_block);
- _mm_storeu_ps(dst, scaled);
- dst += 4;
- nsamples -= 4;
- }
- #elif defined (__ARM_NEON__) || defined (__ARM_NEON)
- // we shift 8 to the right by dividing by 256.0 -> no sign extra handling
- const float32x4_t vscaling = vdupq_n_f32(scaling/256.0);
- int32_t x[4];
- memset(x, 0, sizeof(x));
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- #if __BYTE_ORDER == __BIG_ENDIAN /* ARM big endian?? */
- // left aligned -> *256
- memcpy(&x[0], src, 3);
- memcpy(&x[1], src+src_skip, 3);
- memcpy(&x[2], src+2*src_skip, 3);
- memcpy(&x[3], src+3*src_skip, 3);
- #else
- memcpy(((char*)&x[0])+1, src, 3);
- memcpy(((char*)&x[1])+1, src+src_skip, 3);
- memcpy(((char*)&x[2])+1, src+2*src_skip, 3);
- memcpy(((char*)&x[3])+1, src+3*src_skip, 3);
- #endif
- src += 4 * src_skip;
- int32x4_t source = vld1q_s32(x);
- float32x4_t converted = vcvtq_f32_s32(source);
- float32x4_t scaled = vmulq_f32(converted, vscaling);
- vst1q_f32(dst, scaled);
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- while (nsamples--) {
- int x;
- memcpy((char*)&x + 1, src, 3);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- memcpy(&x, src, 3);
- #endif
- x >>= 8;
- *dst = x * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- float32x4_t samples = vld1q_f32(src);
- int16x4_t converted = float_16_neon(samples);
- converted = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(converted)));
- switch(dst_skip) {
- case 2:
- vst1_s16((int16_t*)dst, converted);
- break;
- default:
- vst1_lane_s16((int16_t*)(dst), converted, 0);
- vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
- vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
- vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
- break;
- }
- dst += 4*dst_skip;
- src+= 4;
- }
- #endif
- int16_t tmp;
- while (nsamples--) {
- // float_16 (*src, tmp);
- if (*src <= NORMALIZED_FLOAT_MIN) {
- tmp = SAMPLE_16BIT_MIN;
- } else if (*src >= NORMALIZED_FLOAT_MAX) {
- tmp = SAMPLE_16BIT_MAX;
- } else {
- tmp = (int16_t) f_round (*src * SAMPLE_16BIT_SCALING);
- }
- dst[0]=(char)(tmp>>8);
- dst[1]=(char)(tmp);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(tmp);
- dst[1]=(char)(tmp>>8);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- unsigned long unrolled = nsamples / 4;
- nsamples = nsamples & 3;
- while (unrolled--) {
- float32x4_t samples = vld1q_f32(src);
- int16x4_t converted = float_16_neon(samples);
- switch(dst_skip) {
- case 2:
- vst1_s16((int16_t*)dst, converted);
- break;
- default:
- vst1_lane_s16((int16_t*)(dst), converted, 0);
- vst1_lane_s16((int16_t*)(dst+dst_skip), converted, 1);
- vst1_lane_s16((int16_t*)(dst+2*dst_skip), converted, 2);
- vst1_lane_s16((int16_t*)(dst+3*dst_skip), converted, 3);
- break;
- }
- dst += 4*dst_skip;
- src+= 4;
- }
- #endif
- while (nsamples--) {
- float_16 (*src, *((int16_t*) dst));
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_dither_rect_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- jack_default_audio_sample_t val;
- int16_t tmp;
- while (nsamples--) {
- val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float) UINT_MAX - 0.5f;
- float_16_scaled (val, tmp);
- dst[0]=(char)(tmp>>8);
- dst[1]=(char)(tmp);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(tmp);
- dst[1]=(char)(tmp>>8);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_dither_rect_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- jack_default_audio_sample_t val;
- while (nsamples--) {
- val = (*src * SAMPLE_16BIT_SCALING) + fast_rand() / (float)UINT_MAX - 0.5f;
- float_16_scaled (val, *((int16_t*) dst));
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_dither_tri_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- jack_default_audio_sample_t val;
- int16_t tmp;
- while (nsamples--) {
- val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
- float_16_scaled (val, tmp);
- dst[0]=(char)(tmp>>8);
- dst[1]=(char)(tmp);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(tmp);
- dst[1]=(char)(tmp>>8);
- #endif
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_dither_tri_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- jack_default_audio_sample_t val;
- while (nsamples--) {
- val = (*src * SAMPLE_16BIT_SCALING) + ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
- float_16_scaled (val, *((int16_t*) dst));
- dst += dst_skip;
- src++;
- }
- }
- void sample_move_dither_shaped_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- jack_default_audio_sample_t x;
- jack_default_audio_sample_t xe; /* the innput sample - filtered error */
- jack_default_audio_sample_t xp; /* x' */
- float r;
- float rm1 = state->rm1;
- unsigned int idx = state->idx;
- int16_t tmp;
- while (nsamples--) {
- x = *src * SAMPLE_16BIT_SCALING;
- r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
- /* Filter the error with Lipshitz's minimally audible FIR:
- [2.033 -2.165 1.959 -1.590 0.6149] */
- xe = x
- - state->e[idx] * 2.033f
- + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
- - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
- + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
- - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
- xp = xe + r - rm1;
- rm1 = r;
- float_16_scaled (xp, tmp);
- /* Intrinsic z^-1 delay */
- idx = (idx + 1) & DITHER_BUF_MASK;
- state->e[idx] = xp - xe;
- dst[0]=(char)(tmp>>8);
- dst[1]=(char)(tmp);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- dst[0]=(char)(tmp);
- dst[1]=(char)(tmp>>8);
- #endif
- dst += dst_skip;
- src++;
- }
- state->rm1 = rm1;
- state->idx = idx;
- }
- void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state)
- {
- jack_default_audio_sample_t x;
- jack_default_audio_sample_t xe; /* the innput sample - filtered error */
- jack_default_audio_sample_t xp; /* x' */
- float r;
- float rm1 = state->rm1;
- unsigned int idx = state->idx;
- while (nsamples--) {
- x = *src * SAMPLE_16BIT_SCALING;
- r = ((float)fast_rand() + (float)fast_rand()) / (float)UINT_MAX - 1.0f;
- /* Filter the error with Lipshitz's minimally audible FIR:
- [2.033 -2.165 1.959 -1.590 0.6149] */
- xe = x
- - state->e[idx] * 2.033f
- + state->e[(idx - 1) & DITHER_BUF_MASK] * 2.165f
- - state->e[(idx - 2) & DITHER_BUF_MASK] * 1.959f
- + state->e[(idx - 3) & DITHER_BUF_MASK] * 1.590f
- - state->e[(idx - 4) & DITHER_BUF_MASK] * 0.6149f;
- xp = xe + r - rm1;
- rm1 = r;
- float_16_scaled (xp, *((int16_t*) dst));
- /* Intrinsic z^-1 delay */
- idx = (idx + 1) & DITHER_BUF_MASK;
- state->e[idx] = *((int16_t*) dst) - xe;
- dst += dst_skip;
- src++;
- }
- state->rm1 = rm1;
- state->idx = idx;
- }
- void sample_move_dS_s16s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- short z;
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- const float32x4_t vscaling = vdupq_n_f32(scaling);
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- int16x4_t source16x4;
- switch(src_skip) {
- case 2:
- source16x4 = vld1_s16((int16_t*)src);
- break;
- case 4:
- source16x4 = vld2_s16((int16_t*)src).val[0];
- break;
- default:
- source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
- source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
- source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
- source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
- break;
- }
- source16x4 = vreinterpret_s16_u8(vrev16_u8(vreinterpret_u8_s16(source16x4)));
- int32x4_t source32x4 = vmovl_s16(source16x4);
- src += 4 * src_skip;
- float32x4_t converted = vcvtq_f32_s32(source32x4);
- float32x4_t scaled = vmulq_f32(converted, vscaling);
- vst1q_f32(dst, scaled);
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- /* ALERT: signed sign-extension portability !!! */
- while (nsamples--) {
- z = (unsigned char)(src[0]);
- z <<= 8;
- z |= (unsigned char)(src[1]);
- #elif __BYTE_ORDER == __BIG_ENDIAN
- z = (unsigned char)(src[1]);
- z <<= 8;
- z |= (unsigned char)(src[0]);
- #endif
- *dst = z * scaling;
- dst++;
- src += src_skip;
- }
- }
- void sample_move_dS_s16 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip)
- {
- /* ALERT: signed sign-extension portability !!! */
- const jack_default_audio_sample_t scaling = 1.0/SAMPLE_16BIT_SCALING;
- #if defined (__ARM_NEON__) || defined (__ARM_NEON)
- const float32x4_t vscaling = vdupq_n_f32(scaling);
- unsigned long unrolled = nsamples / 4;
- while (unrolled--) {
- int16x4_t source16x4;
- switch(src_skip) {
- case 2:
- source16x4 = vld1_s16((int16_t*)src);
- break;
- case 4:
- source16x4 = vld2_s16((int16_t*)src).val[0];
- break;
- default:
- source16x4 = vld1_lane_s16((int16_t*)src, source16x4, 0);
- source16x4 = vld1_lane_s16((int16_t*)(src+src_skip), source16x4, 1);
- source16x4 = vld1_lane_s16((int16_t*)(src+2*src_skip), source16x4, 2);
- source16x4 = vld1_lane_s16((int16_t*)(src+3*src_skip), source16x4, 3);
- break;
- }
- int32x4_t source32x4 = vmovl_s16(source16x4);
- src += 4 * src_skip;
- float32x4_t converted = vcvtq_f32_s32(source32x4);
- float32x4_t scaled = vmulq_f32(converted, vscaling);
- vst1q_f32(dst, scaled);
- dst += 4;
- }
- nsamples = nsamples & 3;
- #endif
- while (nsamples--) {
- *dst = (*((short *) src)) * scaling;
- dst++;
- src += src_skip;
- }
- }
/*
 * Fill one channel of an interleaved buffer with a byte value.
 *
 *   dst         first unit to fill
 *   val         byte value written to every byte of every unit
 *   bytes       total number of channel bytes to fill (units * unit_bytes)
 *   unit_bytes  size of one unit (sample) in bytes
 *   skip_bytes  distance in bytes from the start of one unit to the next
 *
 * All unit sizes are filled byte-wise with memset(), so every byte of a
 * unit receives `val` regardless of the unit width and no alignment is
 * required of `dst`.  A trailing partial unit (bytes not a multiple of
 * unit_bytes) is left untouched instead of letting the counter wrap, and
 * unit_bytes == 0 is a no-op.
 */
void memset_interleave (char *dst, char val, unsigned long bytes,
			unsigned long unit_bytes,
			unsigned long skip_bytes)
{
	if (unit_bytes == 0) {
		/* nothing sensible to fill; also avoids looping forever */
		return;
	}

	while (bytes >= unit_bytes) {
		memset(dst, val, unit_bytes);
		dst += skip_bytes;
		bytes -= unit_bytes;
	}
}
/* COPY FUNCTIONS: used to move data from an input channel to an
   output channel. Note that we assume that the skip distance
   is the same for both channels. This is completely fine
   unless the input and output were on different audio interfaces that
   were interleaved differently. We don't try to handle that.
*/
/*
 * Plain contiguous copy with the same signature as the interleaving copy
 * functions: copies src_bytes bytes from src to dst.  The last two
 * arguments are accepted and ignored.
 */
void
memcpy_fake (char *dst, char *src, unsigned long src_bytes, unsigned long foo, unsigned long bar)
{
	(void) foo;
	(void) bar;

	(void) memcpy (dst, src, src_bytes);
}
/*
 * Copy 16-bit (2-byte) units from one interleaved buffer to another.
 *
 *   dst             first destination unit
 *   src             first source unit
 *   src_bytes       total number of sample bytes to copy (units * 2)
 *   dst_skip_bytes  distance in bytes between consecutive destination units
 *   src_skip_bytes  distance in bytes between consecutive source units
 *
 * Units are moved with memcpy(), like the 24-bit variant, so neither
 * pointer has to be 2-byte aligned.  A trailing odd byte is ignored
 * instead of letting the unsigned counter wrap around.
 */
void
memcpy_interleave_d16_s16 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes >= 2) {
		memcpy(dst, src, 2);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 2;
	}
}
/*
 * Copy 24-bit (3-byte) units from one interleaved buffer to another.
 *
 *   dst             first destination unit
 *   src             first source unit
 *   src_bytes       total number of sample bytes to copy (units * 3)
 *   dst_skip_bytes  distance in bytes between consecutive destination units
 *   src_skip_bytes  distance in bytes between consecutive source units
 *
 * Trailing bytes that do not make up a whole unit are ignored instead of
 * letting the unsigned counter wrap around.
 */
void
memcpy_interleave_d24_s24 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes >= 3) {
		memcpy(dst, src, 3);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 3;
	}
}
/*
 * Copy 32-bit (4-byte) units from one interleaved buffer to another.
 *
 *   dst             first destination unit
 *   src             first source unit
 *   src_bytes       total number of sample bytes to copy (units * 4)
 *   dst_skip_bytes  distance in bytes between consecutive destination units
 *   src_skip_bytes  distance in bytes between consecutive source units
 *
 * Units are moved with memcpy(), like the 24-bit variant, so neither
 * pointer has to be 4-byte aligned.  Trailing bytes that do not make up a
 * whole unit are ignored instead of letting the unsigned counter wrap
 * around.
 */
void
memcpy_interleave_d32_s32 (char *dst, char *src, unsigned long src_bytes,
			   unsigned long dst_skip_bytes, unsigned long src_skip_bytes)
{
	while (src_bytes >= 4) {
		memcpy(dst, src, 4);
		dst += dst_skip_bytes;
		src += src_skip_bytes;
		src_bytes -= 4;
	}
}