diff --git a/common/memops.c b/common/memops.c index 725c49e1..3f1b67e1 100644 --- a/common/memops.c +++ b/common/memops.c @@ -73,6 +73,7 @@ So, for now (October 2008) we use 2^(N-1)-1 as the scaling factor. */ +#define SAMPLE_32BIT_SCALING 2147483647.0 #define SAMPLE_24BIT_SCALING 8388607.0f #define SAMPLE_16BIT_SCALING 32767.0f @@ -81,6 +82,11 @@ advice from Fons Adriaensen: make the limits symmetrical */ +#define SAMPLE_32BIT_MAX 2147483647 +#define SAMPLE_32BIT_MIN -2147483647 +#define SAMPLE_32BIT_MAX_F 2147483647.0 +#define SAMPLE_32BIT_MIN_F -2147483647.0 + #define SAMPLE_24BIT_MAX 8388607 #define SAMPLE_24BIT_MIN -8388607 #define SAMPLE_24BIT_MAX_F 8388607.0f @@ -106,6 +112,7 @@ */ #define f_round(f) lrintf(f) +#define d_round(f) lrint(f) #define float_16(s, d)\ if ((s) <= NORMALIZED_FLOAT_MIN) {\ @@ -146,6 +153,16 @@ (d) = f_round ((s) * SAMPLE_24BIT_SCALING); \ } +#define float_32(s, d) \ + if ((s) <= NORMALIZED_FLOAT_MIN) {\ + (d) = SAMPLE_32BIT_MIN; \ + } else if ((s) >= NORMALIZED_FLOAT_MAX) {\ + (d) = SAMPLE_32BIT_MAX; \ + } else {\ + double extended_value = ((double)s) * SAMPLE_32BIT_SCALING; \ + (d) = d_round (extended_value); \ + } + /* call this when "s" has already been scaled (e.g. when dithering) */ @@ -195,6 +212,11 @@ static inline __m128 clip(__m128 s, __m128 min, __m128 max) return _mm_min_ps(max, _mm_max_ps(s, min)); } +static inline __m128d clip_double(__m128d s, __m128d min, __m128d max) +{ + return _mm_min_pd(max, _mm_max_pd(s, min)); +} + static inline __m128i float_24_sse(__m128 s) { const __m128 upper_bound = gen_one(); /* NORMALIZED_FLOAT_MAX */ @@ -214,6 +236,26 @@ static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max) return vminq_f32(max, vmaxq_f32(s, min)); } +static inline float32x4_t float_32_multiply_extended(float32x4_t samples, double factor) +{ + float64x2_t factor64 = vdupq_n_f64(factor); + float64x2_t lower_elements = vcvt_f64_f32(vget_low_f32(samples)); + float64x2_t upper_elements = vcvt_high_f64_f32(samples); + lower_elements = vmulq_f64(lower_elements, factor64); + upper_elements = vmulq_f64(upper_elements, factor64); + float32x2_t lower_down_scaled = vcvt_f32_f64(lower_elements); + return vcvt_high_f32_f64(lower_down_scaled, upper_elements); +} + +static inline int32x4_t float_32_neon(float32x4_t s) +{ + const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX); + const float32x4_t lower_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN); + + float32x4_t clipped = clip(s, lower_bound, upper_bound); + return vcvtq_s32_f32(float_32_multiply_extended(clipped, SAMPLE_32BIT_SCALING)); +} + static inline int32x4_t float_24_neon(float32x4_t s) { const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX); @@ -274,6 +316,7 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign S - sample is a jack_default_audio_sample_t, currently (October 2008) a 32 bit floating point value Ss - like S but reverse endian from the host CPU + 32 - sample is an signed 32 bit integer value 32u24 - sample is an signed 32 bit integer value, but data is in upper 24 bits only 32u24s - like 32u24 but reverse endian from the host CPU 32l24 - sample is an signed 32 bit integer value, but data is in lower 24 bits only @@ -290,6 +333,93 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign /* functions for native integer sample data */ +void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) +{ +#if defined (__ARM_NEON__) || defined (__ARM_NEON) + unsigned long unrolled = nsamples / 4; + nsamples = nsamples & 3; + + while (unrolled--) { + float32x4_t samples = vld1q_f32(src); + int32x4_t converted = float_32_neon(samples); + converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted))); + + switch(dst_skip) { + case 4: + vst1q_s32((int32_t*)dst, converted); + break; + default: + vst1q_lane_s32((int32_t*)(dst), converted, 0); + vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1); + vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2); + vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3); + break; + } + dst += 4*dst_skip; + src+= 4; + } +#endif + + int32_t z; + + while (nsamples--) { + + float_32 (*src, z); + +#if __BYTE_ORDER == __LITTLE_ENDIAN + dst[0]=(char)(z>>24); + dst[1]=(char)(z>>16); + dst[2]=(char)(z>>8); + dst[3]=(char)(z); +#elif __BYTE_ORDER == __BIG_ENDIAN + dst[0]=(char)(z); + dst[1]=(char)(z>>8); + dst[2]=(char)(z>>16); + dst[3]=(char)(z>>24); +#endif + dst += dst_skip; + src++; + } +} + +void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) +{ +#if defined (__ARM_NEON__) || defined (__ARM_NEON) + unsigned long unrolled = nsamples / 4; + nsamples = nsamples & 3; + + while (unrolled--) { + float32x4_t samples = vld1q_f32(src); + int32x4_t converted = float_32_neon(samples); + + switch(dst_skip) { + case 4: + vst1q_s32((int32_t*)dst, converted); + break; + default: + vst1q_lane_s32((int32_t*)(dst), converted, 0); + vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1); + vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2); + vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3); + break; + } + dst += 4*dst_skip; + + src+= 4; + } +#endif + while (nsamples--) { + double sample = *((float *)src); + double clipped = fmin(1.0, fmax(sample, -1.0)); + double scaled = clipped * SAMPLE_32BIT_MAX_F; + int y = (int)scaled; + *((int *) dst) = y; + + dst += dst_skip; + src++; + } +} + void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) { #if defined (__ARM_NEON__) || defined (__ARM_NEON) @@ -689,6 +819,68 @@ void sample_move_d32l24_sS (char *dst, jack_default_audio_sample_t *src, unsigne #endif } +void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) +{ + const jack_default_audio_sample_t scaling = 1.0/SAMPLE_32BIT_SCALING; +#if defined (__ARM_NEON__) || defined (__ARM_NEON) + unsigned long unrolled = nsamples / 4; + while (unrolled--) { + uint32x4_t src128; + switch(src_skip) + { + case 4: + src128 = vld1q_u32((uint32_t*)src); + break; + case 8: + src128 = vld2q_u32((uint32_t*)src).val[0]; + break; + default: + src128 = vld1q_lane_u32((uint32_t*)src, src128, 0); + src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1); + src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2); + src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3); + break; + } + src128 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(src128))); + float32x4_t samples = vcvtq_f32_s32(vreinterpretq_s32_u32(src128)); + samples = float_32_multiply_extended(samples, scaling); + vst1q_f32(dst, samples); + + src += 4*src_skip; + dst += 4; + } + nsamples = nsamples & 3; +#endif + + /* ALERT: signed sign-extension portability !!! */ + + + while (nsamples--) { + int32_t x; +#if __BYTE_ORDER == __LITTLE_ENDIAN + x = (unsigned char)(src[0]); + x <<= 8; + x |= (unsigned char)(src[1]); + x <<= 8; + x |= (unsigned char)(src[2]); + x <<= 8; + x |= (unsigned char)(src[3]); +#elif __BYTE_ORDER == __BIG_ENDIAN + x = (unsigned char)(src[3]); + x <<= 8; + x |= (unsigned char)(src[2]); + x <<= 8; + x |= (unsigned char)(src[1]); + x <<= 8; + x |= (unsigned char)(src[0]); +#endif + double extended = x * scaling; + *dst = (float)extended; + dst++; + src += src_skip; + } +} + void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) { #if defined (__ARM_NEON__) || defined (__ARM_NEON) @@ -753,6 +945,47 @@ void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsign } } +void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) +{ + const double scaling = 1.0 / SAMPLE_32BIT_SCALING; +#if defined (__ARM_NEON__) || defined (__ARM_NEON) + unsigned long unrolled = nsamples / 4; + nsamples = nsamples & 3; + while (unrolled--) { + uint32x4_t src128; + switch(src_skip) { + case 4: + src128 = vld1q_u32((uint32_t*)src); + break; + case 8: + src128 = vld2q_u32((uint32_t*)src).val[0]; + break; + default: + src128 = vld1q_lane_u32((uint32_t*)src, src128, 0); + src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1); + src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2); + src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3); + break; + } + + float32x4_t samples = vcvtq_f32_s32(vreinterpretq_s32_u32(src128)); + samples = float_32_multiply_extended(samples, scaling); + vst1q_f32(dst, samples); + + src += 4*src_skip; + dst += 4; + } +#endif + + while (nsamples--) { + int32_t val=(*((int32_t*)src)); + double extended = val * scaling; + *dst = (float)extended; + dst++; + src += src_skip; + } +} + void sample_move_dS_s32l24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) { #if defined (__SSE2__) && !defined (__sun__) diff --git a/common/memops.h b/common/memops.h index a69087ff..edc99726 100644 --- a/common/memops.h +++ b/common/memops.h @@ -53,6 +53,8 @@ void sample_move_floatLE_sSs (jack_default_audio_sample_t *dst, char *src, unsig void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); /* integer functions */ +void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); +void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); void sample_move_d32u24_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); void sample_move_d32l24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); @@ -81,6 +83,8 @@ void sample_move_dither_tri_d16_sS (char *dst, jack_default_audio_sample_ void sample_move_dither_shaped_d16_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); void sample_move_dither_shaped_d16_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state); +void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip); +void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip); void sample_move_dS_s32u24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip); void sample_move_dS_s32u24 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip); void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip); diff --git a/linux/alsa/alsa_driver.c b/linux/alsa/alsa_driver.c index f14c8884..6a7b92d6 100644 --- a/linux/alsa/alsa_driver.c +++ b/linux/alsa/alsa_driver.c @@ -314,10 +314,24 @@ alsa_driver_setup_io_function_pointers (alsa_driver_t *driver) break; case 4: /* NO DITHER */ + switch (driver->playback_sample_format) + { + case SND_PCM_FORMAT_S24_LE: + case SND_PCM_FORMAT_S24_BE: driver->write_via_copy = driver->quirk_bswap? sample_move_d32l24_sSs: sample_move_d32l24_sS; break; + case SND_PCM_FORMAT_S32_LE: + driver->write_via_copy = driver->quirk_bswap? + sample_move_d32_sSs: + sample_move_d32_sS; + break; + default: + jack_error("unsupported 4 byte sample_format"); + exit (1); + } + break; default: jack_error ("impossible sample width (%d) discovered!", @@ -343,10 +357,24 @@ alsa_driver_setup_io_function_pointers (alsa_driver_t *driver) sample_move_dS_s24; break; case 4: + switch (driver->capture_sample_format) + { + case SND_PCM_FORMAT_S24_LE: + case SND_PCM_FORMAT_S24_BE: driver->read_via_copy = driver->quirk_bswap? sample_move_dS_s32l24s: sample_move_dS_s32l24; break; + case SND_PCM_FORMAT_S32_LE: + driver->read_via_copy = driver->quirk_bswap? + sample_move_dS_s32s: + sample_move_dS_s32; + break; + default: + jack_error("unsupported 4 byte sample_format"); + exit (1); + } + break; } } }