| @@ -236,26 +236,6 @@ static inline float32x4_t clip(float32x4_t s, float32x4_t min, float32x4_t max) | |||
| return vminq_f32(max, vmaxq_f32(s, min)); | |||
| } | |||
/* Multiply four packed floats by a double-precision factor.
 * Each half of the vector is widened to float64x2, scaled, and narrowed
 * back, so the multiplication itself is carried out in double precision
 * before the result is returned as a float32x4. */
static inline float32x4_t float_32_multiply_extended(float32x4_t samples, double factor)
{
	const float64x2_t scale = vdupq_n_f64(factor);
	float64x2_t lo = vmulq_f64(vcvt_f64_f32(vget_low_f32(samples)), scale);
	float64x2_t hi = vmulq_f64(vcvt_high_f64_f32(samples), scale);
	return vcvt_high_f32_f64(vcvt_f32_f64(lo), hi);
}
/* Convert four normalized float samples to signed 32-bit integers:
 * clip to [NORMALIZED_FLOAT_MIN, NORMALIZED_FLOAT_MAX], scale in double
 * precision via float_32_multiply_extended, then truncate to int32. */
static inline int32x4_t float_32_neon(float32x4_t s)
{
	const float32x4_t hi_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX);
	const float32x4_t lo_bound = vdupq_n_f32(NORMALIZED_FLOAT_MIN);
	float32x4_t bounded = clip(s, lo_bound, hi_bound);
	return vcvtq_s32_f32(float_32_multiply_extended(bounded, SAMPLE_32BIT_SCALING));
}
| static inline int32x4_t float_24_neon(float32x4_t s) | |||
| { | |||
| const float32x4_t upper_bound = vdupq_n_f32(NORMALIZED_FLOAT_MAX); | |||
| @@ -335,37 +315,9 @@ void sample_move_dS_floatLE (char *dst, jack_default_audio_sample_t *src, unsign | |||
| void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | |||
| { | |||
| #if defined (__ARM_NEON__) || defined (__ARM_NEON) | |||
| unsigned long unrolled = nsamples / 4; | |||
| nsamples = nsamples & 3; | |||
| while (unrolled--) { | |||
| float32x4_t samples = vld1q_f32(src); | |||
| int32x4_t converted = float_32_neon(samples); | |||
| converted = vreinterpretq_s32_u8(vrev32q_u8(vreinterpretq_u8_s32(converted))); | |||
| switch(dst_skip) { | |||
| case 4: | |||
| vst1q_s32((int32_t*)dst, converted); | |||
| break; | |||
| default: | |||
| vst1q_lane_s32((int32_t*)(dst), converted, 0); | |||
| vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1); | |||
| vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2); | |||
| vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3); | |||
| break; | |||
| } | |||
| dst += 4*dst_skip; | |||
| src+= 4; | |||
| } | |||
| #endif | |||
| int32_t z; | |||
| while (nsamples--) { | |||
| int32_t z; | |||
| float_32 (*src, z); | |||
| #if __BYTE_ORDER == __LITTLE_ENDIAN | |||
| dst[0]=(char)(z>>24); | |||
| dst[1]=(char)(z>>16); | |||
| @@ -384,40 +336,16 @@ void sample_move_d32_sSs (char *dst, jack_default_audio_sample_t *src, unsigned | |||
| void sample_move_d32_sS (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | |||
| { | |||
| #if defined (__ARM_NEON__) || defined (__ARM_NEON) | |||
| unsigned long unrolled = nsamples / 4; | |||
| nsamples = nsamples & 3; | |||
| while (unrolled--) { | |||
| float32x4_t samples = vld1q_f32(src); | |||
| int32x4_t converted = float_32_neon(samples); | |||
| switch(dst_skip) { | |||
| case 4: | |||
| vst1q_s32((int32_t*)dst, converted); | |||
| break; | |||
| default: | |||
| vst1q_lane_s32((int32_t*)(dst), converted, 0); | |||
| vst1q_lane_s32((int32_t*)(dst+dst_skip), converted, 1); | |||
| vst1q_lane_s32((int32_t*)(dst+2*dst_skip), converted, 2); | |||
| vst1q_lane_s32((int32_t*)(dst+3*dst_skip), converted, 3); | |||
| break; | |||
| } | |||
| dst += 4*dst_skip; | |||
| while (nsamples--) { | |||
| double sample = *((float *)src); | |||
| double clipped = fmin(1.0, fmax(sample, -1.0)); | |||
| double scaled = clipped * SAMPLE_32BIT_MAX_F; | |||
| int y = (int)scaled; | |||
| *((int *) dst) = y; | |||
| src+= 4; | |||
| dst += dst_skip; | |||
| src++; | |||
| } | |||
| #endif | |||
| while (nsamples--) { | |||
| double sample = *((float *)src); | |||
| double clipped = fmin(1.0, fmax(sample, -1.0)); | |||
| double scaled = clipped * SAMPLE_32BIT_MAX_F; | |||
| int y = (int)scaled; | |||
| *((int *) dst) = y; | |||
| dst += dst_skip; | |||
| src++; | |||
| } | |||
| } | |||
| void sample_move_d32u24_sSs (char *dst, jack_default_audio_sample_t *src, unsigned long nsamples, unsigned long dst_skip, dither_state_t *state) | |||
| @@ -822,39 +750,6 @@ void sample_move_d32l24_sS (char *dst, jack_default_audio_sample_t *src, unsigne | |||
| void sample_move_dS_s32s (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) | |||
| { | |||
| const jack_default_audio_sample_t scaling = 1.0/SAMPLE_32BIT_SCALING; | |||
| #if defined (__ARM_NEON__) || defined (__ARM_NEON) | |||
| unsigned long unrolled = nsamples / 4; | |||
| while (unrolled--) { | |||
| uint32x4_t src128; | |||
| switch(src_skip) | |||
| { | |||
| case 4: | |||
| src128 = vld1q_u32((uint32_t*)src); | |||
| break; | |||
| case 8: | |||
| src128 = vld2q_u32((uint32_t*)src).val[0]; | |||
| break; | |||
| default: | |||
| src128 = vld1q_lane_u32((uint32_t*)src, src128, 0); | |||
| src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1); | |||
| src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2); | |||
| src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3); | |||
| break; | |||
| } | |||
| src128 = vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(src128))); | |||
| float32x4_t samples = vcvtq_f32_s32(vreinterpretq_s32_u32(src128)); | |||
| samples = float_32_multiply_extended(samples, scaling); | |||
| vst1q_f32(dst, samples); | |||
| src += 4*src_skip; | |||
| dst += 4; | |||
| } | |||
| nsamples = nsamples & 3; | |||
| #endif | |||
| /* ALERT: signed sign-extension portability !!! */ | |||
| while (nsamples--) { | |||
| int32_t x; | |||
| #if __BYTE_ORDER == __LITTLE_ENDIAN | |||
| @@ -948,35 +843,6 @@ void sample_move_dS_s32l24s (jack_default_audio_sample_t *dst, char *src, unsign | |||
| void sample_move_dS_s32 (jack_default_audio_sample_t *dst, char *src, unsigned long nsamples, unsigned long src_skip) | |||
| { | |||
| const double scaling = 1.0 / SAMPLE_32BIT_SCALING; | |||
| #if defined (__ARM_NEON__) || defined (__ARM_NEON) | |||
| unsigned long unrolled = nsamples / 4; | |||
| nsamples = nsamples & 3; | |||
| while (unrolled--) { | |||
| uint32x4_t src128; | |||
| switch(src_skip) { | |||
| case 4: | |||
| src128 = vld1q_u32((uint32_t*)src); | |||
| break; | |||
| case 8: | |||
| src128 = vld2q_u32((uint32_t*)src).val[0]; | |||
| break; | |||
| default: | |||
| src128 = vld1q_lane_u32((uint32_t*)src, src128, 0); | |||
| src128 = vld1q_lane_u32((uint32_t*)(src+src_skip), src128, 1); | |||
| src128 = vld1q_lane_u32((uint32_t*)(src+2*src_skip), src128, 2); | |||
| src128 = vld1q_lane_u32((uint32_t*)(src+3*src_skip), src128, 3); | |||
| break; | |||
| } | |||
| float32x4_t samples = vcvtq_f32_s32(vreinterpretq_s32_u32(src128)); | |||
| samples = float_32_multiply_extended(samples, scaling); | |||
| vst1q_f32(dst, samples); | |||
| src += 4*src_skip; | |||
| dst += 4; | |||
| } | |||
| #endif | |||
| while (nsamples--) { | |||
| int32_t val=(*((int32_t*)src)); | |||
| double extended = val * scaling; | |||