/* Copyright 2013 The Chromium OS Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the LICENSE file.
*/
#include <limits.h>
#include <syslog.h>
#include "dsp_util.h"
#ifndef max
#define max(a, b) ({ __typeof__(a) _a = (a); \
__typeof__(b) _b = (b); \
_a > _b ? _a : _b; })
#endif
#ifndef min
#define min(a, b) ({ __typeof__(a) _a = (a); \
__typeof__(b) _b = (b); \
_a < _b ? _a : _b; })
#endif
#undef deinterleave_stereo
#undef interleave_stereo
/* Converts shorts in range of -32768 to 32767 to floats in range of
* -1.0f to 1.0f.
* scvtf instruction accepts fixed point ints, so sxtl is used to lengthen
* shorts to int with sign extension.
*/
#ifdef __aarch64__
static void deinterleave_stereo(int16_t *input, float *output1,
float *output2, int frames)
{
int chunk = frames >> 3;
frames &= 7;
/* Process 8 frames (16 samples) each loop. */
/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
if (chunk) {
__asm__ __volatile__ (
"1: \n"
"ld2 {v2.8h, v3.8h}, [%[input]], #32 \n"
"subs %w[chunk], %w[chunk], #1 \n"
"sxtl v0.4s, v2.4h \n"
"sxtl2 v1.4s, v2.8h \n"
"sxtl v2.4s, v3.4h \n"
"sxtl2 v3.4s, v3.8h \n"
"scvtf v0.4s, v0.4s, #15 \n"
"scvtf v1.4s, v1.4s, #15 \n"
"scvtf v2.4s, v2.4s, #15 \n"
"scvtf v3.4s, v3.4s, #15 \n"
"st1 {v0.4s, v1.4s}, [%[output1]], #32 \n"
"st1 {v2.4s, v3.4s}, [%[output2]], #32 \n"
"b.ne 1b \n"
: /* output */
[chunk]"+r"(chunk),
[input]"+r"(input),
[output1]"+r"(output1),
[output2]"+r"(output2)
: /* input */
: /* clobber */
"v0", "v1", "v2", "v3", "memory", "cc"
);
}
/* The remaining samples. */
while (frames--) {
*output1++ = *input++ / 32768.0f;
*output2++ = *input++ / 32768.0f;
}
}
#define deinterleave_stereo deinterleave_stereo
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
* -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
* from zero.
* Rounding is achieved by using fcvtas instruction. (a = away)
* The float scaled to a range of -32768 to 32767 by adding 15 to the exponent.
* Add to exponent is equivalent to multiply for exponent range of 0 to 239,
* which is 2.59 * 10^33. A signed saturating add (sqadd) limits exponents
* from 240 to 255 to clamp to 255.
* For very large values, beyond +/- 2 billion, fcvtas will clamp the result
* to the min or max value that fits an int.
* For other values, sqxtn clamps the output to -32768 to 32767 range.
*/
static void interleave_stereo(float *input1, float *input2,
int16_t *output, int frames)
{
/* Process 4 frames (8 samples) each loop. */
/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
int chunk = frames >> 2;
frames &= 3;
if (chunk) {
__asm__ __volatile__ (
"dup v2.4s, %w[scale] \n"
"1: \n"
"ld1 {v0.4s}, [%[input1]], #16 \n"
"ld1 {v1.4s}, [%[input2]], #16 \n"
"subs %w[chunk], %w[chunk], #1 \n"
"sqadd v0.4s, v0.4s, v2.4s \n"
"sqadd v1.4s, v1.4s, v2.4s \n"
"fcvtas v0.4s, v0.4s \n"
"fcvtas v1.4s, v1.4s \n"
"sqxtn v0.4h, v0.4s \n"
"sqxtn v1.4h, v1.4s \n"
"st2 {v0.4h, v1.4h}, [%[output]], #16 \n"
"b.ne 1b \n"
: /* output */
[chunk]"+r"(chunk),
[input1]"+r"(input1),
[input2]"+r"(input2),
[output]"+r"(output)
: /* input */
[scale]"r"(15 << 23)
: /* clobber */
"v0", "v1", "v2", "memory", "cc"
);
}
/* The remaining samples */
while (frames--) {
float f;
f = *input1++ * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
f = *input2++ * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
}
}
#define interleave_stereo interleave_stereo
#endif
#ifdef __ARM_NEON__
#include <arm_neon.h>
static void deinterleave_stereo(int16_t *input, float *output1,
float *output2, int frames)
{
/* Process 8 frames (16 samples) each loop. */
/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
int chunk = frames >> 3;
frames &= 7;
if (chunk) {
__asm__ __volatile__ (
"1: \n"
"vld2.16 {d0-d3}, [%[input]]! \n"
"subs %[chunk], #1 \n"
"vmovl.s16 q3, d3 \n"
"vmovl.s16 q2, d2 \n"
"vmovl.s16 q1, d1 \n"
"vmovl.s16 q0, d0 \n"
"vcvt.f32.s32 q3, q3, #15 \n"
"vcvt.f32.s32 q2, q2, #15 \n"
"vcvt.f32.s32 q1, q1, #15 \n"
"vcvt.f32.s32 q0, q0, #15 \n"
"vst1.32 {d4-d7}, [%[output2]]! \n"
"vst1.32 {d0-d3}, [%[output1]]! \n"
"bne 1b \n"
: /* output */
[chunk]"+r"(chunk),
[input]"+r"(input),
[output1]"+r"(output1),
[output2]"+r"(output2)
: /* input */
: /* clobber */
"q0", "q1", "q2", "q3", "memory", "cc"
);
}
/* The remaining samples. */
while (frames--) {
*output1++ = *input++ / 32768.0f;
*output2++ = *input++ / 32768.0f;
}
}
#define deinterleave_stereo deinterleave_stereo
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
* -32768 to 32767 with rounding to nearest, with ties (0.5) rounding away
* from zero.
* Rounding is achieved by adding 0.5 or -0.5 adjusted for fixed point
* precision, and then converting float to fixed point using vcvt instruction
* which truncated toward zero.
* For very large values, beyond +/- 2 billion, vcvt will clamp the result
* to the min or max value that fits an int.
* For other values, vqmovn clamps the output to -32768 to 32767 range.
*/
static void interleave_stereo(float *input1, float *input2,
int16_t *output, int frames)
{
/* Process 4 frames (8 samples) each loop. */
/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
float32x4_t pos = vdupq_n_f32(0.5f / 32768.0f);
float32x4_t neg = vdupq_n_f32(-0.5f / 32768.0f);
int chunk = frames >> 2;
frames &= 3;
if (chunk) {
__asm__ __volatile__ (
"veor q0, q0, q0 \n"
"1: \n"
"vld1.32 {d2-d3}, [%[input1]]! \n"
"vld1.32 {d4-d5}, [%[input2]]! \n"
"subs %[chunk], #1 \n"
/* We try to round to the nearest number by adding 0.5
* to positive input, and adding -0.5 to the negative
* input, then truncate.
*/
"vcgt.f32 q3, q1, q0 \n"
"vcgt.f32 q4, q2, q0 \n"
"vbsl q3, %q[pos], %q[neg] \n"
"vbsl q4, %q[pos], %q[neg] \n"
"vadd.f32 q1, q1, q3 \n"
"vadd.f32 q2, q2, q4 \n"
"vcvt.s32.f32 q1, q1, #15 \n"
"vcvt.s32.f32 q2, q2, #15 \n"
"vqmovn.s32 d2, q1 \n"
"vqmovn.s32 d3, q2 \n"
"vst2.16 {d2-d3}, [%[output]]! \n"
"bne 1b \n"
: /* output */
[chunk]"+r"(chunk),
[input1]"+r"(input1),
[input2]"+r"(input2),
[output]"+r"(output)
: /* input */
[pos]"w"(pos),
[neg]"w"(neg)
: /* clobber */
"q0", "q1", "q2", "q3", "q4", "memory", "cc"
);
}
/* The remaining samples */
while (frames--) {
float f;
f = *input1++ * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
f = *input2++ * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
}
}
#define interleave_stereo interleave_stereo
#endif
#ifdef __SSE3__
#include <emmintrin.h>
/* Converts shorts in range of -32768 to 32767 to floats in range of
* -1.0f to 1.0f.
* pslld and psrad shifts are used to isolate the low and high word, but
* each in a different range:
* The low word is shifted to the high bits in range 0x80000000 .. 0x7fff0000.
* The high word is shifted to the low bits in range 0x00008000 .. 0x00007fff.
* cvtdq2ps converts ints to floats as is.
* mulps is used to normalize the range of the low and high words, adjusting
* for high and low words being in different range.
*/
static void deinterleave_stereo(int16_t *input, float *output1,
float *output2, int frames)
{
/* Process 8 frames (16 samples) each loop. */
/* L0 R0 L1 R1 L2 R2 L3 R3... -> L0 L1 L2 L3... R0 R1 R2 R3... */
int chunk = frames >> 3;
frames &= 7;
if (chunk) {
__asm__ __volatile__ (
"1: \n"
"lddqu (%[input]), %%xmm0 \n"
"lddqu 16(%[input]), %%xmm1 \n"
"add $32, %[input] \n"
"movdqa %%xmm0, %%xmm2 \n"
"movdqa %%xmm1, %%xmm3 \n"
"pslld $16, %%xmm0 \n"
"pslld $16, %%xmm1 \n"
"psrad $16, %%xmm2 \n"
"psrad $16, %%xmm3 \n"
"cvtdq2ps %%xmm0, %%xmm0 \n"
"cvtdq2ps %%xmm1, %%xmm1 \n"
"cvtdq2ps %%xmm2, %%xmm2 \n"
"cvtdq2ps %%xmm3, %%xmm3 \n"
"mulps %[scale_2_n31], %%xmm0 \n"
"mulps %[scale_2_n31], %%xmm1 \n"
"mulps %[scale_2_n15], %%xmm2 \n"
"mulps %[scale_2_n15], %%xmm3 \n"
"movdqu %%xmm0, (%[output1]) \n"
"movdqu %%xmm1, 16(%[output1]) \n"
"movdqu %%xmm2, (%[output2]) \n"
"movdqu %%xmm3, 16(%[output2]) \n"
"add $32, %[output1] \n"
"add $32, %[output2] \n"
"sub $1, %[chunk] \n"
"jnz 1b \n"
: /* output */
[chunk]"+r"(chunk),
[input]"+r"(input),
[output1]"+r"(output1),
[output2]"+r"(output2)
: /* input */
[scale_2_n31]"x"(_mm_set1_ps(1.0f/(1<<15)/(1<<16))),
[scale_2_n15]"x"(_mm_set1_ps(1.0f/(1<<15)))
: /* clobber */
"xmm0", "xmm1", "xmm2", "xmm3", "memory", "cc"
);
}
/* The remaining samples. */
while (frames--) {
*output1++ = *input++ / 32768.0f;
*output2++ = *input++ / 32768.0f;
}
}
#define deinterleave_stereo deinterleave_stereo
/* Converts floats in range of -1.0f to 1.0f to shorts in range of
* -32768 to 32767 with rounding to nearest, with ties (0.5) rounding to
* even.
* For very large values, beyond +/- 2 billion, cvtps2dq will produce
* 0x80000000 and packssdw will clamp -32768.
*/
static void interleave_stereo(float *input1, float *input2,
int16_t *output, int frames)
{
/* Process 4 frames (8 samples) each loop. */
/* L0 L1 L2 L3, R0 R1 R2 R3 -> L0 R0 L1 R1, L2 R2 L3 R3 */
int chunk = frames >> 2;
frames &= 3;
if (chunk) {
__asm__ __volatile__ (
"1: \n"
"lddqu (%[input1]), %%xmm0 \n"
"lddqu (%[input2]), %%xmm2 \n"
"add $16, %[input1] \n"
"add $16, %[input2] \n"
"movaps %%xmm0, %%xmm1 \n"
"unpcklps %%xmm2, %%xmm0 \n"
"unpckhps %%xmm2, %%xmm1 \n"
"paddsw %[scale_2_15], %%xmm0 \n"
"paddsw %[scale_2_15], %%xmm1 \n"
"cvtps2dq %%xmm0, %%xmm0 \n"
"cvtps2dq %%xmm1, %%xmm1 \n"
"packssdw %%xmm1, %%xmm0 \n"
"movdqu %%xmm0, (%[output]) \n"
"add $16, %[output] \n"
"sub $1, %[chunk] \n"
"jnz 1b \n"
: /* output */
[chunk]"+r"(chunk),
[input1]"+r"(input1),
[input2]"+r"(input2),
[output]"+r"(output)
: /* input */
[scale_2_15]"x"(_mm_set1_epi32(15 << 23)),
[clamp_large]"x"(_mm_set1_ps(32767.0f))
: /* clobber */
"xmm0", "xmm1", "xmm2", "memory", "cc"
);
}
/* The remaining samples */
while (frames--) {
float f;
f = *input1++ * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
f = *input2++ * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
}
}
#define interleave_stereo interleave_stereo
#endif
static void dsp_util_deinterleave_s16le(int16_t *input, float *const *output,
int channels, int frames)
{
float *output_ptr[channels];
int i, j;
#ifdef deinterleave_stereo
if (channels == 2) {
deinterleave_stereo(input, output[0], output[1], frames);
return;
}
#endif
for (i = 0; i < channels; i++)
output_ptr[i] = output[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++)
*(output_ptr[j]++) = *input++ / 32768.0f;
}
static void dsp_util_deinterleave_s24le(int32_t *input, float *const *output,
int channels, int frames)
{
float *output_ptr[channels];
int i, j;
for (i = 0; i < channels; i++)
output_ptr[i] = output[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++, input++)
*(output_ptr[j]++) =
(*input << 8) / 2147483648.0f;
}
static void dsp_util_deinterleave_s243le(uint8_t *input, float *const *output,
int channels, int frames)
{
float *output_ptr[channels];
int32_t sample;
int i, j;
for (i = 0; i < channels; i++)
output_ptr[i] = output[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++, input += 3) {
sample = 0;
memcpy((uint8_t *)&sample + 1, input, 3);
*(output_ptr[j]++) = sample / 2147483648.0f;
}
}
static void dsp_util_deinterleave_s32le(int32_t *input, float *const *output,
int channels, int frames)
{
float *output_ptr[channels];
int i, j;
for (i = 0; i < channels; i++)
output_ptr[i] = output[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++, input++)
*(output_ptr[j]++) = *input / 2147483648.0f;
}
int dsp_util_deinterleave(uint8_t *input, float *const *output, int channels,
snd_pcm_format_t format, int frames)
{
switch (format) {
case SND_PCM_FORMAT_S16_LE:
dsp_util_deinterleave_s16le((int16_t *)input, output,
channels, frames);
break;
case SND_PCM_FORMAT_S24_LE:
dsp_util_deinterleave_s24le((int32_t *)input, output,
channels, frames);
break;
case SND_PCM_FORMAT_S24_3LE:
dsp_util_deinterleave_s243le(input, output,
channels, frames);
break;
case SND_PCM_FORMAT_S32_LE:
dsp_util_deinterleave_s32le((int32_t *)input, output,
channels, frames);
break;
default:
syslog(LOG_ERR, "Invalid format to deinterleave");
return -EINVAL;
}
return 0;
}
static void dsp_util_interleave_s16le(float *const *input, int16_t *output,
int channels, int frames)
{
float *input_ptr[channels];
int i, j;
#ifdef interleave_stereo
if (channels == 2) {
interleave_stereo(input[0], input[1], output, frames);
return;
}
#endif
for (i = 0; i < channels; i++)
input_ptr[i] = input[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++) {
float f = *(input_ptr[j]++) * 32768.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output++ = max(-32768, min(32767, (int)(f)));
}
}
static void dsp_util_interleave_s24le(float *const *input, int32_t *output,
int channels, int frames)
{
float *input_ptr[channels];
int i, j;
for (i = 0; i < channels; i++)
input_ptr[i] = input[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++, output++) {
float f = *(input_ptr[j]++) * 2147483648.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output = max((float)INT_MIN, min((float)INT_MAX, f));
*output >>= 8;
}
}
static void dsp_util_interleave_s243le(float *const *input, uint8_t *output,
int channels, int frames)
{
float *input_ptr[channels];
int i, j;
int32_t tmp;
for (i = 0; i < channels; i++)
input_ptr[i] = input[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++, output += 3) {
float f = *(input_ptr[j]++) * 2147483648.0f;
f += (f >= 0) ? 0.5f : -0.5f;
tmp = max((float)INT_MIN, min((float)INT_MAX, f));
tmp >>= 8;
memcpy(output, &tmp, 3);
}
}
static void dsp_util_interleave_s32le(float *const *input, int32_t *output,
int channels, int frames)
{
float *input_ptr[channels];
int i, j;
for (i = 0; i < channels; i++)
input_ptr[i] = input[i];
for (i = 0; i < frames; i++)
for (j = 0; j < channels; j++, output++) {
float f = *(input_ptr[j]++) * 2147483648.0f;
f += (f >= 0) ? 0.5f : -0.5f;
*output = max((float)INT_MIN, min((float)INT_MAX, f));
}
}
int dsp_util_interleave(float *const *input, uint8_t *output, int channels,
snd_pcm_format_t format, int frames)
{
switch (format) {
case SND_PCM_FORMAT_S16_LE:
dsp_util_interleave_s16le(input, (int16_t *)output,
channels, frames);
break;
case SND_PCM_FORMAT_S24_LE:
dsp_util_interleave_s24le(input, (int32_t *)output,
channels, frames);
break;
case SND_PCM_FORMAT_S24_3LE:
dsp_util_interleave_s243le(input, output, channels, frames);
break;
case SND_PCM_FORMAT_S32_LE:
dsp_util_interleave_s32le(input, (int32_t *)output,
channels, frames);
break;
default:
syslog(LOG_ERR, "Invalid format to interleave");
return -EINVAL;
}
return 0;
}
void dsp_enable_flush_denormal_to_zero()
{
#if defined(__i386__) || defined(__x86_64__)
unsigned int mxcsr;
mxcsr = __builtin_ia32_stmxcsr();
__builtin_ia32_ldmxcsr(mxcsr | 0x8040);
#elif defined(__aarch64__)
uint64_t cw;
__asm__ __volatile__ (
"mrs %0, fpcr \n"
"orr %0, %0, #0x1000000 \n"
"msr fpcr, %0 \n"
"isb \n"
: "=r"(cw) :: "memory");
#elif defined(__arm__)
uint32_t cw;
__asm__ __volatile__ (
"vmrs %0, fpscr \n"
"orr %0, %0, #0x1000000 \n"
"vmsr fpscr, %0 \n"
: "=r"(cw) :: "memory");
#else
#warning "Don't know how to disable denorms. Performace may suffer."
#endif
}