diff options
author | Lennart Poettering <lennart@poettering.net> | 2009-08-23 00:06:35 +0200 |
---|---|---|
committer | Lennart Poettering <lennart@poettering.net> | 2009-08-23 00:06:35 +0200 |
commit | ab5ac06ac76c3afbbd99bce2840329dd74756a73 (patch) | |
tree | 5735f1e62502d0706a0ed7631cb365cab281303a /src/pulsecore | |
parent | d6fb8d10819bebc1cee203de7330cceeafde9fed (diff) | |
parent | 6076cef2092391d8b46aa84f86857cffebce4583 (diff) |
Merge commit 'wtay/optimize'
Diffstat (limited to 'src/pulsecore')
-rw-r--r-- | src/pulsecore/cpu-arm.c | 140 | ||||
-rw-r--r-- | src/pulsecore/cpu-arm.h | 42 | ||||
-rw-r--r-- | src/pulsecore/cpu-x86.c | 122 | ||||
-rw-r--r-- | src/pulsecore/cpu-x86.h | 68 | ||||
-rw-r--r-- | src/pulsecore/remap.c | 204 | ||||
-rw-r--r-- | src/pulsecore/remap.h | 48 | ||||
-rw-r--r-- | src/pulsecore/remap_mmx.c | 148 | ||||
-rw-r--r-- | src/pulsecore/resampler.c | 229 | ||||
-rw-r--r-- | src/pulsecore/sample-util.c | 396 | ||||
-rw-r--r-- | src/pulsecore/sample-util.h | 5 | ||||
-rw-r--r-- | src/pulsecore/sconv-s16le.c | 42 | ||||
-rw-r--r-- | src/pulsecore/sconv.c | 188 | ||||
-rw-r--r-- | src/pulsecore/sconv.h | 6 | ||||
-rw-r--r-- | src/pulsecore/svolume_arm.c | 195 | ||||
-rw-r--r-- | src/pulsecore/svolume_c.c | 335 | ||||
-rw-r--r-- | src/pulsecore/svolume_mmx.c | 313 | ||||
-rw-r--r-- | src/pulsecore/svolume_sse.c | 314 |
17 files changed, 2199 insertions, 596 deletions
diff --git a/src/pulsecore/cpu-arm.c b/src/pulsecore/cpu-arm.c new file mode 100644 index 00000000..5a994b71 --- /dev/null +++ b/src/pulsecore/cpu-arm.c @@ -0,0 +1,140 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdint.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#include <pulse/xmalloc.h> +#include <pulsecore/log.h> + +#include "cpu-arm.h" + +#if defined (__arm__) && defined (__linux__) + +#define MAX_BUFFER 4096 +static char * +get_cpuinfo_line (char *cpuinfo, const char *tag) { + char *line, *end, *colon; + + if (!(line = strstr (cpuinfo, tag))) + return NULL; + + if (!(end = strchr (line, '\n'))) + return NULL; + + if (!(colon = strchr (line, ':'))) + return NULL; + + if (++colon >= end) + return NULL; + + return pa_xstrndup (colon, end - colon); +} + +static char *get_cpuinfo(void) { + char *cpuinfo; + int n, fd; + + if (!(cpuinfo = malloc(MAX_BUFFER))) + return NULL; + + if ((fd = open("/proc/cpuinfo", O_RDONLY)) < 0) { + free (cpuinfo); + return NULL; + } + + if ((n = read(fd, cpuinfo, MAX_BUFFER-1)) < 0) { + free (cpuinfo); + close (fd); + return NULL; + } + cpuinfo[n] = 0; + close (fd); + + return cpuinfo; +} +#endif /* defined (__arm__) && defined (__linux__) */ + +void pa_cpu_init_arm (void) { +#if defined (__arm__) +#if defined (__linux__) + char *cpuinfo, *line; + int arch; + pa_cpu_arm_flag_t flags = 0; + + /* We need to read the CPU flags from /proc/cpuinfo because there is no user + * space support to get the CPU features. This only works on linux AFAIK. */ + if (!(cpuinfo = get_cpuinfo ())) { + pa_log ("Can't read cpuinfo"); + return; + } + + /* get the CPU architecture */ + if ((line = get_cpuinfo_line (cpuinfo, "CPU architecture"))) { + arch = strtoul (line, NULL, 0); + if (arch >= 6) + flags |= PA_CPU_ARM_V6; + if (arch >= 7) + flags |= PA_CPU_ARM_V7; + + free (line); + } + /* get the CPU features */ + if ((line = get_cpuinfo_line (cpuinfo, "Features"))) { + char *state = NULL, *current; + + while ((current = pa_split_spaces (line, &state))) { + if (!strcmp (current, "vfp")) + flags |= PA_CPU_ARM_VFP; + else if (!strcmp (current, "edsp")) + flags |= PA_CPU_ARM_EDSP; + else if (!strcmp (current, "neon")) + flags |= PA_CPU_ARM_NEON; + else if (!strcmp (current, "vfpv3")) + flags |= PA_CPU_ARM_VFPV3; + + free (current); + } + } + free (cpuinfo); + + pa_log_info ("CPU flags: %s%s%s%s%s%s", + (flags & PA_CPU_ARM_V6) ? "V6 " : "", + (flags & PA_CPU_ARM_V7) ? "V7 " : "", + (flags & PA_CPU_ARM_VFP) ? "VFP " : "", + (flags & PA_CPU_ARM_EDSP) ? "EDSP " : "", + (flags & PA_CPU_ARM_NEON) ? "NEON " : "", + (flags & PA_CPU_ARM_VFPV3) ? "VFPV3 " : ""); +#else /* defined (__linux__) */ + pa_log ("ARM cpu features not yet supported on this OS"); +#endif /* defined (__linux__) */ + + if (flags & PA_CPU_ARM_V6) + pa_volume_func_init_arm (flags); +#endif /* defined (__arm__) */ +} diff --git a/src/pulsecore/cpu-arm.h b/src/pulsecore/cpu-arm.h new file mode 100644 index 00000000..3ccd0708 --- /dev/null +++ b/src/pulsecore/cpu-arm.h @@ -0,0 +1,42 @@ +#ifndef foocpuarmhfoo +#define foocpuarmhfoo + +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#include <stdint.h> + +typedef enum pa_cpu_arm_flag { + PA_CPU_ARM_V6 = (1 << 0), + PA_CPU_ARM_V7 = (1 << 1), + PA_CPU_ARM_VFP = (1 << 2), + PA_CPU_ARM_EDSP = (1 << 3), + PA_CPU_ARM_NEON = (1 << 4), + PA_CPU_ARM_VFPV3 = (1 << 5) +} pa_cpu_arm_flag_t; + +void pa_cpu_init_arm (void); + +/* some optimized functions */ +void pa_volume_func_init_arm(pa_cpu_arm_flag_t flags); + +#endif /* foocpuarmhfoo */ diff --git a/src/pulsecore/cpu-x86.c b/src/pulsecore/cpu-x86.c new file mode 100644 index 00000000..bc093ec0 --- /dev/null +++ b/src/pulsecore/cpu-x86.c @@ -0,0 +1,122 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdint.h> + +#include <pulsecore/log.h> + +#include "cpu-x86.h" + +#if defined (__i386__) || defined (__amd64__) +static void +get_cpuid (uint32_t op, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) +{ + __asm__ __volatile__ ( + " push %%"PA_REG_b" \n\t" + " cpuid \n\t" + " mov %%ebx, %%esi \n\t" + " pop %%"PA_REG_b" \n\t" + + : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d) + : "0" (op) + ); +} +#endif + +void pa_cpu_init_x86 (void) { +#if defined (__i386__) || defined (__amd64__) + uint32_t eax, ebx, ecx, edx; + uint32_t level; + pa_cpu_x86_flag_t flags = 0; + + /* get standard level */ + get_cpuid (0x00000000, &level, &ebx, &ecx, &edx); + if (level >= 1) { + get_cpuid (0x00000001, &eax, &ebx, &ecx, &edx); + + if (edx & (1<<23)) + flags |= PA_CPU_X86_MMX; + + if (edx & (1<<25)) + flags |= PA_CPU_X86_SSE; + + if (edx & (1<<26)) + flags |= PA_CPU_X86_SSE2; + + if (ecx & (1<<0)) + flags |= PA_CPU_X86_SSE3; + + if (ecx & (1<<9)) + flags |= PA_CPU_X86_SSSE3; + + if (ecx & (1<<19)) + flags |= PA_CPU_X86_SSE4_1; + + if (ecx & (1<<20)) + flags |= PA_CPU_X86_SSE4_2; + } + + /* get extended level */ + get_cpuid (0x80000000, &level, &ebx, &ecx, &edx); + if (level >= 0x80000001) { + get_cpuid (0x80000001, &eax, &ebx, &ecx, &edx); + + if (edx & (1<<22)) + flags |= PA_CPU_X86_MMXEXT; + + if (edx & (1<<23)) + flags |= PA_CPU_X86_MMX; + + if (edx & (1<<30)) + flags |= PA_CPU_X86_3DNOWEXT; + + if (edx & (1<<31)) + flags |= PA_CPU_X86_3DNOW; + } + + pa_log_info ("CPU flags: %s%s%s%s%s%s%s%s%s%s", + (flags & PA_CPU_X86_MMX) ? "MMX " : "", + (flags & PA_CPU_X86_SSE) ? "SSE " : "", + (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "", + (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "", + (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "", + (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "", + (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "", + (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "", + (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "", + (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : ""); + + /* activate various optimisations */ + if (flags & PA_CPU_X86_MMX) { + pa_volume_func_init_mmx (flags); + pa_remap_func_init_mmx (flags); + } + + if (flags & PA_CPU_X86_SSE) + pa_volume_func_init_sse (flags); + +#endif /* defined (__i386__) || defined (__amd64__) */ +} diff --git a/src/pulsecore/cpu-x86.h b/src/pulsecore/cpu-x86.h new file mode 100644 index 00000000..b11ef6ea --- /dev/null +++ b/src/pulsecore/cpu-x86.h @@ -0,0 +1,68 @@ +#ifndef foocpux86hfoo +#define foocpux86hfoo + +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#include <stdint.h> + +typedef enum pa_cpu_x86_flag { + PA_CPU_X86_MMX = (1 << 0), + PA_CPU_X86_MMXEXT = (1 << 1), + PA_CPU_X86_SSE = (1 << 2), + PA_CPU_X86_SSE2 = (1 << 3), + PA_CPU_X86_SSE3 = (1 << 4), + PA_CPU_X86_SSSE3 = (1 << 5), + PA_CPU_X86_SSE4_1 = (1 << 6), + PA_CPU_X86_SSE4_2 = (1 << 7), + PA_CPU_X86_3DNOW = (1 << 8), + PA_CPU_X86_3DNOWEXT = (1 << 9) +} pa_cpu_x86_flag_t; + +void pa_cpu_init_x86 (void); + + +#if defined (__i386__) +typedef int32_t pa_reg_x86; +#define PA_REG_a "eax" +#define PA_REG_b "ebx" +#define PA_REG_c "ecx" +#define PA_REG_d "edx" +#define PA_REG_D "edi" +#define PA_REG_S "esi" +#elif defined (__amd64__) +typedef int64_t pa_reg_x86; +#define PA_REG_a "rax" +#define PA_REG_b "rbx" +#define PA_REG_c "rcx" +#define PA_REG_d "rdx" +#define PA_REG_D "rdi" +#define PA_REG_S "rsi" +#endif + +/* some optimized functions */ +void pa_volume_func_init_mmx(pa_cpu_x86_flag_t flags); +void pa_volume_func_init_sse(pa_cpu_x86_flag_t flags); + +void pa_remap_func_init_mmx(pa_cpu_x86_flag_t flags); + +#endif /* foocpux86hfoo */ diff --git a/src/pulsecore/remap.c b/src/pulsecore/remap.c new file mode 100644 index 00000000..a0fc85b9 --- /dev/null +++ b/src/pulsecore/remap.c @@ -0,0 +1,204 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk.com> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <string.h> + +#include <pulse/sample.h> +#include <pulsecore/log.h> +#include <pulsecore/macro.h> + +#include "remap.h" + +static void remap_mono_to_stereo_c (pa_remap_t *m, void *dst, const void *src, unsigned n) { + unsigned i; + + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + float *d, *s; + + d = (float *) dst; + s = (float *) src; + + for (i = n >> 2; i; i--) { + d[0] = d[1] = s[0]; + d[2] = d[3] = s[1]; + d[4] = d[5] = s[2]; + d[6] = d[7] = s[3]; + s += 4; + d += 8; + } + for (i = n & 3; i; i--) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } + case PA_SAMPLE_S16NE: + { + int16_t *d, *s; + + d = (int16_t *) dst; + s = (int16_t *) src; + + for (i = n >> 2; i; i--) { + d[0] = d[1] = s[0]; + d[2] = d[3] = s[1]; + d[4] = d[5] = s[2]; + d[6] = d[7] = s[3]; + s += 4; + d += 8; + } + for (i = n & 3; i; i--) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } + default: + pa_assert_not_reached(); + } +} + +static void remap_channels_matrix_c (pa_remap_t *m, void *dst, const void *src, unsigned n) { + unsigned oc, ic, i; + unsigned n_ic, n_oc; + + n_ic = m->i_ss->channels; + n_oc = m->o_ss->channels; + + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + float *d, *s; + + memset(dst, 0, n * sizeof (float) * n_oc); + + for (oc = 0; oc < n_oc; oc++) { + + for (ic = 0; ic < n_ic; ic++) { + float vol; + + vol = m->map_table_f[oc][ic]; + + if (vol <= 0.0) + continue; + + d = (float *)dst + oc; + s = (float *)src + ic; + + if (vol >= 1.0) { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += *s; + } else { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += *s * vol; + } + } + } + + break; + } + case PA_SAMPLE_S16NE: + { + int16_t *d, *s; + + memset(dst, 0, n * sizeof (int16_t) * n_oc); + + for (oc = 0; oc < n_oc; oc++) { + + for (ic = 0; ic < n_ic; ic++) { + int32_t vol; + + vol = m->map_table_i[oc][ic]; + + if (vol <= 0) + continue; + + d = (int16_t *)dst + oc; + s = (int16_t *)src + ic; + + if (vol >= 0x10000) { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += *s; + } else { + for (i = n; i > 0; i--, s += n_ic, d += n_oc) + *d += (int16_t) (((int32_t)*s * vol) >> 16); + } + } + } + break; + } + default: + pa_assert_not_reached(); + } +} + +/* set the function that will execute the remapping based on the matrices */ +static void init_remap_c (pa_remap_t *m) { + unsigned n_oc, n_ic; + + n_oc = m->o_ss->channels; + n_ic = m->i_ss->channels; + + /* find some common channel remappings, fall back to full matrix operation. */ + if (n_ic == 1 && n_oc == 2 && + m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0) { + m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_c; + pa_log_info("Using mono to stereo remapping"); + } else { + m->do_remap = (pa_do_remap_func_t) remap_channels_matrix_c; + pa_log_info("Using generic matrix remapping"); + } +} + + +/* default C implementation */ +static pa_init_remap_func_t remap_func = init_remap_c; + +void pa_init_remap (pa_remap_t *m) { + pa_assert (remap_func); + + m->do_remap = NULL; + + /* call the installed remap init function */ + remap_func (m); + + if (m->do_remap == NULL) { + /* nothing was installed, fallback to C version */ + init_remap_c (m); + } +} + +pa_init_remap_func_t pa_get_init_remap_func(void) { + return remap_func; +} + +void pa_set_init_remap_func(pa_init_remap_func_t func) { + remap_func = func; +} diff --git a/src/pulsecore/remap.h b/src/pulsecore/remap.h new file mode 100644 index 00000000..32a67cdd --- /dev/null +++ b/src/pulsecore/remap.h @@ -0,0 +1,48 @@ +#ifndef fooremapfoo +#define fooremapfoo + +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk.com> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#include <pulse/sample.h> + +typedef struct pa_remap pa_remap_t; + +typedef void (*pa_do_remap_func_t) (pa_remap_t *m, void *d, const void *s, unsigned n); + +struct pa_remap { + pa_sample_format_t *format; + pa_sample_spec *i_ss, *o_ss; + float map_table_f[PA_CHANNELS_MAX][PA_CHANNELS_MAX]; + int32_t map_table_i[PA_CHANNELS_MAX][PA_CHANNELS_MAX]; + pa_do_remap_func_t do_remap; +}; + +void pa_init_remap (pa_remap_t *m); + +/* custom installation of init functions */ +typedef void (*pa_init_remap_func_t) (pa_remap_t *m); + +pa_init_remap_func_t pa_get_init_remap_func(void); +void pa_set_init_remap_func(pa_init_remap_func_t func); + +#endif /* fooremapfoo */ diff --git a/src/pulsecore/remap_mmx.c b/src/pulsecore/remap_mmx.c new file mode 100644 index 00000000..bfcae6c5 --- /dev/null +++ b/src/pulsecore/remap_mmx.c @@ -0,0 +1,148 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk.com> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <string.h> + +#include <pulse/sample.h> +#include <pulsecore/log.h> +#include <pulsecore/macro.h> + +#include "cpu-x86.h" +#include "remap.h" + +#define LOAD_SAMPLES \ + " movq (%1), %%mm0 \n\t" \ + " movq 8(%1), %%mm2 \n\t" \ + " movq 16(%1), %%mm4 \n\t" \ + " movq 24(%1), %%mm6 \n\t" \ + " movq %%mm0, %%mm1 \n\t" \ + " movq %%mm2, %%mm3 \n\t" \ + " movq %%mm4, %%mm5 \n\t" \ + " movq %%mm6, %%mm7 \n\t" + +#define UNPACK_SAMPLES(s) \ + " punpckl"#s" %%mm0, %%mm0 \n\t" \ + " punpckh"#s" %%mm1, %%mm1 \n\t" \ + " punpckl"#s" %%mm2, %%mm2 \n\t" \ + " punpckh"#s" %%mm3, %%mm3 \n\t" \ + " punpckl"#s" %%mm4, %%mm4 \n\t" \ + " punpckh"#s" %%mm5, %%mm5 \n\t" \ + " punpckl"#s" %%mm6, %%mm6 \n\t" \ + " punpckh"#s" %%mm7, %%mm7 \n\t" \ + +#define STORE_SAMPLES \ + " movq %%mm0, (%0) \n\t" \ + " movq %%mm1, 8(%0) \n\t" \ + " movq %%mm2, 16(%0) \n\t" \ + " movq %%mm3, 24(%0) \n\t" \ + " movq %%mm4, 32(%0) \n\t" \ + " movq %%mm5, 40(%0) \n\t" \ + " movq %%mm6, 48(%0) \n\t" \ + " movq %%mm7, 56(%0) \n\t" \ + " add $32, %1 \n\t" \ + " add $64, %0 \n\t" + +#define HANDLE_SINGLE(s) \ + " movd (%1), %%mm0 \n\t" \ + " movq %%mm0, %%mm1 \n\t" \ + " punpckl"#s" %%mm0, %%mm0 \n\t" \ + " movq %%mm0, (%0) \n\t" \ + " add $4, %1 \n\t" \ + " add $8, %0 \n\t" + +#define MONO_TO_STEREO(s) \ + " mov %3, %2 \n\t" \ + " sar $3, %2 \n\t" \ + " cmp $0, %2 \n\t" \ + " je 2f \n\t" \ + "1: \n\t" \ + LOAD_SAMPLES \ + UNPACK_SAMPLES(s) \ + STORE_SAMPLES \ + " dec %2 \n\t" \ + " jne 1b \n\t" \ + "2: \n\t" \ + " mov %3, %2 \n\t" \ + " and $7, %2 \n\t" \ + " je 4f \n\t" \ + "3: \n\t" \ + HANDLE_SINGLE(s) \ + " dec %2 \n\t" \ + " jne 3b \n\t" \ + "4: \n\t" \ + " emms \n\t" + +static void remap_mono_to_stereo_mmx (pa_remap_t *m, void *dst, const void *src, unsigned n) { + pa_reg_x86 temp; + + switch (*m->format) { + case PA_SAMPLE_FLOAT32NE: + { + __asm__ __volatile__ ( + MONO_TO_STEREO(dq) /* do doubles to quads */ + : "+r" (dst), "+r" (src), "=&r" (temp) + : "r" ((pa_reg_x86)n) + : "cc" + ); + break; + } + case PA_SAMPLE_S16NE: + { + __asm__ __volatile__ ( + MONO_TO_STEREO(wd) /* do words to doubles */ + : "+r" (dst), "+r" (src), "=&r" (temp) + : "r" ((pa_reg_x86)n) + : "cc" + ); + break; + } + default: + pa_assert_not_reached(); + } +} + +/* set the function that will execute the remapping based on the matrices */ +static void init_remap_mmx (pa_remap_t *m) { + unsigned n_oc, n_ic; + + n_oc = m->o_ss->channels; + n_ic = m->i_ss->channels; + + /* find some common channel remappings, fall back to full matrix operation. */ + if (n_ic == 1 && n_oc == 2 && + m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0) { + m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_mmx; + pa_log_info("Using MMX mono to stereo remapping"); + } +} + +void pa_remap_func_init_mmx (pa_cpu_x86_flag_t flags) { +#if defined (__i386__) || defined (__amd64__) + pa_log_info("Initialising MMX optimized remappers."); + + pa_set_init_remap_func ((pa_init_remap_func_t) init_remap_mmx); +#endif /* defined (__i386__) || defined (__amd64__) */ +} diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c index 59e0a0c1..f1bfa156 100644 --- a/src/pulsecore/resampler.c +++ b/src/pulsecore/resampler.c @@ -31,9 +31,6 @@ #include <speex/speex_resampler.h> -#include <liboil/liboilfuncs.h> -#include <liboil/liboil.h> - #include <pulse/xmalloc.h> #include <pulsecore/sconv.h> #include <pulsecore/log.h> @@ -43,6 +40,7 @@ #include "ffmpeg/avcodec.h" #include "resampler.h" +#include "remap.h" /* Number of samples of extra space we allow the resamplers to return */ #define EXTRA_FRAMES 128 @@ -64,7 +62,7 @@ struct pa_resampler { pa_convert_func_t to_work_format_func; pa_convert_func_t from_work_format_func; - float map_table[PA_CHANNELS_MAX][PA_CHANNELS_MAX]; + pa_remap_t remap; pa_bool_t map_required; void (*impl_free)(pa_resampler *r); @@ -214,6 +212,11 @@ pa_resampler* pa_resampler_new( r->i_ss = *a; r->o_ss = *b; + /* set up the remap structure */ + r->remap.i_ss = &r->i_ss; + r->remap.o_ss = &r->o_ss; + r->remap.format = &r->work_format; + if (am) r->i_cm = *am; else if (!pa_channel_map_init_auto(&r->i_cm, r->i_ss.channels, PA_CHANNEL_MAP_DEFAULT)) @@ -580,32 +583,41 @@ static int front_rear_side(pa_channel_position_t p) { static void calc_map_table(pa_resampler *r) { unsigned oc, ic; + unsigned n_oc, n_ic; pa_bool_t ic_connected[PA_CHANNELS_MAX]; pa_bool_t remix; pa_strbuf *s; char *t; + pa_remap_t *m; pa_assert(r); if (!(r->map_required = (r->i_ss.channels != r->o_ss.channels || (!(r->flags & PA_RESAMPLER_NO_REMAP) && !pa_channel_map_equal(&r->i_cm, &r->o_cm))))) return; - memset(r->map_table, 0, sizeof(r->map_table)); + m = &r->remap; + + n_oc = r->o_ss.channels; + n_ic = r->i_ss.channels; + + memset(m->map_table_f, 0, sizeof(m->map_table_f)); + memset(m->map_table_i, 0, sizeof(m->map_table_i)); + memset(ic_connected, 0, sizeof(ic_connected)); remix = (r->flags & (PA_RESAMPLER_NO_REMAP|PA_RESAMPLER_NO_REMIX)) == 0; - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { pa_bool_t oc_connected = FALSE; pa_channel_position_t b = r->o_cm.map[oc]; - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { pa_channel_position_t a = r->i_cm.map[ic]; if (r->flags & PA_RESAMPLER_NO_REMAP) { /* We shall not do any remapping. Hence, just check by index */ if (ic == oc) - r->map_table[oc][ic] = 1.0; + m->map_table_f[oc][ic] = 1.0; continue; } @@ -614,7 +626,7 @@ static void calc_map_table(pa_resampler *r) { /* We shall not do any remixing. Hence, just check by name */ if (a == b) - r->map_table[oc][ic] = 1.0; + m->map_table_f[oc][ic] = 1.0; continue; } @@ -689,7 +701,7 @@ static void calc_map_table(pa_resampler *r) { */ if (a == b || a == PA_CHANNEL_POSITION_MONO || b == PA_CHANNEL_POSITION_MONO) { - r->map_table[oc][ic] = 1.0; + m->map_table_f[oc][ic] = 1.0; oc_connected = TRUE; ic_connected[ic] = TRUE; @@ -707,14 +719,14 @@ static void calc_map_table(pa_resampler *r) { /* We are not connected and on the left side, let's * average all left side input channels. */ - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_left(r->i_cm.map[ic])) n++; if (n > 0) - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_left(r->i_cm.map[ic])) { - r->map_table[oc][ic] = 1.0f / (float) n; + m->map_table_f[oc][ic] = 1.0f / (float) n; ic_connected[ic] = TRUE; } @@ -728,14 +740,14 @@ static void calc_map_table(pa_resampler *r) { /* We are not connected and on the right side, let's * average all right side input channels. */ - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_right(r->i_cm.map[ic])) n++; if (n > 0) - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_right(r->i_cm.map[ic])) { - r->map_table[oc][ic] = 1.0f / (float) n; + m->map_table_f[oc][ic] = 1.0f / (float) n; ic_connected[ic] = TRUE; } @@ -749,14 +761,14 @@ static void calc_map_table(pa_resampler *r) { /* We are not connected and at the center. Let's * average all center input channels. */ - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_center(r->i_cm.map[ic])) n++; if (n > 0) { - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_center(r->i_cm.map[ic])) { - r->map_table[oc][ic] = 1.0f / (float) n; + m->map_table_f[oc][ic] = 1.0f / (float) n; ic_connected[ic] = TRUE; } } else { @@ -766,14 +778,14 @@ static void calc_map_table(pa_resampler *r) { n = 0; - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_left(r->i_cm.map[ic]) || on_right(r->i_cm.map[ic])) n++; if (n > 0) - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) if (on_left(r->i_cm.map[ic]) || on_right(r->i_cm.map[ic])) { - r->map_table[oc][ic] = 1.0f / (float) n; + m->map_table_f[oc][ic] = 1.0f / (float) n; ic_connected[ic] = TRUE; } @@ -787,12 +799,12 @@ static void calc_map_table(pa_resampler *r) { /* We are not connected and an LFE. Let's average all * channels for LFE. */ - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (!(r->flags & PA_RESAMPLER_NO_LFE)) - r->map_table[oc][ic] = 1.0f / (float) r->i_ss.channels; + m->map_table_f[oc][ic] = 1.0f / (float) n_ic; else - r->map_table[oc][ic] = 0; + m->map_table_f[oc][ic] = 0; /* Please note that a channel connected to LFE * doesn't really count as connected. */ @@ -808,7 +820,7 @@ static void calc_map_table(pa_resampler *r) { ic_unconnected_center = 0, ic_unconnected_lfe = 0; - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { pa_channel_position_t a = r->i_cm.map[ic]; if (ic_connected[ic]) @@ -831,20 +843,20 @@ static void calc_map_table(pa_resampler *r) { * the left side by .9 and add in our averaged unconnected * channels multplied by .1 */ - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { if (!on_left(r->o_cm.map[oc])) continue; - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (ic_connected[ic]) { - r->map_table[oc][ic] *= .9f; + m->map_table_f[oc][ic] *= .9f; continue; } if (on_left(r->i_cm.map[ic])) - r->map_table[oc][ic] = .1f / (float) ic_unconnected_left; + m->map_table_f[oc][ic] = .1f / (float) ic_unconnected_left; } } } @@ -856,20 +868,20 @@ static void calc_map_table(pa_resampler *r) { * the right side by .9 and add in our averaged unconnected * channels multplied by .1 */ - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { if (!on_right(r->o_cm.map[oc])) continue; - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (ic_connected[ic]) { - r->map_table[oc][ic] *= .9f; + m->map_table_f[oc][ic] *= .9f; continue; } if (on_right(r->i_cm.map[ic])) - r->map_table[oc][ic] = .1f / (float) ic_unconnected_right; + m->map_table_f[oc][ic] = .1f / (float) ic_unconnected_right; } } } @@ -882,20 +894,20 @@ static void calc_map_table(pa_resampler *r) { * the center side by .9 and add in our averaged unconnected * channels multplied by .1 */ - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { if (!on_center(r->o_cm.map[oc])) continue; - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (ic_connected[ic]) { - r->map_table[oc][ic] *= .9f; + m->map_table_f[oc][ic] *= .9f; continue; } if (on_center(r->i_cm.map[ic])) { - r->map_table[oc][ic] = .1f / (float) ic_unconnected_center; + m->map_table_f[oc][ic] = .1f / (float) ic_unconnected_center; mixed_in = TRUE; } } @@ -913,7 +925,7 @@ static void calc_map_table(pa_resampler *r) { it into left and right. Using .375 and 0.75 as factors. */ - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (ic_connected[ic]) continue; @@ -921,7 +933,7 @@ static void calc_map_table(pa_resampler *r) { if (!on_center(r->i_cm.map[ic])) continue; - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { if (!on_left(r->o_cm.map[oc]) && !on_right(r->o_cm.map[oc])) continue; @@ -932,7 +944,7 @@ static void calc_map_table(pa_resampler *r) { } } - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { if (!on_left(r->o_cm.map[oc]) && !on_right(r->o_cm.map[oc])) continue; @@ -942,7 +954,7 @@ static void calc_map_table(pa_resampler *r) { } } - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { if (!on_left(r->o_cm.map[oc]) && !on_right(r->o_cm.map[oc])) continue; @@ -950,10 +962,10 @@ static void calc_map_table(pa_resampler *r) { if (ncenter[oc] <= 0) continue; - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (ic_connected[ic]) { - r->map_table[oc][ic] *= .75f; + m->map_table_f[oc][ic] *= .75f; continue; } @@ -961,7 +973,7 @@ static void calc_map_table(pa_resampler *r) { continue; if (!found_frs[ic] || front_rear_side(r->i_cm.map[ic]) == front_rear_side(r->o_cm.map[oc])) - r->map_table[oc][ic] = .375f / (float) ncenter[oc]; + m->map_table_f[oc][ic] = .375f / (float) ncenter[oc]; } } } @@ -972,40 +984,46 @@ static void calc_map_table(pa_resampler *r) { /* OK, so there is an unconnected LFE channel. Let's mix * it into all channels, with factor 0.375 */ - for (ic = 0; ic < r->i_ss.channels; ic++) { + for (ic = 0; ic < n_ic; ic++) { if (!on_lfe(r->i_cm.map[ic])) continue; - for (oc = 0; oc < r->o_ss.channels; oc++) - r->map_table[oc][ic] = 0.375f / (float) ic_unconnected_lfe; + for (oc = 0; oc < n_oc; oc++) + m->map_table_f[oc][ic] = 0.375f / (float) ic_unconnected_lfe; } } } - + /* make an 16:16 int version of the matrix */ + for (oc = 0; oc < n_oc; oc++) + for (ic = 0; ic < n_ic; ic++) + m->map_table_i[oc][ic] = (int32_t) (m->map_table_f[oc][ic] * 0x10000); s = pa_strbuf_new(); pa_strbuf_printf(s, " "); - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) pa_strbuf_printf(s, " I%02u ", ic); pa_strbuf_puts(s, "\n +"); - for (ic = 0; ic < r->i_ss.channels; ic++) + for (ic = 0; ic < n_ic; ic++) pa_strbuf_printf(s, "------"); pa_strbuf_puts(s, "\n"); - for (oc = 0; oc < r->o_ss.channels; oc++) { + for (oc = 0; oc < n_oc; oc++) { pa_strbuf_printf(s, "O%02u |", oc); - for (ic = 0; ic < r->i_ss.channels; ic++) - pa_strbuf_printf(s, " %1.3f", r->map_table[oc][ic]); + for (ic = 0; ic < n_ic; ic++) + pa_strbuf_printf(s, " %1.3f", m->map_table_f[oc][ic]); pa_strbuf_puts(s, "\n"); } pa_log_debug("Channel matrix:\n%s", t = pa_strbuf_tostring_free(s)); pa_xfree(t); + + /* initialize the remapping function */ + pa_init_remap (m); } static pa_memchunk* convert_to_work_format(pa_resampler *r, pa_memchunk *input) { @@ -1045,41 +1063,10 @@ static pa_memchunk* convert_to_work_format(pa_resampler *r, pa_memchunk *input) return &r->buf1; } -static void vectoradd_s16_with_fraction( - int16_t *d, int dstr, - const int16_t *s1, int sstr1, - const int16_t *s2, int sstr2, - int n, - float s3, float s4) { - - int32_t i3, i4; - - i3 = (int32_t) (s3 * 0x10000); - i4 = (int32_t) (s4 * 0x10000); - - for (; n > 0; n--) { - int32_t a, b; - - a = *s1; - b = *s2; - - a = (a * i3) / 0x10000; - b = (b * i4) / 0x10000; - - *d = (int16_t) (a + b); - - s1 = (const int16_t*) ((const uint8_t*) s1 + sstr1); - s2 = (const int16_t*) ((const uint8_t*) s2 + sstr2); - d = (int16_t*) ((uint8_t*) d + dstr); - - } -} - static pa_memchunk *remap_channels(pa_resampler *r, pa_memchunk *input) { unsigned in_n_samples, out_n_samples, n_frames; - int i_skip, o_skip; - unsigned oc; void *src, *dst; + pa_remap_t *remap; pa_assert(r); pa_assert(input); @@ -1108,76 +1095,14 @@ static pa_memchunk *remap_channels(pa_resampler *r, pa_memchunk *input) { src = ((uint8_t*) pa_memblock_acquire(input->memblock) + input->index); dst = pa_memblock_acquire(r->buf2.memblock); - memset(dst, 0, r->buf2.length); - - o_skip = (int) (r->w_sz * r->o_ss.channels); - i_skip = (int) (r->w_sz * r->i_ss.channels); - - switch (r->work_format) { - case PA_SAMPLE_FLOAT32NE: - - for (oc = 0; oc < r->o_ss.channels; oc++) { - unsigned ic; - static const float one = 1.0; - - for (ic = 0; ic < r->i_ss.channels; ic++) { + remap = &r->remap; - if (r->map_table[oc][ic] <= 0.0) - continue; - - oil_vectoradd_f32( - (float*) dst + oc, o_skip, - (float*) dst + oc, o_skip, - (float*) src + ic, i_skip, - (int) n_frames, - &one, &r->map_table[oc][ic]); - } - } - - break; - - case PA_SAMPLE_S16NE: - - for (oc = 0; oc < r->o_ss.channels; oc++) { - unsigned ic; - - for (ic = 0; ic < r->i_ss.channels; ic++) { - - if (r->map_table[oc][ic] <= 0.0) - continue; - - if (r->map_table[oc][ic] >= 1.0) { - static const int16_t one = 1; - - oil_vectoradd_s16( - (int16_t*) dst + oc, o_skip, - (int16_t*) dst + oc, o_skip, - (int16_t*) src + ic, i_skip, - (int) n_frames, - &one, &one); - - } else - - vectoradd_s16_with_fraction( - (int16_t*) dst + oc, o_skip, - (int16_t*) dst + oc, o_skip, - (int16_t*) src + ic, i_skip, - (int) n_frames, - 1.0f, r->map_table[oc][ic]); - } - } - - break; - - default: - pa_assert_not_reached(); - } + pa_assert (remap->do_remap); + remap->do_remap (remap, dst, src, n_frames); pa_memblock_release(input->memblock); pa_memblock_release(r->buf2.memblock); - r->buf2.length = out_n_samples * r->w_sz; - return &r->buf2; } @@ -1469,7 +1394,7 @@ static void trivial_resample(pa_resampler *r, const pa_memchunk *input, unsigned pa_assert(o_index * fz < pa_memblock_get_length(output->memblock)); - oil_memcpy((uint8_t*) dst + fz * o_index, + memcpy((uint8_t*) dst + fz * o_index, (uint8_t*) src + fz * j, (int) fz); } diff --git a/src/pulsecore/sample-util.c b/src/pulsecore/sample-util.c index 5b8ccf59..6e97e5a9 100644 --- a/src/pulsecore/sample-util.c +++ b/src/pulsecore/sample-util.c @@ -30,9 +30,6 @@ #include <stdio.h> #include <errno.h> -#include <liboil/liboilfuncs.h> -#include <liboil/liboil.h> - #include <pulse/timeval.h> #include <pulsecore/log.h> @@ -106,24 +103,36 @@ void* pa_silence_memory(void *p, size_t length, const pa_sample_spec *spec) { return p; } +#define VOLUME_PADDING 32 + static void calc_linear_integer_volume(int32_t linear[], const pa_cvolume *volume) { - unsigned channel; + unsigned channel, nchannels, padding; pa_assert(linear); pa_assert(volume); - for (channel = 0; channel < volume->channels; channel++) + nchannels = volume->channels; + + for (channel = 0; channel < nchannels; channel++) linear[channel] = (int32_t) lrint(pa_sw_volume_to_linear(volume->values[channel]) * 0x10000); + + for (padding = 0; padding < VOLUME_PADDING; padding++, channel++) + linear[channel] = linear[padding]; } static void calc_linear_float_volume(float linear[], const pa_cvolume *volume) { - unsigned channel; + unsigned channel, nchannels, padding; pa_assert(linear); pa_assert(volume); - for (channel = 0; channel < volume->channels; channel++) + nchannels = volume->channels; + + for (channel = 0; channel < nchannels; channel++) linear[channel] = (float) pa_sw_volume_to_linear(volume->values[channel]); + + for (padding = 0; padding < VOLUME_PADDING; padding++, channel++) + linear[channel] = linear[padding]; } static void calc_linear_integer_stream_volumes(pa_mix_info streams[], unsigned nstreams, const pa_cvolume *volume, const pa_sample_spec *spec) { @@ -690,6 +699,28 @@ size_t pa_mix( return length; } +typedef union { + float f; + uint32_t i; +} volume_val; + +typedef void (*pa_calc_volume_func_t) (void *volumes, const pa_cvolume *volume); + +static const pa_calc_volume_func_t calc_volume_table[] = { + [PA_SAMPLE_U8] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_ALAW] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_ULAW] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S16LE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S16BE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_FLOAT32LE] = (pa_calc_volume_func_t) calc_linear_float_volume, + [PA_SAMPLE_FLOAT32BE] = (pa_calc_volume_func_t) calc_linear_float_volume, + [PA_SAMPLE_S32LE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S32BE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S24LE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S24BE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S24_32LE] = (pa_calc_volume_func_t) calc_linear_integer_volume, + [PA_SAMPLE_S24_32BE] = (pa_calc_volume_func_t) calc_linear_integer_volume +}; void pa_volume_memchunk( pa_memchunk*c, @@ -697,6 +728,8 @@ void pa_volume_memchunk( const pa_cvolume *volume) { void *ptr; + volume_val linear[PA_CHANNELS_MAX + VOLUME_PADDING]; + pa_do_volume_func_t do_volume; pa_assert(c); pa_assert(spec); @@ -714,337 +747,19 @@ void pa_volume_memchunk( return; } - ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index; - - switch (spec->format) { - - case PA_SAMPLE_S16NE: { - int16_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (int16_t*) ptr + c->length/sizeof(int16_t); - - for (channel = 0, d = ptr; d < e; d++) { - int32_t t, hi, lo; - - /* Multiplying the 32bit volume factor with the 16bit - * sample might result in an 48bit value. We want to - * do without 64 bit integers and hence do the - * multiplication independantly for the HI and LO part - * of the volume. */ - - hi = linear[channel] >> 16; - lo = linear[channel] & 0xFFFF; - - t = (int32_t)(*d); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *d = (int16_t) t; - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - - break; - } - - case PA_SAMPLE_S16RE: { - int16_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (int16_t*) ptr + c->length/sizeof(int16_t); - - for (channel = 0, d = ptr; d < e; d++) { - int32_t t, hi, lo; - - hi = linear[channel] >> 16; - lo = linear[channel] & 0xFFFF; - - t = (int32_t) PA_INT16_SWAP(*d); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *d = PA_INT16_SWAP((int16_t) t); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - - break; - } - - case PA_SAMPLE_S32NE: { - int32_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (int32_t*) ptr + c->length/sizeof(int32_t); - - for (channel = 0, d = ptr; d < e; d++) { - int64_t t; - - t = (int64_t)(*d); - t = (t * linear[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *d = (int32_t) t; - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_S32RE: { - int32_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (int32_t*) ptr + c->length/sizeof(int32_t); - - for (channel = 0, d = ptr; d < e; d++) { - int64_t t; - - t = (int64_t) PA_INT32_SWAP(*d); - t = (t * linear[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *d = PA_INT32_SWAP((int32_t) t); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_S24NE: { - uint8_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint8_t*) ptr + c->length; - - for (channel = 0, d = ptr; d < e; d += 3) { - int64_t t; - - t = (int64_t)((int32_t) (PA_READ24NE(d) << 8)); - t = (t * linear[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - PA_WRITE24NE(d, ((uint32_t) (int32_t) t) >> 8); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_S24RE: { - uint8_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint8_t*) ptr + c->length; - - for (channel = 0, d = ptr; d < e; d += 3) { - int64_t t; - - t = (int64_t)((int32_t) (PA_READ24RE(d) << 8)); - t = (t * linear[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - PA_WRITE24RE(d, ((uint32_t) (int32_t) t) >> 8); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_S24_32NE: { - uint32_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint32_t*) ptr + c->length/sizeof(uint32_t); - - for (channel = 0, d = ptr; d < e; d++) { - int64_t t; - - t = (int64_t) ((int32_t) (*d << 8)); - t = (t * linear[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *d = ((uint32_t) ((int32_t) t)) >> 8; - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_S24_32RE: { - uint32_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint32_t*) ptr + c->length/sizeof(uint32_t); - - for (channel = 0, d = ptr; d < e; d++) { - int64_t t; - - t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*d) << 8)); - t = (t * linear[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *d = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_U8: { - uint8_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint8_t*) ptr + c->length; - - for (channel = 0, d = ptr; d < e; d++) { - int32_t t, hi, lo; - - hi = linear[channel] >> 16; - lo = linear[channel] & 0xFFFF; - - t = (int32_t) *d - 0x80; - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F); - *d = (uint8_t) (t + 0x80); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_ULAW: { - uint8_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint8_t*) ptr + c->length; - - for (channel = 0, d = ptr; d < e; d++) { - int32_t t, hi, lo; - - hi = linear[channel] >> 16; - lo = linear[channel] & 0xFFFF; - - t = (int32_t) st_ulaw2linear16(*d); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *d = (uint8_t) st_14linear2ulaw((int16_t) t >> 2); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_ALAW: { - uint8_t *d, *e; - unsigned channel; - int32_t linear[PA_CHANNELS_MAX]; - - calc_linear_integer_volume(linear, volume); - - e = (uint8_t*) ptr + c->length; - - for (channel = 0, d = ptr; d < e; d++) { - int32_t t, hi, lo; - - hi = linear[channel] >> 16; - lo = linear[channel] & 0xFFFF; - - t = (int32_t) st_alaw2linear16(*d); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *d = (uint8_t) st_13linear2alaw((int16_t) t >> 3); - - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - break; - } - - case PA_SAMPLE_FLOAT32NE: { - float *d; - int skip; - unsigned n; - unsigned channel; - - d = ptr; - skip = (int) (spec->channels * sizeof(float)); - n = (unsigned) (c->length/sizeof(float)/spec->channels); - - for (channel = 0; channel < spec->channels; channel ++) { - float v, *t; - - if (PA_UNLIKELY(volume->values[channel] == PA_VOLUME_NORM)) - continue; - - v = (float) pa_sw_volume_to_linear(volume->values[channel]); - t = d + channel; - oil_scalarmult_f32(t, skip, t, skip, &v, (int) n); - } - break; - } - - case PA_SAMPLE_FLOAT32RE: { - float *d, *e; - unsigned channel; - float linear[PA_CHANNELS_MAX]; - - calc_linear_float_volume(linear, volume); - - e = (float*) ptr + c->length/sizeof(float); - - for (channel = 0, d = ptr; d < e; d++) { - float t; + if (spec->format < 0 || spec->format > PA_SAMPLE_MAX) { + pa_log_warn(" Unable to change volume of format %s.", pa_sample_format_to_string(spec->format)); + return; + } - t = PA_FLOAT32_SWAP(*d); - t *= linear[channel]; - *d = PA_FLOAT32_SWAP(t); + do_volume = pa_get_volume_func (spec->format); + pa_assert(do_volume); - if (PA_UNLIKELY(++channel >= spec->channels)) - channel = 0; - } - - break; - } + calc_volume_table[spec->format] ((void *)linear, volume); + ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index; - default: - pa_log_warn(" Unable to change volume of format %s.", pa_sample_format_to_string(spec->format)); - /* If we cannot change the volume, we just don't do it */ - } + do_volume (ptr, (void *)linear, spec->channels, c->length); pa_memblock_release(c->memblock); } @@ -1090,7 +805,7 @@ void pa_interleave(const void *src[], unsigned channels, void *dst, size_t ss, u d = (uint8_t*) dst + c * ss; for (j = 0; j < n; j ++) { - oil_memcpy(d, s, (int) ss); + memcpy(d, s, (int) ss); s = (uint8_t*) s + ss; d = (uint8_t*) d + fs; } @@ -1118,7 +833,7 @@ void pa_deinterleave(const void *src, void *dst[], unsigned channels, size_t ss, d = dst[c]; for (j = 0; j < n; j ++) { - oil_memcpy(d, s, (int) ss); + memcpy(d, s, (int) ss); s = (uint8_t*) s + fs; d = (uint8_t*) d + ss; } @@ -1227,10 +942,15 @@ void pa_sample_clamp(pa_sample_format_t format, void *dst, size_t dstr, const vo s = src; d = dst; if (format == PA_SAMPLE_FLOAT32NE) { + for (; n > 0; n--) { + float f; - float minus_one = -1.0, plus_one = 1.0; - oil_clip_f32(d, (int) dstr, s, (int) sstr, (int) n, &minus_one, &plus_one); + f = *s; + *d = PA_CLAMP_UNLIKELY(f, -1.0f, 1.0f); + s = (const float*) ((const uint8_t*) s + sstr); + d = (float*) ((uint8_t*) d + dstr); + } } else { pa_assert(format == PA_SAMPLE_FLOAT32RE); diff --git a/src/pulsecore/sample-util.h b/src/pulsecore/sample-util.h index 6a306c11..34df5cf3 100644 --- a/src/pulsecore/sample-util.h +++ b/src/pulsecore/sample-util.h @@ -86,6 +86,11 @@ void pa_memchunk_dump_to_file(pa_memchunk *c, const char *fn); void pa_memchunk_sine(pa_memchunk *c, pa_mempool *pool, unsigned rate, unsigned freq); +typedef void (*pa_do_volume_func_t) (void *samples, void *volumes, unsigned channels, unsigned length); + +pa_do_volume_func_t pa_get_volume_func(pa_sample_format_t f); +void pa_set_volume_func(pa_sample_format_t f, pa_do_volume_func_t func); + #define PA_CHANNEL_POSITION_MASK_LEFT \ (PA_CHANNEL_POSITION_MASK(PA_CHANNEL_POSITION_FRONT_LEFT) \ | PA_CHANNEL_POSITION_MASK(PA_CHANNEL_POSITION_REAR_LEFT) \ diff --git a/src/pulsecore/sconv-s16le.c b/src/pulsecore/sconv-s16le.c index 43b8cb3e..0fefdf1c 100644 --- a/src/pulsecore/sconv-s16le.c +++ b/src/pulsecore/sconv-s16le.c @@ -28,8 +28,6 @@ #include <inttypes.h> #include <stdio.h> -#include <liboil/liboilfuncs.h> - #include <pulsecore/sconv.h> #include <pulsecore/macro.h> #include <pulsecore/log.h> @@ -86,17 +84,13 @@ void pa_sconv_s16le_to_float32ne(unsigned n, const int16_t *a, float *b) { pa_assert(b); #if SWAP_WORDS == 1 - for (; n > 0; n--) { int16_t s = *(a++); *(b++) = ((float) INT16_FROM(s))/(float) 0x7FFF; } - #else -{ - static const double add = 0, factor = 1.0/0x7FFF; - oil_scaleconv_f32_s16(b, a, (int) n, &add, &factor); -} + for (; n > 0; n--) + *(b++) = ((float) (*(a++)))/(float) 0x7FFF; #endif } @@ -105,17 +99,13 @@ void pa_sconv_s32le_to_float32ne(unsigned n, const int32_t *a, float *b) { pa_assert(b); #if SWAP_WORDS == 1 - for (; n > 0; n--) { int32_t s = *(a++); *(b++) = (float) (((double) INT32_FROM(s))/0x7FFFFFFF); } - #else -{ - static const double add = 0, factor = 1.0/0x7FFFFFFF; - oil_scaleconv_f32_s32(b, a, (int) n, &add, &factor); -} + for (; n > 0; n--) + *(b++) = (float) (((double) (*(a++)))/0x7FFFFFFF); #endif } @@ -124,7 +114,6 @@ void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { pa_assert(b); #if SWAP_WORDS == 1 - for (; n > 0; n--) { int16_t s; float v = *(a++); @@ -133,12 +122,13 @@ void pa_sconv_s16le_from_float32ne(unsigned n, const float *a, int16_t *b) { s = (int16_t) lrintf(v * 0x7FFF); *(b++) = INT16_TO(s); } - #else -{ - static const double add = 0, factor = 0x7FFF; - oil_scaleconv_s16_f32(b, a, (int) n, &add, &factor); -} + for (; n > 0; n--) { + float v = *(a++); + + v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.f); + *(b++) = (int16_t) lrintf(v * 0x7FFF); + } #endif } @@ -147,7 +137,6 @@ void pa_sconv_s32le_from_float32ne(unsigned n, const float *a, int32_t *b) { pa_assert(b); #if SWAP_WORDS == 1 - for (; n > 0; n--) { int32_t s; float v = *(a++); @@ -156,12 +145,13 @@ void pa_sconv_s32le_from_float32ne(unsigned n, const float *a, int32_t *b) { s = (int32_t) lrint((double) v * (double) 0x7FFFFFFF); *(b++) = INT32_TO(s); } - #else -{ - static const double add = 0, factor = 0x7FFFFFFF; - oil_scaleconv_s32_f32(b, a, (int) n, &add, &factor); -} + for (; n > 0; n--) { + float v = *(a++); + + v = PA_CLAMP_UNLIKELY(v, -1.0f, 1.0f); + *(b++) = (int32_t) lrint((double) v * (double) 0x7FFFFFFF); + } #endif } diff --git a/src/pulsecore/sconv.c b/src/pulsecore/sconv.c index d89f4283..d06d6985 100644 --- a/src/pulsecore/sconv.c +++ b/src/pulsecore/sconv.c @@ -27,9 +27,6 @@ #include <stdio.h> #include <stdlib.h> -#include <liboil/liboilfuncs.h> -#include <liboil/liboil.h> - #include <pulsecore/g711.h> #include <pulsecore/macro.h> @@ -41,32 +38,31 @@ /* u8 */ static void u8_to_float32ne(unsigned n, const uint8_t *a, float *b) { - static const double add = -1, factor = 1.0/128.0; - pa_assert(a); pa_assert(b); - oil_scaleconv_f32_u8(b, a, (int) n, &add, &factor); + for (; n > 0; n--, a++, b++) + *b = (*a * 1.0/128.0) - 1.0; } static void u8_from_float32ne(unsigned n, const float *a, uint8_t *b) { - static const double add = 128, factor = 127.0; - pa_assert(a); pa_assert(b); - oil_scaleconv_u8_f32(b, a, (int) n, &add, &factor); + for (; n > 0; n--, a++, b++) { + float v; + v = (*a * 127.0) + 128.0; + v = PA_CLAMP_UNLIKELY (v, 0.0, 255.0); + *b = rint (v); + } } static void u8_to_s16ne(unsigned n, const uint8_t *a, int16_t *b) { - static const int16_t add = -0x80, factor = 0x100; - pa_assert(a); pa_assert(b); - oil_conv_s16_u8(b, 2, a, 1, (int) n); - oil_scalaradd_s16(b, 2, b, 2, &add, (int) n); - oil_scalarmult_s16(b, 2, b, 2, &factor, (int) n); + for (; n > 0; n--, a++, b++) + *b = (((int16_t)*a) - 128) << 8; } static void u8_from_s16ne(unsigned n, const int16_t *a, uint8_t *b) { @@ -84,7 +80,7 @@ static void float32ne_to_float32ne(unsigned n, const float *a, float *b) { pa_assert(a); pa_assert(b); - oil_memcpy(b, a, (int) (sizeof(float) * n)); + memcpy(b, a, (int) (sizeof(float) * n)); } static void float32re_to_float32ne(unsigned n, const float *a, float *b) { @@ -101,7 +97,7 @@ static void s16ne_to_s16ne(unsigned n, const int16_t *a, int16_t *b) { pa_assert(a); pa_assert(b); - oil_memcpy(b, a, (int) (sizeof(int16_t) * n)); + memcpy(b, a, (int) (sizeof(int16_t) * n)); } static void s16re_to_s16ne(unsigned n, const int16_t *a, int16_t *b) { @@ -188,98 +184,130 @@ static void alaw_from_s16ne(unsigned n, const int16_t *a, uint8_t *b) { *b = st_13linear2alaw(*a >> 3); } +static pa_convert_func_t to_float32ne_table[] = { + [PA_SAMPLE_U8] = (pa_convert_func_t) u8_to_float32ne, + [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_to_float32ne, + [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_to_float32ne, + [PA_SAMPLE_S16LE] = (pa_convert_func_t) pa_sconv_s16le_to_float32ne, + [PA_SAMPLE_S16BE] = (pa_convert_func_t) pa_sconv_s16be_to_float32ne, + [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_to_float32ne, + [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_to_float32ne, + [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_to_float32ne, + [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_to_float32ne, + [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_to_float32ne, + [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_to_float32ne, + [PA_SAMPLE_FLOAT32NE] = (pa_convert_func_t) float32ne_to_float32ne, + [PA_SAMPLE_FLOAT32RE] = (pa_convert_func_t) float32re_to_float32ne, +}; + pa_convert_func_t pa_get_convert_to_float32ne_function(pa_sample_format_t f) { - static const pa_convert_func_t table[] = { - [PA_SAMPLE_U8] = (pa_convert_func_t) u8_to_float32ne, - [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_to_float32ne, - [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_to_float32ne, - [PA_SAMPLE_S16LE] = (pa_convert_func_t) pa_sconv_s16le_to_float32ne, - [PA_SAMPLE_S16BE] = (pa_convert_func_t) pa_sconv_s16be_to_float32ne, - [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_to_float32ne, - [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_to_float32ne, - [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_to_float32ne, - [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_to_float32ne, - [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_to_float32ne, - [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_to_float32ne, - [PA_SAMPLE_FLOAT32NE] = (pa_convert_func_t) float32ne_to_float32ne, - [PA_SAMPLE_FLOAT32RE] = (pa_convert_func_t) float32re_to_float32ne, - }; + pa_assert(f >= 0); + pa_assert(f < PA_SAMPLE_MAX); + + return to_float32ne_table[f]; +} + +void pa_set_convert_to_float32ne_function(pa_sample_format_t f, pa_convert_func_t func) { pa_assert(f >= 0); pa_assert(f < PA_SAMPLE_MAX); - return table[f]; + to_float32ne_table[f] = func; } +static pa_convert_func_t from_float32ne_table[] = { + [PA_SAMPLE_U8] = (pa_convert_func_t) u8_from_float32ne, + [PA_SAMPLE_S16LE] = (pa_convert_func_t) pa_sconv_s16le_from_float32ne, + [PA_SAMPLE_S16BE] = (pa_convert_func_t) pa_sconv_s16be_from_float32ne, + [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_from_float32ne, + [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_from_float32ne, + [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_from_float32ne, + [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_from_float32ne, + [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_from_float32ne, + [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_from_float32ne, + [PA_SAMPLE_FLOAT32NE] = (pa_convert_func_t) float32ne_to_float32ne, + [PA_SAMPLE_FLOAT32RE] = (pa_convert_func_t) float32re_to_float32ne, + [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_from_float32ne, + [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_from_float32ne +}; + pa_convert_func_t pa_get_convert_from_float32ne_function(pa_sample_format_t f) { - static const pa_convert_func_t table[] = { - [PA_SAMPLE_U8] = (pa_convert_func_t) u8_from_float32ne, - [PA_SAMPLE_S16LE] = (pa_convert_func_t) pa_sconv_s16le_from_float32ne, - [PA_SAMPLE_S16BE] = (pa_convert_func_t) pa_sconv_s16be_from_float32ne, - [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_from_float32ne, - [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_from_float32ne, - [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_from_float32ne, - [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_from_float32ne, - [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_from_float32ne, - [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_from_float32ne, - [PA_SAMPLE_FLOAT32NE] = (pa_convert_func_t) float32ne_to_float32ne, - [PA_SAMPLE_FLOAT32RE] = (pa_convert_func_t) float32re_to_float32ne, - [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_from_float32ne, - [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_from_float32ne - }; + pa_assert(f >= 0); + pa_assert(f < PA_SAMPLE_MAX); + + return from_float32ne_table[f]; +} + +void pa_set_convert_from_float32ne_function(pa_sample_format_t f, pa_convert_func_t func) { pa_assert(f >= 0); pa_assert(f < PA_SAMPLE_MAX); - return table[f]; + from_float32ne_table[f] = func; } +static pa_convert_func_t to_s16ne_table[] = { + [PA_SAMPLE_U8] = (pa_convert_func_t) u8_to_s16ne, + [PA_SAMPLE_S16NE] = (pa_convert_func_t) s16ne_to_s16ne, + [PA_SAMPLE_S16RE] = (pa_convert_func_t) s16re_to_s16ne, + [PA_SAMPLE_FLOAT32BE] = (pa_convert_func_t) pa_sconv_float32be_to_s16ne, + [PA_SAMPLE_FLOAT32LE] = (pa_convert_func_t) pa_sconv_float32le_to_s16ne, + [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_to_s16ne, + [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_to_s16ne, + [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_to_s16ne, + [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_to_s16ne, + [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_to_s16ne, + [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_to_s16ne, + [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_to_s16ne, + [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_to_s16ne +}; + pa_convert_func_t pa_get_convert_to_s16ne_function(pa_sample_format_t f) { - static const pa_convert_func_t table[] = { - [PA_SAMPLE_U8] = (pa_convert_func_t) u8_to_s16ne, - [PA_SAMPLE_S16NE] = (pa_convert_func_t) s16ne_to_s16ne, - [PA_SAMPLE_S16RE] = (pa_convert_func_t) s16re_to_s16ne, - [PA_SAMPLE_FLOAT32BE] = (pa_convert_func_t) pa_sconv_float32be_to_s16ne, - [PA_SAMPLE_FLOAT32LE] = (pa_convert_func_t) pa_sconv_float32le_to_s16ne, - [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_to_s16ne, - [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_to_s16ne, - [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_to_s16ne, - [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_to_s16ne, - [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_to_s16ne, - [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_to_s16ne, - [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_to_s16ne, - [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_to_s16ne - }; + pa_assert(f >= 0); + pa_assert(f < PA_SAMPLE_MAX); + + return to_s16ne_table[f]; +} + +void pa_set_convert_to_s16ne_function(pa_sample_format_t f, pa_convert_func_t func) { pa_assert(f >= 0); pa_assert(f < PA_SAMPLE_MAX); - return table[f]; + to_s16ne_table[f] = func; } +static pa_convert_func_t from_s16ne_table[] = { + [PA_SAMPLE_U8] = (pa_convert_func_t) u8_from_s16ne, + [PA_SAMPLE_S16NE] = (pa_convert_func_t) s16ne_to_s16ne, + [PA_SAMPLE_S16RE] = (pa_convert_func_t) s16re_to_s16ne, + [PA_SAMPLE_FLOAT32BE] = (pa_convert_func_t) pa_sconv_float32be_from_s16ne, + [PA_SAMPLE_FLOAT32LE] = (pa_convert_func_t) pa_sconv_float32le_from_s16ne, + [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_from_s16ne, + [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_from_s16ne, + [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_from_s16ne, + [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_from_s16ne, + [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_from_s16ne, + [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_from_s16ne, + [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_from_s16ne, + [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_from_s16ne, +}; + pa_convert_func_t pa_get_convert_from_s16ne_function(pa_sample_format_t f) { - static const pa_convert_func_t table[] = { - [PA_SAMPLE_U8] = (pa_convert_func_t) u8_from_s16ne, - [PA_SAMPLE_S16NE] = (pa_convert_func_t) s16ne_to_s16ne, - [PA_SAMPLE_S16RE] = (pa_convert_func_t) s16re_to_s16ne, - [PA_SAMPLE_FLOAT32BE] = (pa_convert_func_t) pa_sconv_float32be_from_s16ne, - [PA_SAMPLE_FLOAT32LE] = (pa_convert_func_t) pa_sconv_float32le_from_s16ne, - [PA_SAMPLE_S32BE] = (pa_convert_func_t) pa_sconv_s32be_from_s16ne, - [PA_SAMPLE_S32LE] = (pa_convert_func_t) pa_sconv_s32le_from_s16ne, - [PA_SAMPLE_S24BE] = (pa_convert_func_t) pa_sconv_s24be_from_s16ne, - [PA_SAMPLE_S24LE] = (pa_convert_func_t) pa_sconv_s24le_from_s16ne, - [PA_SAMPLE_S24_32BE] = (pa_convert_func_t) pa_sconv_s24_32be_from_s16ne, - [PA_SAMPLE_S24_32LE] = (pa_convert_func_t) pa_sconv_s24_32le_from_s16ne, - [PA_SAMPLE_ALAW] = (pa_convert_func_t) alaw_from_s16ne, - [PA_SAMPLE_ULAW] = (pa_convert_func_t) ulaw_from_s16ne, - }; + pa_assert(f >= 0); + pa_assert(f < PA_SAMPLE_MAX); + + return from_s16ne_table[f]; +} + +void pa_set_convert_from_s16ne_function(pa_sample_format_t f, pa_convert_func_t func) { pa_assert(f >= 0); pa_assert(f < PA_SAMPLE_MAX); - return table[f]; + from_s16ne_table[f] = func; } diff --git a/src/pulsecore/sconv.h b/src/pulsecore/sconv.h index b00a16a4..cd937559 100644 --- a/src/pulsecore/sconv.h +++ b/src/pulsecore/sconv.h @@ -33,4 +33,10 @@ pa_convert_func_t pa_get_convert_from_float32ne_function(pa_sample_format_t f) P pa_convert_func_t pa_get_convert_to_s16ne_function(pa_sample_format_t f) PA_GCC_PURE; pa_convert_func_t pa_get_convert_from_s16ne_function(pa_sample_format_t f) PA_GCC_PURE; +void pa_set_convert_to_float32ne_function(pa_sample_format_t f, pa_convert_func_t func); +void pa_set_convert_from_float32ne_function(pa_sample_format_t f, pa_convert_func_t func); + +void pa_set_convert_to_s16ne_function(pa_sample_format_t f, pa_convert_func_t func); +void pa_set_convert_from_s16ne_function(pa_sample_format_t f, pa_convert_func_t func); + #endif diff --git a/src/pulsecore/svolume_arm.c b/src/pulsecore/svolume_arm.c new file mode 100644 index 00000000..5bd1448f --- /dev/null +++ b/src/pulsecore/svolume_arm.c @@ -0,0 +1,195 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <pulse/timeval.h> +#include <pulsecore/random.h> +#include <pulsecore/macro.h> +#include <pulsecore/g711.h> +#include <pulsecore/core-util.h> + +#include "cpu-arm.h" + +#include "sample-util.h" +#include "endianmacros.h" + +#if defined (__arm__) + +#define MOD_INC() \ + " subs r0, r6, %2 \n\t" \ + " addcs r0, %1 \n\t" \ + " movcs r6, r0 \n\t" + +static void +pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + int32_t *ve; + + channels = PA_MAX (4U, channels); + ve = volumes + channels; + + __asm__ __volatile__ ( + " mov r6, %1 \n\t" + " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ + " tst %3, #1 \n\t" /* check for odd samples */ + " beq 2f \n\t" + + "1: \n\t" + " ldr r0, [r6], #4 \n\t" /* odd samples volumes */ + " ldrh r2, [%0] \n\t" + + " smulwb r0, r0, r2 \n\t" + " ssat r0, #16, r0 \n\t" + + " strh r0, [%0], #2 \n\t" + + MOD_INC() + + "2: \n\t" + " mov %3, %3, LSR #1 \n\t" + " tst %3, #1 \n\t" /* check for odd samples */ + " beq 4f \n\t" + + "3: \n\t" + " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */ + " ldr r0, [%0] \n\t" + + " smulwt r2, r2, r0 \n\t" + " smulwb r3, r3, r0 \n\t" + + " ssat r2, #16, r2 \n\t" + " ssat r3, #16, r3 \n\t" + + " pkhbt r0, r3, r2, LSL #16 \n\t" + " str r0, [%0], #4 \n\t" + + MOD_INC() + + "4: \n\t" + " movs %3, %3, LSR #1 \n\t" + " beq 6f \n\t" + + "5: \n\t" + " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */ + " ldrd r4, [r6], #8 \n\t" + " ldrd r0, [%0] \n\t" + + " smulwt r2, r2, r0 \n\t" + " smulwb r3, r3, r0 \n\t" + " smulwt r4, r4, r1 \n\t" + " smulwb r5, r5, r1 \n\t" + + " ssat r2, #16, r2 \n\t" + " ssat r3, #16, r3 \n\t" + " ssat r4, #16, r4 \n\t" + " ssat r5, #16, r5 \n\t" + + " pkhbt r0, r3, r2, LSL #16 \n\t" + " pkhbt r1, r5, r4, LSL #16 \n\t" + " strd r0, [%0], #8 \n\t" + + MOD_INC() + + " subs %3, %3, #1 \n\t" + " bne 5b \n\t" + "6: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length) + : + : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" + ); +} + +#undef RUN_TEST + +#ifdef RUN_TEST +#define CHANNELS 2 +#define SAMPLES 1023 +#define TIMES 1000 +#define PADDING 16 + +static void run_test (void) { + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + int i, j, padding; + pa_do_volume_func_t func; + pa_usec_t start, stop; + + func = pa_get_volume_func (PA_SAMPLE_S16NE); + + printf ("checking ARM %zd\n", sizeof (samples)); + + pa_random (samples, sizeof (samples)); + memcpy (samples_ref, samples, sizeof (samples)); + memcpy (samples_orig, samples, sizeof (samples)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = rand() >> 1; + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; + + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); + for (i = 0; i < SAMPLES; i++) { + if (samples[i] != samples_ref[i]) { + printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], + samples_orig[i], volumes[i % CHANNELS]); + } + } + + start = pa_rtclock_now(); + for (j = 0; j < TIMES; j++) { + memcpy (samples, samples_orig, sizeof (samples)); + pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); + } + stop = pa_rtclock_now(); + pa_log_info("ARM: %llu usec.", (long long unsigned int) (stop - start)); + + start = pa_rtclock_now(); + for (j = 0; j < TIMES; j++) { + memcpy (samples_ref, samples_orig, sizeof (samples)); + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int) (stop - start)); +} +#endif + +#endif /* defined (__arm__) */ + + +void pa_volume_func_init_arm (pa_cpu_arm_flag_t flags) { +#if defined (__arm__) + pa_log_info("Initialising ARM optimized functions."); + +#ifdef RUN_TEST + run_test (); +#endif + + pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm); +#endif /* defined (__arm__) */ +} diff --git a/src/pulsecore/svolume_c.c b/src/pulsecore/svolume_c.c new file mode 100644 index 00000000..5fc052b8 --- /dev/null +++ b/src/pulsecore/svolume_c.c @@ -0,0 +1,335 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2006 Pierre Ossman <ossman@cendio.se> for Cendio AB + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + + +#include <pulsecore/macro.h> +#include <pulsecore/g711.h> +#include <pulsecore/core-util.h> + +#include "sample-util.h" +#include "endianmacros.h" + +static void +pa_volume_u8_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + for (channel = 0; length; length--) { + int32_t t, hi, lo; + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t) *samples - 0x80; + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F); + *samples++ = (uint8_t) (t + 0x80); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_alaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + for (channel = 0; length; length--) { + int32_t t, hi, lo; + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t) st_alaw2linear16(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + for (channel = 0; length; length--) { + int32_t t, hi, lo; + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t) st_ulaw2linear16(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (int16_t); + + for (channel = 0; length; length--) { + int32_t t, hi, lo; + + /* Multiplying the 32bit volume factor with the 16bit + * sample might result in an 48bit value. We want to + * do without 64 bit integers and hence do the + * multiplication independantly for the HI and LO part + * of the volume. */ + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t)(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (int16_t) t; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s16re_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (int16_t); + + for (channel = 0; length; length--) { + int32_t t, hi, lo; + + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; + + t = (int32_t) PA_INT16_SWAP(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = PA_INT16_SWAP((int16_t) t); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_float32ne_c (float *samples, float *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (float); + + for (channel = 0; length; length--) { + *samples++ *= volumes[channel]; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_float32re_c (float *samples, float *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (float); + + for (channel = 0; length; length--) { + float t; + + t = PA_FLOAT32_SWAP(*samples); + t *= volumes[channel]; + *samples++ = PA_FLOAT32_SWAP(t); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s32ne_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (int32_t); + + for (channel = 0; length; length--) { + int64_t t; + + t = (int64_t)(*samples); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = (int32_t) t; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s32re_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (int32_t); + + for (channel = 0; length; length--) { + int64_t t; + + t = (int64_t) PA_INT32_SWAP(*samples); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = PA_INT32_SWAP((int32_t) t); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s24ne_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + uint8_t *e; + + e = samples + length; + + for (channel = 0; samples < e; samples += 3) { + int64_t t; + + t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s24re_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + uint8_t *e; + + e = samples + length; + + for (channel = 0; samples < e; samples += 3) { + int64_t t; + + t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s24_32ne_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (uint32_t); + + for (channel = 0; length; length--) { + int64_t t; + + t = (int64_t) ((int32_t) (*samples << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = ((uint32_t) ((int32_t) t)) >> 8; + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static void +pa_volume_s24_32re_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + unsigned channel; + + length /= sizeof (uint32_t); + + for (channel = 0; length; length--) { + int64_t t; + + t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8); + + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } +} + +static pa_do_volume_func_t do_volume_table[] = +{ + [PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c, + [PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c, + [PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c, + [PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c, + [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c, + [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c, + [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c, + [PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c, + [PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c, + [PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c, + [PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c, + [PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c, + [PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c +}; + +pa_do_volume_func_t pa_get_volume_func(pa_sample_format_t f) { + pa_assert(f >= 0); + pa_assert(f < PA_SAMPLE_MAX); + + return do_volume_table[f]; +} + +void pa_set_volume_func(pa_sample_format_t f, pa_do_volume_func_t func) { + pa_assert(f >= 0); + pa_assert(f < PA_SAMPLE_MAX); + + do_volume_table[f] = func; +} diff --git a/src/pulsecore/svolume_mmx.c b/src/pulsecore/svolume_mmx.c new file mode 100644 index 00000000..8510b0c4 --- /dev/null +++ b/src/pulsecore/svolume_mmx.c @@ -0,0 +1,313 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <pulse/timeval.h> +#include <pulsecore/random.h> +#include <pulsecore/macro.h> +#include <pulsecore/g711.h> +#include <pulsecore/core-util.h> + +#include "cpu-x86.h" + +#include "sample-util.h" +#include "endianmacros.h" + +#if defined (__i386__) || defined (__amd64__) +/* in s: 2 int16_t samples + * in v: 2 int32_t volumes, fixed point 16:16 + * out s: contains scaled and clamped int16_t samples. + * + * We calculate the high 32 bits of a 32x16 multiply which we then + * clamp to 16 bits. The calulcation is: + * + * vl = (v & 0xffff) + * vh = (v >> 16) + * s = ((s * vl) >> 16) + (s * vh); + * + * For the first multiply we have to do a sign correction as we need to + * multiply a signed int with an unsigned int. Hacker's delight 8-3 gives a + * simple formula to correct the sign of the high word after the signed + * multiply. + */ +#define VOLUME_32x16(s,v) /* .. | vh | vl | */ \ + " pxor %%mm4, %%mm4 \n\t" /* .. | 0 | 0 | */ \ + " punpcklwd %%mm4, "#s" \n\t" /* .. | 0 | p0 | */ \ + " pcmpgtw "#v", %%mm4 \n\t" /* .. | 0 | s(vl) | */ \ + " pand "#s", %%mm4 \n\t" /* .. | 0 | (p0) | (vl >> 15) & p */ \ + " movq %%mm6, %%mm5 \n\t" /* .. | ffff | 0 | */ \ + " pand "#v", %%mm5 \n\t" /* .. | vh | 0 | */ \ + " por %%mm5, %%mm4 \n\t" /* .. | vh | (p0) | */ \ + " pmulhw "#s", "#v" \n\t" /* .. | 0 | vl*p0 | */ \ + " paddw %%mm4, "#v" \n\t" /* .. | vh | vl*p0 | vh + sign correct */ \ + " pslld $16, "#s" \n\t" /* .. | p0 | 0 | */ \ + " por %%mm7, "#s" \n\t" /* .. | p0 | 1 | */ \ + " pmaddwd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \ + " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */ + +/* approximately advances %3 = (%3 + a) % b. This function requires that + * a <= b. */ +#define MOD_ADD(a,b) \ + " add "#a", %3 \n\t" \ + " mov %3, %4 \n\t" \ + " sub "#b", %4 \n\t" \ + " cmovae %4, %3 \n\t" + +/* swap 16 bits */ +#define SWAP_16(s) \ + " movq "#s", %%mm4 \n\t" /* .. | h l | */ \ + " psrlw $8, %%mm4 \n\t" /* .. | 0 h | */ \ + " psllw $8, "#s" \n\t" /* .. | l 0 | */ \ + " por %%mm4, "#s" \n\t" /* .. | l h | */ + +/* swap 2 registers 16 bits for better pairing */ +#define SWAP_16_2(s1,s2) \ + " movq "#s1", %%mm4 \n\t" /* .. | h l | */ \ + " movq "#s2", %%mm5 \n\t" \ + " psrlw $8, %%mm4 \n\t" /* .. | 0 h | */ \ + " psrlw $8, %%mm5 \n\t" \ + " psllw $8, "#s1" \n\t" /* .. | l 0 | */ \ + " psllw $8, "#s2" \n\t" \ + " por %%mm4, "#s1" \n\t" /* .. | l h | */ \ + " por %%mm5, "#s2" \n\t" + +static void +pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = PA_MAX (4U, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ + " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ + " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ + " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " movd %4, %%mm1 \n\t" + VOLUME_32x16 (%%mm1, %%mm0) + " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" /* check for odd samples */ + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + VOLUME_32x16 (%%mm1, %%mm0) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " cmp $0, %2 \n\t" + " je 6f \n\t" + + "5: \n\t" /* do samples in groups of 4 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ + VOLUME_32x16 (%%mm1, %%mm0) + VOLUME_32x16 (%%mm3, %%mm2) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + " dec %2 \n\t" + " jne 5b \n\t" + + "6: \n\t" + " emms \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); +} + +static void +pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = PA_MAX (4U, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ + " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ + " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ + " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " rorw $8, %w4 \n\t" + " movd %4, %%mm1 \n\t" + VOLUME_32x16 (%%mm1, %%mm0) + " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ + " rorw $8, %w4 \n\t" + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" /* check for odd samples */ + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + SWAP_16 (%%mm1) + VOLUME_32x16 (%%mm1, %%mm0) + SWAP_16 (%%mm0) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " cmp $0, %2 \n\t" + " je 6f \n\t" + + "5: \n\t" /* do samples in groups of 4 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ + SWAP_16_2 (%%mm1, %%mm3) + VOLUME_32x16 (%%mm1, %%mm0) + VOLUME_32x16 (%%mm3, %%mm2) + SWAP_16_2 (%%mm0, %%mm2) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + " dec %2 \n\t" + " jne 5b \n\t" + + "6: \n\t" + " emms \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); +} + +#undef RUN_TEST + +#ifdef RUN_TEST +#define CHANNELS 2 +#define SAMPLES 1021 +#define TIMES 1000 +#define PADDING 16 + +static void run_test (void) { + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + int i, j, padding; + pa_do_volume_func_t func; + pa_usec_t start, stop; + + func = pa_get_volume_func (PA_SAMPLE_S16NE); + + printf ("checking MMX %zd\n", sizeof (samples)); + + pa_random (samples, sizeof (samples)); + memcpy (samples_ref, samples, sizeof (samples)); + memcpy (samples_orig, samples, sizeof (samples)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = rand() >> 1; + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; + + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); + for (i = 0; i < SAMPLES; i++) { + if (samples[i] != samples_ref[i]) { + printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], + samples_orig[i], volumes[i % CHANNELS]); + } + } + + start = pa_rtclock_now(); + for (j = 0; j < TIMES; j++) { + memcpy (samples, samples_orig, sizeof (samples)); + pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); + } + stop = pa_rtclock_now(); + pa_log_info("MMX: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (j = 0; j < TIMES; j++) { + memcpy (samples_ref, samples_orig, sizeof (samples)); + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} +#endif + +#endif /* defined (__i386__) || defined (__amd64__) */ + + +void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) { +#if defined (__i386__) || defined (__amd64__) + pa_log_info("Initialising MMX optimized functions."); + +#ifdef RUN_TEST + run_test (); +#endif + + pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx); + pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx); +#endif /* defined (__i386__) || defined (__amd64__) */ +} diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c new file mode 100644 index 00000000..54af4a57 --- /dev/null +++ b/src/pulsecore/svolume_sse.c @@ -0,0 +1,314 @@ +/*** + This file is part of PulseAudio. + + Copyright 2004-2006 Lennart Poettering + Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk> + + PulseAudio is free software; you can redistribute it and/or modify + it under the terms of the GNU Lesser General Public License as published + by the Free Software Foundation; either version 2.1 of the License, + or (at your option) any later version. + + PulseAudio is distributed in the hope that it will be useful, but + WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + General Public License for more details. + + You should have received a copy of the GNU Lesser General Public License + along with PulseAudio; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + USA. +***/ + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <pulse/timeval.h> +#include <pulsecore/random.h> +#include <pulsecore/macro.h> +#include <pulsecore/g711.h> +#include <pulsecore/core-util.h> + +#include "cpu-x86.h" + +#include "sample-util.h" +#include "endianmacros.h" + +#if defined (__i386__) || defined (__amd64__) + +#define VOLUME_32x16(s,v) /* .. | vh | vl | */ \ + " pxor %%xmm4, %%xmm4 \n\t" /* .. | 0 | 0 | */ \ + " punpcklwd %%xmm4, "#s" \n\t" /* .. | 0 | p0 | */ \ + " pcmpgtw "#s", %%xmm4 \n\t" /* .. | 0 | s(p0) | */ \ + " pand "#v", %%xmm4 \n\t" /* .. | 0 | (vl) | */ \ + " movdqa "#s", %%xmm5 \n\t" \ + " pmulhuw "#v", "#s" \n\t" /* .. | 0 | vl*p0 | */ \ + " psubd %%xmm4, "#s" \n\t" /* .. | 0 | vl*p0 | + sign correct */ \ + " psrld $16, "#v" \n\t" /* .. | p0 | 0 | */ \ + " pmaddwd %%xmm5, "#v" \n\t" /* .. | p0 * vh | */ \ + " paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \ + " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */ + +#define MOD_ADD(a,b) \ + " add "#a", %3 \n\t" /* channel += inc */ \ + " mov %3, %4 \n\t" \ + " sub "#b", %4 \n\t" /* tmp = channel - channels */ \ + " cmovae %4, %3 \n\t" /* if (tmp >= 0) channel = tmp */ + +/* swap 16 bits */ +#define SWAP_16(s) \ + " movdqa "#s", %%xmm4 \n\t" /* .. | h l | */ \ + " psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \ + " psllw $8, "#s" \n\t" /* .. | l 0 | */ \ + " por %%xmm4, "#s" \n\t" /* .. | l h | */ + +/* swap 2 registers 16 bits for better pairing */ +#define SWAP_16_2(s1,s2) \ + " movdqa "#s1", %%xmm4 \n\t" /* .. | h l | */ \ + " movdqa "#s2", %%xmm5 \n\t" \ + " psrlw $8, %%xmm4 \n\t" /* .. | 0 h | */ \ + " psrlw $8, %%xmm5 \n\t" \ + " psllw $8, "#s1" \n\t" /* .. | l 0 | */ \ + " psllw $8, "#s2" \n\t" \ + " por %%xmm4, "#s1" \n\t" /* .. | l h | */ \ + " por %%xmm5, "#s2" \n\t" + +static void +pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = PA_MAX (8U, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " movd %4, %%xmm1 \n\t" + VOLUME_32x16 (%%xmm1, %%xmm0) + " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ + VOLUME_32x16 (%%xmm1, %%xmm0) + " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " test $1, %2 \n\t" + " je 6f \n\t" + + /* FIXME, we can do aligned access of the volume values if we can guarantee + * that the array is 16 bytes aligned, we probably have to do the odd values + * after this then. */ + "5: \n\t" /* do samples in groups of 4 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + VOLUME_32x16 (%%xmm1, %%xmm0) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + + "6: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ + " cmp $0, %2 \n\t" + " je 8f \n\t" + + "7: \n\t" /* do samples in groups of 8 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ + VOLUME_32x16 (%%xmm1, %%xmm0) + VOLUME_32x16 (%%xmm3, %%xmm2) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ + " add $16, %0 \n\t" + MOD_ADD ($8, %5) + " dec %2 \n\t" + " jne 7b \n\t" + "8: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); +} + +static void +pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) +{ + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = PA_MAX (8U, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " rorw $8, %w4 \n\t" + " movd %4, %%xmm1 \n\t" + VOLUME_32x16 (%%xmm1, %%xmm0) + " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ + " rorw $8, %w4 \n\t" + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ + SWAP_16 (%%xmm1) + VOLUME_32x16 (%%xmm1, %%xmm0) + SWAP_16 (%%xmm0) + " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " test $1, %2 \n\t" + " je 6f \n\t" + + /* FIXME, we can do aligned access of the volume values if we can guarantee + * that the array is 16 bytes aligned, we probably have to do the odd values + * after this then. */ + "5: \n\t" /* do samples in groups of 4 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + SWAP_16 (%%xmm1) + VOLUME_32x16 (%%xmm1, %%xmm0) + SWAP_16 (%%xmm0) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + + "6: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ + " cmp $0, %2 \n\t" + " je 8f \n\t" + + "7: \n\t" /* do samples in groups of 8 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ + SWAP_16_2 (%%xmm1, %%xmm3) + VOLUME_32x16 (%%xmm1, %%xmm0) + VOLUME_32x16 (%%xmm3, %%xmm2) + SWAP_16_2 (%%xmm0, %%xmm2) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ + " add $16, %0 \n\t" + MOD_ADD ($8, %5) + " dec %2 \n\t" + " jne 7b \n\t" + "8: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); +} + +#undef RUN_TEST + +#ifdef RUN_TEST +#define CHANNELS 2 +#define SAMPLES 1021 +#define TIMES 1000 +#define PADDING 16 + +static void run_test (void) { + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + int i, j, padding; + pa_do_volume_func_t func; + pa_usec_t start, stop; + + func = pa_get_volume_func (PA_SAMPLE_S16NE); + + printf ("checking SSE %zd\n", sizeof (samples)); + + pa_random (samples, sizeof (samples)); + memcpy (samples_ref, samples, sizeof (samples)); + memcpy (samples_orig, samples, sizeof (samples)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = rand() >> 1; + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; + + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); + for (i = 0; i < SAMPLES; i++) { + if (samples[i] != samples_ref[i]) { + printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], + samples_orig[i], volumes[i % CHANNELS]); + } + } + + start = pa_rtclock_now(); + for (j = 0; j < TIMES; j++) { + memcpy (samples, samples_orig, sizeof (samples)); + pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); + } + stop = pa_rtclock_now(); + pa_log_info("SSE: %llu usec.", (long long unsigned int)(stop - start)); + + start = pa_rtclock_now(); + for (j = 0; j < TIMES; j++) { + memcpy (samples_ref, samples_orig, sizeof (samples)); + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + } + stop = pa_rtclock_now(); + pa_log_info("ref: %llu usec.", (long long unsigned int)(stop - start)); +} +#endif +#endif /* defined (__i386__) || defined (__amd64__) */ + +void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) { +#if defined (__i386__) || defined (__amd64__) + pa_log_info("Initialising SSE optimized functions."); + +#ifdef RUN_TEST + run_test (); +#endif + + pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse); + pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse); +#endif /* defined (__i386__) || defined (__amd64__) */ +} |