summary refs log tree commit diff stats
path: root/src/pulsecore
diff options
context:
space:
mode:
author Wim Taymans <wim.taymans@collabora.co.uk> 2009-08-20 10:56:20 +0200
committer Wim Taymans <wim.taymans@collabora.co.uk> 2009-08-20 11:31:04 +0200
commit f09b51198f43d79b22cb92b5223d01a7ab339d9f (patch)
tree 25d2d3946a24c0d893bb28ec775a4b7486145d8f /src/pulsecore
parent 3cc1278dcf44c9fb93bfd2725a2f75de1958cf23 (diff)
whitespace fixes
Diffstat (limited to 'src/pulsecore')
-rw-r--r-- src/pulsecore/cpu-arm.c 32
-rw-r--r-- src/pulsecore/cpu-x86.c 49
-rw-r--r-- src/pulsecore/resampler.c 65
-rw-r--r-- src/pulsecore/sample-util.c 11
-rw-r--r-- src/pulsecore/svolume_arm.c 242
-rw-r--r-- src/pulsecore/svolume_c.c 330
-rw-r--r-- src/pulsecore/svolume_mmx.c 366
-rw-r--r-- src/pulsecore/svolume_sse.c 410
8 files changed, 765 insertions, 740 deletions
diff --git a/src/pulsecore/cpu-arm.c b/src/pulsecore/cpu-arm.c
index 93ad3891..5a994b71 100644
--- a/src/pulsecore/cpu-arm.c
+++ b/src/pulsecore/cpu-arm.c
@@ -36,14 +36,14 @@
#if defined (__arm__) && defined (__linux__)
-#define MAX_BUFFER 4096
+#define MAX_BUFFER 4096
static char *
get_cpuinfo_line (char *cpuinfo, const char *tag) {
char *line, *end, *colon;
if (!(line = strstr (cpuinfo, tag)))
return NULL;
-
+
if (!(end = strchr (line, '\n')))
return NULL;
@@ -106,20 +106,20 @@ void pa_cpu_init_arm (void) {
}
/* get the CPU features */
if ((line = get_cpuinfo_line (cpuinfo, "Features"))) {
- char *state = NULL, *current;
-
- while ((current = pa_split_spaces (line, &state))) {
- if (!strcmp (current, "vfp"))
- flags |= PA_CPU_ARM_VFP;
- else if (!strcmp (current, "edsp"))
- flags |= PA_CPU_ARM_EDSP;
- else if (!strcmp (current, "neon"))
- flags |= PA_CPU_ARM_NEON;
- else if (!strcmp (current, "vfpv3"))
- flags |= PA_CPU_ARM_VFPV3;
-
- free (current);
- }
+ char *state = NULL, *current;
+
+ while ((current = pa_split_spaces (line, &state))) {
+ if (!strcmp (current, "vfp"))
+ flags |= PA_CPU_ARM_VFP;
+ else if (!strcmp (current, "edsp"))
+ flags |= PA_CPU_ARM_EDSP;
+ else if (!strcmp (current, "neon"))
+ flags |= PA_CPU_ARM_NEON;
+ else if (!strcmp (current, "vfpv3"))
+ flags |= PA_CPU_ARM_VFPV3;
+
+ free (current);
+ }
}
free (cpuinfo);
diff --git a/src/pulsecore/cpu-x86.c b/src/pulsecore/cpu-x86.c
index 453ecf5b..0457199d 100644
--- a/src/pulsecore/cpu-x86.c
+++ b/src/pulsecore/cpu-x86.c
@@ -2,7 +2,7 @@
This file is part of PulseAudio.
Copyright 2004-2006 Lennart Poettering
- Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
+ Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>
PulseAudio is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published
@@ -34,14 +34,15 @@
static void
get_cpuid (uint32_t op, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
- __asm__ __volatile__ (
- " push %%"PA_REG_b" \n\t"
- " cpuid \n\t"
- " mov %%ebx, %%esi \n\t"
- " pop %%"PA_REG_b" \n\t"
-
- : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d)
- : "0" (op));
+ __asm__ __volatile__ (
+ " push %%"PA_REG_b" \n\t"
+ " cpuid \n\t"
+ " mov %%ebx, %%esi \n\t"
+ " pop %%"PA_REG_b" \n\t"
+
+ : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d)
+ : "0" (op)
+ );
}
#endif
@@ -97,23 +98,23 @@ void pa_cpu_init_x86 (void) {
}
pa_log_info ("CPU flags: %s%s%s%s%s%s%s%s%s%s",
- (flags & PA_CPU_X86_MMX) ? "MMX " : "",
- (flags & PA_CPU_X86_SSE) ? "SSE " : "",
- (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "",
- (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "",
- (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "",
- (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "",
- (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "",
- (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "",
- (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "",
- (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : "");
+ (flags & PA_CPU_X86_MMX) ? "MMX " : "",
+ (flags & PA_CPU_X86_SSE) ? "SSE " : "",
+ (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "",
+ (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "",
+ (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "",
+ (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "",
+ (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "",
+ (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "",
+ (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "",
+ (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : "");
/* activate various optimisations */
- if (flags & PA_CPU_X86_MMX) {
+ if (flags & PA_CPU_X86_MMX)
pa_volume_func_init_mmx (flags);
- }
- if (flags & PA_CPU_X86_SSE) {
- pa_volume_func_init_sse (flags);
- }
+
+ if (flags & PA_CPU_X86_SSE)
+ pa_volume_func_init_sse (flags);
+
#endif /* defined (__i386__) || defined (__amd64__) */
}
diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c
index 43771dc8..5a6c398e 100644
--- a/src/pulsecore/resampler.c
+++ b/src/pulsecore/resampler.c
@@ -1065,30 +1065,53 @@ static pa_memchunk* convert_to_work_format(pa_resampler *r, pa_memchunk *input)
}
static void remap_mono_to_stereo(pa_resampler *r, void *dst, const void *src, unsigned n) {
-
+ unsigned i;
+
switch (r->work_format) {
case PA_SAMPLE_FLOAT32NE:
{
float *d, *s;
- d = (float *) dst;
- s = (float *) src;
+ d = (float *) dst;
+ s = (float *) src;
- for (; n > 0; n--, s++, d += 2)
- d[0] = d[1] = *s;
- break;
- }
+ for (i = n >> 2; i; i--) {
+ d[0] = d[1] = s[0];
+ d[2] = d[3] = s[1];
+ d[4] = d[5] = s[2];
+ d[6] = d[7] = s[3];
+ s += 4;
+ d += 8;
+ }
+ for (i = n & 3; i; i--) {
+ d[0] = d[1] = s[0];
+ s++;
+ d += 2;
+ }
+ break;
+ }
case PA_SAMPLE_S16NE:
{
int16_t *d, *s;
- d = (int16_t *) dst;
- s = (int16_t *) src;
+ d = (int16_t *) dst;
+ s = (int16_t *) src;
- for (; n > 0; n--, s++, d += 2)
- d[0] = d[1] = *s;
- break;
- }
+ for (i = n >> 2; i; i--) {
+ d[0] = d[1] = s[0];
+ d[2] = d[3] = s[1];
+ d[4] = d[5] = s[2];
+ d[6] = d[7] = s[3];
+ s += 4;
+ d += 8;
+ }
+ for (i = n & 3; i; i--) {
+ d[0] = d[1] = s[0];
+ s++;
+ d += 2;
+ }
+ break;
+ }
default:
pa_assert_not_reached();
}
@@ -1114,7 +1137,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
for (ic = 0; ic < n_ic; ic++) {
float vol;
- vol = r->map_table_f[oc][ic];
+ vol = r->map_table_f[oc][ic];
if (vol <= 0.0)
continue;
@@ -1122,18 +1145,18 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
d = (float *)dst + oc;
s = (float *)src + ic;
- if (vol >= 1.0) {
+ if (vol >= 1.0) {
for (i = n; i > 0; i--, s += n_ic, d += n_oc)
*d += *s;
- } else {
+ } else {
for (i = n; i > 0; i--, s += n_ic, d += n_oc)
*d += *s * vol;
- }
+ }
}
}
break;
- }
+ }
case PA_SAMPLE_S16NE:
{
int16_t *d, *s;
@@ -1144,7 +1167,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
for (ic = 0; ic < n_ic; ic++) {
int32_t vol;
- vol = r->map_table_i[oc][ic];
+ vol = r->map_table_i[oc][ic];
if (vol <= 0)
continue;
@@ -1158,11 +1181,11 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,
} else {
for (i = n; i > 0; i--, s += n_ic, d += n_oc)
*d += (int16_t) (((int32_t)*s * vol) >> 16);
- }
+ }
}
}
break;
- }
+ }
default:
pa_assert_not_reached();
}
diff --git a/src/pulsecore/sample-util.c b/src/pulsecore/sample-util.c
index 677f914a..6e97e5a9 100644
--- a/src/pulsecore/sample-util.c
+++ b/src/pulsecore/sample-util.c
@@ -752,12 +752,13 @@ void pa_volume_memchunk(
return;
}
- ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index;
-
do_volume = pa_get_volume_func (spec->format);
pa_assert(do_volume);
-
+
calc_volume_table[spec->format] ((void *)linear, volume);
+
+ ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index;
+
do_volume (ptr, (void *)linear, spec->channels, c->length);
pa_memblock_release(c->memblock);
@@ -944,12 +945,12 @@ void pa_sample_clamp(pa_sample_format_t format, void *dst, size_t dstr, const vo
for (; n > 0; n--) {
float f;
- f = *s;
+ f = *s;
*d = PA_CLAMP_UNLIKELY(f, -1.0f, 1.0f);
s = (const float*) ((const uint8_t*) s + sstr);
d = (float*) ((uint8_t*) d + dstr);
- }
+ }
} else {
pa_assert(format == PA_SAMPLE_FLOAT32RE);
diff --git a/src/pulsecore/svolume_arm.c b/src/pulsecore/svolume_arm.c
index 7e25a13c..0d39d105 100644
--- a/src/pulsecore/svolume_arm.c
+++ b/src/pulsecore/svolume_arm.c
@@ -40,86 +40,86 @@
#define MOD_INC() \
" subs r0, r6, %2 \n\t" \
" addcs r0, %1 \n\t" \
- " movcs r6, r0 \n\t"
+ " movcs r6, r0 \n\t"
static void
pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- int32_t *ve;
-
- channels = MAX (4, channels);
- ve = volumes + channels;
-
- __asm__ __volatile__ (
- " mov r6, %1 \n\t"
- " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
- " tst %3, #1 \n\t" /* check for odd samples */
- " beq 2f \n\t"
-
- "1: \n\t"
- " ldr r0, [r6], #4 \n\t" /* odd samples volumes */
- " ldrh r2, [%0] \n\t"
-
- " smulwb r0, r0, r2 \n\t"
- " ssat r0, #16, r0 \n\t"
-
- " strh r0, [%0], #2 \n\t"
-
- MOD_INC()
-
- "2: \n\t"
- " mov %3, %3, LSR #1 \n\t"
- " tst %3, #1 \n\t" /* check for odd samples */
- " beq 4f \n\t"
-
- "3: \n\t"
- " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */
- " ldr r0, [%0] \n\t"
-
- " smulwt r2, r2, r0 \n\t"
- " smulwb r3, r3, r0 \n\t"
-
- " ssat r2, #16, r2 \n\t"
- " ssat r3, #16, r3 \n\t"
-
- " pkhbt r0, r3, r2, LSL #16 \n\t"
- " str r0, [%0], #4 \n\t"
-
- MOD_INC()
-
- "4: \n\t"
- " movs %3, %3, LSR #1 \n\t"
- " beq 6f \n\t"
-
- "5: \n\t"
- " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */
- " ldrd r4, [r6], #8 \n\t"
- " ldrd r0, [%0] \n\t"
-
- " smulwt r2, r2, r0 \n\t"
- " smulwb r3, r3, r0 \n\t"
- " smulwt r4, r4, r1 \n\t"
- " smulwb r5, r5, r1 \n\t"
-
- " ssat r2, #16, r2 \n\t"
- " ssat r3, #16, r3 \n\t"
- " ssat r4, #16, r4 \n\t"
- " ssat r5, #16, r5 \n\t"
-
- " pkhbt r0, r3, r2, LSL #16 \n\t"
- " pkhbt r1, r5, r4, LSL #16 \n\t"
- " strd r0, [%0], #8 \n\t"
-
- MOD_INC()
-
- " subs %3, %3, #1 \n\t"
- " bne 5b \n\t"
- "6: \n\t"
-
- : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
- :
- : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
- );
+ int32_t *ve;
+
+ channels = MAX (4, channels);
+ ve = volumes + channels;
+
+ __asm__ __volatile__ (
+ " mov r6, %1 \n\t"
+ " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */
+ " tst %3, #1 \n\t" /* check for odd samples */
+ " beq 2f \n\t"
+
+ "1: \n\t"
+ " ldr r0, [r6], #4 \n\t" /* odd samples volumes */
+ " ldrh r2, [%0] \n\t"
+
+ " smulwb r0, r0, r2 \n\t"
+ " ssat r0, #16, r0 \n\t"
+
+ " strh r0, [%0], #2 \n\t"
+
+ MOD_INC()
+
+ "2: \n\t"
+ " mov %3, %3, LSR #1 \n\t"
+ " tst %3, #1 \n\t" /* check for odd samples */
+ " beq 4f \n\t"
+
+ "3: \n\t"
+ " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */
+ " ldr r0, [%0] \n\t"
+
+ " smulwt r2, r2, r0 \n\t"
+ " smulwb r3, r3, r0 \n\t"
+
+ " ssat r2, #16, r2 \n\t"
+ " ssat r3, #16, r3 \n\t"
+
+ " pkhbt r0, r3, r2, LSL #16 \n\t"
+ " str r0, [%0], #4 \n\t"
+
+ MOD_INC()
+
+ "4: \n\t"
+ " movs %3, %3, LSR #1 \n\t"
+ " beq 6f \n\t"
+
+ "5: \n\t"
+ " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */
+ " ldrd r4, [r6], #8 \n\t"
+ " ldrd r0, [%0] \n\t"
+
+ " smulwt r2, r2, r0 \n\t"
+ " smulwb r3, r3, r0 \n\t"
+ " smulwt r4, r4, r1 \n\t"
+ " smulwb r5, r5, r1 \n\t"
+
+ " ssat r2, #16, r2 \n\t"
+ " ssat r3, #16, r3 \n\t"
+ " ssat r4, #16, r4 \n\t"
+ " ssat r5, #16, r5 \n\t"
+
+ " pkhbt r0, r3, r2, LSL #16 \n\t"
+ " pkhbt r1, r5, r4, LSL #16 \n\t"
+ " strd r0, [%0], #8 \n\t"
+
+ MOD_INC()
+
+ " subs %3, %3, #1 \n\t"
+ " bne 5b \n\t"
+ "6: \n\t"
+
+ : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length)
+ :
+ : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc"
+ );
}
#undef RUN_TEST
@@ -131,51 +131,51 @@ pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#define PADDING 16
static void run_test (void) {
- int16_t samples[SAMPLES];
- int16_t samples_ref[SAMPLES];
- int16_t samples_orig[SAMPLES];
- int32_t volumes[CHANNELS + PADDING];
- int i, j, padding;
- pa_do_volume_func_t func;
- struct timeval start, stop;
-
- func = pa_get_volume_func (PA_SAMPLE_S16NE);
-
- printf ("checking ARM %zd\n", sizeof (samples));
-
- pa_random (samples, sizeof (samples));
- memcpy (samples_ref, samples, sizeof (samples));
- memcpy (samples_orig, samples, sizeof (samples));
-
- for (i = 0; i < CHANNELS; i++)
- volumes[i] = rand() >> 1;
- for (padding = 0; padding < PADDING; padding++, i++)
- volumes[i] = volumes[padding];
-
- func (samples_ref, volumes, CHANNELS, sizeof (samples));
- pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
- for (i = 0; i < SAMPLES; i++) {
- if (samples[i] != samples_ref[i]) {
- printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
- samples_orig[i], volumes[i % CHANNELS]);
- }
- }
+ int16_t samples[SAMPLES];
+ int16_t samples_ref[SAMPLES];
+ int16_t samples_orig[SAMPLES];
+ int32_t volumes[CHANNELS + PADDING];
+ int i, j, padding;
+ pa_do_volume_func_t func;
+ struct timeval start, stop;
- pa_gettimeofday(&start);
- for (j = 0; j < TIMES; j++) {
- memcpy (samples, samples_orig, sizeof (samples));
- pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
- }
- pa_gettimeofday(&stop);
- pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+ func = pa_get_volume_func (PA_SAMPLE_S16NE);
+
+ printf ("checking ARM %zd\n", sizeof (samples));
+
+ pa_random (samples, sizeof (samples));
+ memcpy (samples_ref, samples, sizeof (samples));
+ memcpy (samples_orig, samples, sizeof (samples));
+
+ for (i = 0; i < CHANNELS; i++)
+ volumes[i] = rand() >> 1;
+ for (padding = 0; padding < PADDING; padding++, i++)
+ volumes[i] = volumes[padding];
- pa_gettimeofday(&start);
- for (j = 0; j < TIMES; j++) {
- memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
- }
- pa_gettimeofday(&stop);
- pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+ pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
+ for (i = 0; i < SAMPLES; i++) {
+ if (samples[i] != samples_ref[i]) {
+ printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
+ samples_orig[i], volumes[i % CHANNELS]);
+ }
+ }
+
+ pa_gettimeofday(&start);
+ for (j = 0; j < TIMES; j++) {
+ memcpy (samples, samples_orig, sizeof (samples));
+ pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples));
+ }
+ pa_gettimeofday(&stop);
+ pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+
+ pa_gettimeofday(&start);
+ for (j = 0; j < TIMES; j++) {
+ memcpy (samples_ref, samples_orig, sizeof (samples));
+ func (samples_ref, volumes, CHANNELS, sizeof (samples));
+ }
+ pa_gettimeofday(&stop);
+ pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
}
#endif
@@ -184,12 +184,12 @@ static void run_test (void) {
void pa_volume_func_init_arm (pa_cpu_arm_flag_t flags) {
#if defined (__arm__)
- pa_log_info("Initialising ARM optimized functions.");
+ pa_log_info("Initialising ARM optimized functions.");
#ifdef RUN_TEST
- run_test ();
+ run_test ();
#endif
- pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm);
+ pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm);
#endif /* defined (__arm__) */
}
diff --git a/src/pulsecore/svolume_c.c b/src/pulsecore/svolume_c.c
index 2148a573..5fc052b8 100644
--- a/src/pulsecore/svolume_c.c
+++ b/src/pulsecore/svolume_c.c
@@ -35,289 +35,289 @@
static void
pa_volume_u8_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- for (channel = 0; length; length--) {
- int32_t t, hi, lo;
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
- hi = volumes[channel] >> 16;
- lo = volumes[channel] & 0xFFFF;
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
- t = (int32_t) *samples - 0x80;
- t = ((t * lo) >> 16) + (t * hi);
- t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
- *samples++ = (uint8_t) (t + 0x80);
+ t = (int32_t) *samples - 0x80;
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F);
+ *samples++ = (uint8_t) (t + 0x80);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_alaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- for (channel = 0; length; length--) {
- int32_t t, hi, lo;
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
- hi = volumes[channel] >> 16;
- lo = volumes[channel] & 0xFFFF;
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
- t = (int32_t) st_alaw2linear16(*samples);
- t = ((t * lo) >> 16) + (t * hi);
- t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
- *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3);
+ t = (int32_t) st_alaw2linear16(*samples);
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- for (channel = 0; length; length--) {
- int32_t t, hi, lo;
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
- hi = volumes[channel] >> 16;
- lo = volumes[channel] & 0xFFFF;
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
- t = (int32_t) st_ulaw2linear16(*samples);
- t = ((t * lo) >> 16) + (t * hi);
- t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
- *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);
+ t = (int32_t) st_ulaw2linear16(*samples);
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (int16_t);
+ length /= sizeof (int16_t);
- for (channel = 0; length; length--) {
- int32_t t, hi, lo;
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
- /* Multiplying the 32bit volume factor with the 16bit
- * sample might result in an 48bit value. We want to
- * do without 64 bit integers and hence do the
- * multiplication independantly for the HI and LO part
- * of the volume. */
+ /* Multiplying the 32bit volume factor with the 16bit
+ * sample might result in an 48bit value. We want to
+ * do without 64 bit integers and hence do the
+ * multiplication independantly for the HI and LO part
+ * of the volume. */
- hi = volumes[channel] >> 16;
- lo = volumes[channel] & 0xFFFF;
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
- t = (int32_t)(*samples);
- t = ((t * lo) >> 16) + (t * hi);
- t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
- *samples++ = (int16_t) t;
+ t = (int32_t)(*samples);
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ *samples++ = (int16_t) t;
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s16re_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (int16_t);
+ length /= sizeof (int16_t);
- for (channel = 0; length; length--) {
- int32_t t, hi, lo;
+ for (channel = 0; length; length--) {
+ int32_t t, hi, lo;
- hi = volumes[channel] >> 16;
- lo = volumes[channel] & 0xFFFF;
+ hi = volumes[channel] >> 16;
+ lo = volumes[channel] & 0xFFFF;
- t = (int32_t) PA_INT16_SWAP(*samples);
- t = ((t * lo) >> 16) + (t * hi);
- t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
- *samples++ = PA_INT16_SWAP((int16_t) t);
+ t = (int32_t) PA_INT16_SWAP(*samples);
+ t = ((t * lo) >> 16) + (t * hi);
+ t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF);
+ *samples++ = PA_INT16_SWAP((int16_t) t);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_float32ne_c (float *samples, float *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (float);
+ length /= sizeof (float);
- for (channel = 0; length; length--) {
- *samples++ *= volumes[channel];
+ for (channel = 0; length; length--) {
+ *samples++ *= volumes[channel];
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_float32re_c (float *samples, float *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (float);
+ length /= sizeof (float);
- for (channel = 0; length; length--) {
- float t;
+ for (channel = 0; length; length--) {
+ float t;
- t = PA_FLOAT32_SWAP(*samples);
- t *= volumes[channel];
- *samples++ = PA_FLOAT32_SWAP(t);
+ t = PA_FLOAT32_SWAP(*samples);
+ t *= volumes[channel];
+ *samples++ = PA_FLOAT32_SWAP(t);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s32ne_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (int32_t);
+ length /= sizeof (int32_t);
- for (channel = 0; length; length--) {
- int64_t t;
+ for (channel = 0; length; length--) {
+ int64_t t;
- t = (int64_t)(*samples);
- t = (t * volumes[channel]) >> 16;
- t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
- *samples++ = (int32_t) t;
+ t = (int64_t)(*samples);
+ t = (t * volumes[channel]) >> 16;
+ t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
+ *samples++ = (int32_t) t;
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s32re_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (int32_t);
+ length /= sizeof (int32_t);
- for (channel = 0; length; length--) {
- int64_t t;
+ for (channel = 0; length; length--) {
+ int64_t t;
- t = (int64_t) PA_INT32_SWAP(*samples);
- t = (t * volumes[channel]) >> 16;
- t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
- *samples++ = PA_INT32_SWAP((int32_t) t);
+ t = (int64_t) PA_INT32_SWAP(*samples);
+ t = (t * volumes[channel]) >> 16;
+ t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
+ *samples++ = PA_INT32_SWAP((int32_t) t);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s24ne_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
- uint8_t *e;
+ unsigned channel;
+ uint8_t *e;
- e = samples + length;
+ e = samples + length;
- for (channel = 0; samples < e; samples += 3) {
- int64_t t;
+ for (channel = 0; samples < e; samples += 3) {
+ int64_t t;
- t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
- t = (t * volumes[channel]) >> 16;
- t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
- PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8);
+ t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8));
+ t = (t * volumes[channel]) >> 16;
+ t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
+ PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s24re_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
- uint8_t *e;
+ unsigned channel;
+ uint8_t *e;
- e = samples + length;
+ e = samples + length;
- for (channel = 0; samples < e; samples += 3) {
- int64_t t;
+ for (channel = 0; samples < e; samples += 3) {
+ int64_t t;
- t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
- t = (t * volumes[channel]) >> 16;
- t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
- PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8);
+ t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8));
+ t = (t * volumes[channel]) >> 16;
+ t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
+ PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s24_32ne_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (uint32_t);
+ length /= sizeof (uint32_t);
- for (channel = 0; length; length--) {
- int64_t t;
+ for (channel = 0; length; length--) {
+ int64_t t;
- t = (int64_t) ((int32_t) (*samples << 8));
- t = (t * volumes[channel]) >> 16;
- t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
- *samples++ = ((uint32_t) ((int32_t) t)) >> 8;
+ t = (int64_t) ((int32_t) (*samples << 8));
+ t = (t * volumes[channel]) >> 16;
+ t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
+ *samples++ = ((uint32_t) ((int32_t) t)) >> 8;
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static void
pa_volume_s24_32re_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- unsigned channel;
+ unsigned channel;
- length /= sizeof (uint32_t);
+ length /= sizeof (uint32_t);
- for (channel = 0; length; length--) {
- int64_t t;
+ for (channel = 0; length; length--) {
+ int64_t t;
- t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
- t = (t * volumes[channel]) >> 16;
- t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
- *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);
+ t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8));
+ t = (t * volumes[channel]) >> 16;
+ t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL);
+ *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8);
- if (PA_UNLIKELY(++channel >= channels))
- channel = 0;
- }
+ if (PA_UNLIKELY(++channel >= channels))
+ channel = 0;
+ }
}
static pa_do_volume_func_t do_volume_table[] =
{
- [PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c,
- [PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c,
- [PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c,
- [PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c,
- [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c,
- [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c,
- [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c,
- [PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c,
- [PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c,
- [PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c,
- [PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c,
- [PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c,
- [PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c
+ [PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c,
+ [PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c,
+ [PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c,
+ [PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c,
+ [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c,
+ [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c,
+ [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c,
+ [PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c,
+ [PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c,
+ [PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c,
+ [PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c,
+ [PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c,
+ [PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c
};
pa_do_volume_func_t pa_get_volume_func(pa_sample_format_t f) {
diff --git a/src/pulsecore/svolume_mmx.c b/src/pulsecore/svolume_mmx.c
index 86af76d3..7e242684 100644
--- a/src/pulsecore/svolume_mmx.c
+++ b/src/pulsecore/svolume_mmx.c
@@ -73,7 +73,7 @@
" add "#a", %3 \n\t" \
" mov %3, %4 \n\t" \
" sub "#b", %4 \n\t" \
- " cmovae %4, %3 \n\t"
+ " cmovae %4, %3 \n\t"
/* swap 16 bits */
#define SWAP_16(s) \
@@ -96,147 +96,147 @@
static void
pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- pa_reg_x86 channel, temp;
-
- /* the max number of samples we process at a time, this is also the max amount
- * we overread the volume array, which should have enough padding. */
- channels = MAX (4, channels);
-
- __asm__ __volatile__ (
- " xor %3, %3 \n\t"
- " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
- " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
- " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
- " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
- " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
-
- " test $1, %2 \n\t" /* check for odd samples */
- " je 2f \n\t"
-
- " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
- " movw (%0), %w4 \n\t" /* .. | p0 | */
- " movd %4, %%mm1 \n\t"
- VOLUME_32x16 (%%mm1, %%mm0)
- " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
- " movw %w4, (%0) \n\t"
- " add $2, %0 \n\t"
- MOD_ADD ($1, %5)
-
- "2: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
- " test $1, %2 \n\t" /* check for odd samples */
- " je 4f \n\t"
-
- "3: \n\t" /* do samples in groups of 2 */
- " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
- " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
- VOLUME_32x16 (%%mm1, %%mm0)
- " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
- " add $4, %0 \n\t"
- MOD_ADD ($2, %5)
-
- "4: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
- " cmp $0, %2 \n\t"
- " je 6f \n\t"
-
- "5: \n\t" /* do samples in groups of 4 */
- " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
- " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
- " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
- " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
- VOLUME_32x16 (%%mm1, %%mm0)
- VOLUME_32x16 (%%mm3, %%mm2)
- " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
- " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
- " add $8, %0 \n\t"
- MOD_ADD ($4, %5)
- " dec %2 \n\t"
- " jne 5b \n\t"
-
- "6: \n\t"
- " emms \n\t"
-
- : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
- : "r" ((pa_reg_x86)channels)
- : "cc"
- );
+ pa_reg_x86 channel, temp;
+
+ /* the max number of samples we process at a time, this is also the max amount
+ * we overread the volume array, which should have enough padding. */
+ channels = MAX (4, channels);
+
+ __asm__ __volatile__ (
+ " xor %3, %3 \n\t"
+ " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
+ " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
+ " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
+ " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
+ " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
+
+ " test $1, %2 \n\t" /* check for odd samples */
+ " je 2f \n\t"
+
+ " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
+ " movw (%0), %w4 \n\t" /* .. | p0 | */
+ " movd %4, %%mm1 \n\t"
+ VOLUME_32x16 (%%mm1, %%mm0)
+ " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
+ " movw %w4, (%0) \n\t"
+ " add $2, %0 \n\t"
+ MOD_ADD ($1, %5)
+
+ "2: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
+ " test $1, %2 \n\t" /* check for odd samples */
+ " je 4f \n\t"
+
+ "3: \n\t" /* do samples in groups of 2 */
+ " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
+ VOLUME_32x16 (%%mm1, %%mm0)
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " add $4, %0 \n\t"
+ MOD_ADD ($2, %5)
+
+ "4: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
+ " cmp $0, %2 \n\t"
+ " je 6f \n\t"
+
+ "5: \n\t" /* do samples in groups of 4 */
+ " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
+ " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
+ " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
+ VOLUME_32x16 (%%mm1, %%mm0)
+ VOLUME_32x16 (%%mm3, %%mm2)
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
+ " add $8, %0 \n\t"
+ MOD_ADD ($4, %5)
+ " dec %2 \n\t"
+ " jne 5b \n\t"
+
+ "6: \n\t"
+ " emms \n\t"
+
+ : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
+ : "r" ((pa_reg_x86)channels)
+ : "cc"
+ );
}
static void
pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- pa_reg_x86 channel, temp;
-
- /* the max number of samples we process at a time, this is also the max amount
- * we overread the volume array, which should have enough padding. */
- channels = MAX (4, channels);
-
- __asm__ __volatile__ (
- " xor %3, %3 \n\t"
- " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
- " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
- " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
- " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
- " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
-
- " test $1, %2 \n\t" /* check for odd samples */
- " je 2f \n\t"
-
- " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
- " movw (%0), %w4 \n\t" /* .. | p0 | */
- " rorw $8, %w4 \n\t"
- " movd %4, %%mm1 \n\t"
- VOLUME_32x16 (%%mm1, %%mm0)
- " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
- " rorw $8, %w4 \n\t"
- " movw %w4, (%0) \n\t"
- " add $2, %0 \n\t"
- MOD_ADD ($1, %5)
-
- "2: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
- " test $1, %2 \n\t" /* check for odd samples */
- " je 4f \n\t"
-
- "3: \n\t" /* do samples in groups of 2 */
- " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
- " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
- SWAP_16 (%%mm1)
- VOLUME_32x16 (%%mm1, %%mm0)
- SWAP_16 (%%mm0)
- " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
- " add $4, %0 \n\t"
- MOD_ADD ($2, %5)
-
- "4: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
- " cmp $0, %2 \n\t"
- " je 6f \n\t"
-
- "5: \n\t" /* do samples in groups of 4 */
- " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
- " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
- " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
- " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
- SWAP_16_2 (%%mm1, %%mm3)
- VOLUME_32x16 (%%mm1, %%mm0)
- VOLUME_32x16 (%%mm3, %%mm2)
- SWAP_16_2 (%%mm0, %%mm2)
- " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
- " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
- " add $8, %0 \n\t"
- MOD_ADD ($4, %5)
- " dec %2 \n\t"
- " jne 5b \n\t"
-
- "6: \n\t"
- " emms \n\t"
-
- : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
- : "r" ((pa_reg_x86)channels)
- : "cc"
- );
+ pa_reg_x86 channel, temp;
+
+ /* the max number of samples we process at a time, this is also the max amount
+ * we overread the volume array, which should have enough padding. */
+ channels = MAX (4, channels);
+
+ __asm__ __volatile__ (
+ " xor %3, %3 \n\t"
+ " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
+ " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */
+ " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */
+ " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */
+ " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */
+
+ " test $1, %2 \n\t" /* check for odd samples */
+ " je 2f \n\t"
+
+ " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */
+ " movw (%0), %w4 \n\t" /* .. | p0 | */
+ " rorw $8, %w4 \n\t"
+ " movd %4, %%mm1 \n\t"
+ VOLUME_32x16 (%%mm1, %%mm0)
+ " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */
+ " rorw $8, %w4 \n\t"
+ " movw %w4, (%0) \n\t"
+ " add $2, %0 \n\t"
+ MOD_ADD ($1, %5)
+
+ "2: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
+ " test $1, %2 \n\t" /* check for odd samples */
+ " je 4f \n\t"
+
+ "3: \n\t" /* do samples in groups of 2 */
+ " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
+ SWAP_16 (%%mm1)
+ VOLUME_32x16 (%%mm1, %%mm0)
+ SWAP_16 (%%mm0)
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " add $4, %0 \n\t"
+ MOD_ADD ($2, %5)
+
+ "4: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
+ " cmp $0, %2 \n\t"
+ " je 6f \n\t"
+
+ "5: \n\t" /* do samples in groups of 4 */
+ " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */
+ " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */
+ " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */
+ SWAP_16_2 (%%mm1, %%mm3)
+ VOLUME_32x16 (%%mm1, %%mm0)
+ VOLUME_32x16 (%%mm3, %%mm2)
+ SWAP_16_2 (%%mm0, %%mm2)
+ " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */
+ " add $8, %0 \n\t"
+ MOD_ADD ($4, %5)
+ " dec %2 \n\t"
+ " jne 5b \n\t"
+
+ "6: \n\t"
+ " emms \n\t"
+
+ : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp)
+ : "r" ((pa_reg_x86)channels)
+ : "cc"
+ );
}
#undef RUN_TEST
@@ -248,51 +248,51 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#define PADDING 16
static void run_test (void) {
- int16_t samples[SAMPLES];
- int16_t samples_ref[SAMPLES];
- int16_t samples_orig[SAMPLES];
- int32_t volumes[CHANNELS + PADDING];
- int i, j, padding;
- pa_do_volume_func_t func;
- struct timeval start, stop;
-
- func = pa_get_volume_func (PA_SAMPLE_S16NE);
-
- printf ("checking MMX %zd\n", sizeof (samples));
-
- pa_random (samples, sizeof (samples));
- memcpy (samples_ref, samples, sizeof (samples));
- memcpy (samples_orig, samples, sizeof (samples));
-
- for (i = 0; i < CHANNELS; i++)
- volumes[i] = rand() >> 1;
- for (padding = 0; padding < PADDING; padding++, i++)
- volumes[i] = volumes[padding];
-
- func (samples_ref, volumes, CHANNELS, sizeof (samples));
- pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
- for (i = 0; i < SAMPLES; i++) {
- if (samples[i] != samples_ref[i]) {
- printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
- samples_orig[i], volumes[i % CHANNELS]);
- }
- }
+ int16_t samples[SAMPLES];
+ int16_t samples_ref[SAMPLES];
+ int16_t samples_orig[SAMPLES];
+ int32_t volumes[CHANNELS + PADDING];
+ int i, j, padding;
+ pa_do_volume_func_t func;
+ struct timeval start, stop;
- pa_gettimeofday(&start);
- for (j = 0; j < TIMES; j++) {
- memcpy (samples, samples_orig, sizeof (samples));
- pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
- }
- pa_gettimeofday(&stop);
- pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+ func = pa_get_volume_func (PA_SAMPLE_S16NE);
+
+ printf ("checking MMX %zd\n", sizeof (samples));
+
+ pa_random (samples, sizeof (samples));
+ memcpy (samples_ref, samples, sizeof (samples));
+ memcpy (samples_orig, samples, sizeof (samples));
+
+ for (i = 0; i < CHANNELS; i++)
+ volumes[i] = rand() >> 1;
+ for (padding = 0; padding < PADDING; padding++, i++)
+ volumes[i] = volumes[padding];
- pa_gettimeofday(&start);
- for (j = 0; j < TIMES; j++) {
- memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
- }
- pa_gettimeofday(&stop);
- pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+ pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
+ for (i = 0; i < SAMPLES; i++) {
+ if (samples[i] != samples_ref[i]) {
+ printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
+ samples_orig[i], volumes[i % CHANNELS]);
+ }
+ }
+
+ pa_gettimeofday(&start);
+ for (j = 0; j < TIMES; j++) {
+ memcpy (samples, samples_orig, sizeof (samples));
+ pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples));
+ }
+ pa_gettimeofday(&stop);
+ pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+
+ pa_gettimeofday(&start);
+ for (j = 0; j < TIMES; j++) {
+ memcpy (samples_ref, samples_orig, sizeof (samples));
+ func (samples_ref, volumes, CHANNELS, sizeof (samples));
+ }
+ pa_gettimeofday(&stop);
+ pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
}
#endif
@@ -301,13 +301,13 @@ static void run_test (void) {
void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) {
#if defined (__i386__) || defined (__amd64__)
- pa_log_info("Initialising MMX optimized functions.");
+ pa_log_info("Initialising MMX optimized functions.");
#ifdef RUN_TEST
- run_test ();
+ run_test ();
#endif
- pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
- pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
+ pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx);
+ pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx);
#endif /* defined (__i386__) || defined (__amd64__) */
}
diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c
index 5979f7c2..b5e3687f 100644
--- a/src/pulsecore/svolume_sse.c
+++ b/src/pulsecore/svolume_sse.c
@@ -48,7 +48,7 @@
" psrld $16, "#v" \n\t" /* .. | p0 | 0 | */ \
" pmaddwd %%xmm5, "#v" \n\t" /* .. | p0 * vh | */ \
" paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \
- " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */
+ " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */
#define MOD_ADD(a,b) \
" add "#a", %3 \n\t" /* channel += inc */ \
@@ -77,169 +77,169 @@
static void
pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- pa_reg_x86 channel, temp;
-
- /* the max number of samples we process at a time, this is also the max amount
- * we overread the volume array, which should have enough padding. */
- channels = MAX (8, channels);
-
- __asm__ __volatile__ (
- " xor %3, %3 \n\t"
- " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
-
- " test $1, %2 \n\t" /* check for odd samples */
- " je 2f \n\t"
-
- " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
- " movw (%0), %w4 \n\t" /* .. | p0 | */
- " movd %4, %%xmm1 \n\t"
- VOLUME_32x16 (%%xmm1, %%xmm0)
- " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
- " movw %w4, (%0) \n\t"
- " add $2, %0 \n\t"
- MOD_ADD ($1, %5)
-
- "2: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
- " test $1, %2 \n\t"
- " je 4f \n\t"
-
- "3: \n\t" /* do samples in groups of 2 */
- " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
- " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
- VOLUME_32x16 (%%xmm1, %%xmm0)
- " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
- " add $4, %0 \n\t"
- MOD_ADD ($2, %5)
-
- "4: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
- " test $1, %2 \n\t"
- " je 6f \n\t"
-
- /* FIXME, we can do aligned access of the volume values if we can guarantee
- * that the array is 16 bytes aligned, we probably have to do the odd values
- * after this then. */
- "5: \n\t" /* do samples in groups of 4 */
- " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
- " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
- VOLUME_32x16 (%%xmm1, %%xmm0)
- " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
- " add $8, %0 \n\t"
- MOD_ADD ($4, %5)
-
- "6: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
- " cmp $0, %2 \n\t"
- " je 8f \n\t"
-
- "7: \n\t" /* do samples in groups of 8 */
- " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
- " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
- " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
- " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
- VOLUME_32x16 (%%xmm1, %%xmm0)
- VOLUME_32x16 (%%xmm3, %%xmm2)
- " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
- " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
- " add $16, %0 \n\t"
- MOD_ADD ($8, %5)
- " dec %2 \n\t"
- " jne 7b \n\t"
- "8: \n\t"
-
- : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
- : "r" ((pa_reg_x86)channels)
- : "cc"
- );
+ pa_reg_x86 channel, temp;
+
+ /* the max number of samples we process at a time, this is also the max amount
+ * we overread the volume array, which should have enough padding. */
+ channels = MAX (8, channels);
+
+ __asm__ __volatile__ (
+ " xor %3, %3 \n\t"
+ " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
+
+ " test $1, %2 \n\t" /* check for odd samples */
+ " je 2f \n\t"
+
+ " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
+ " movw (%0), %w4 \n\t" /* .. | p0 | */
+ " movd %4, %%xmm1 \n\t"
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
+ " movw %w4, (%0) \n\t"
+ " add $2, %0 \n\t"
+ MOD_ADD ($1, %5)
+
+ "2: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
+ " test $1, %2 \n\t"
+ " je 4f \n\t"
+
+ "3: \n\t" /* do samples in groups of 2 */
+ " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " add $4, %0 \n\t"
+ MOD_ADD ($2, %5)
+
+ "4: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
+ " test $1, %2 \n\t"
+ " je 6f \n\t"
+
+ /* FIXME, we can do aligned access of the volume values if we can guarantee
+ * that the array is 16 bytes aligned, we probably have to do the odd values
+ * after this then. */
+ "5: \n\t" /* do samples in groups of 4 */
+ " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
+ " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
+ " add $8, %0 \n\t"
+ MOD_ADD ($4, %5)
+
+ "6: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
+ " cmp $0, %2 \n\t"
+ " je 8f \n\t"
+
+ "7: \n\t" /* do samples in groups of 8 */
+ " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
+ " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
+ " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
+ " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ VOLUME_32x16 (%%xmm3, %%xmm2)
+ " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
+ " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
+ " add $16, %0 \n\t"
+ MOD_ADD ($8, %5)
+ " dec %2 \n\t"
+ " jne 7b \n\t"
+ "8: \n\t"
+
+ : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
+ : "r" ((pa_reg_x86)channels)
+ : "cc"
+ );
}
static void
pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)
{
- pa_reg_x86 channel, temp;
-
- /* the max number of samples we process at a time, this is also the max amount
- * we overread the volume array, which should have enough padding. */
- channels = MAX (8, channels);
-
- __asm__ __volatile__ (
- " xor %3, %3 \n\t"
- " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
-
- " test $1, %2 \n\t" /* check for odd samples */
- " je 2f \n\t"
-
- " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
- " movw (%0), %w4 \n\t" /* .. | p0 | */
- " rorw $8, %w4 \n\t"
- " movd %4, %%xmm1 \n\t"
- VOLUME_32x16 (%%xmm1, %%xmm0)
- " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
- " rorw $8, %w4 \n\t"
- " movw %w4, (%0) \n\t"
- " add $2, %0 \n\t"
- MOD_ADD ($1, %5)
-
- "2: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
- " test $1, %2 \n\t"
- " je 4f \n\t"
-
- "3: \n\t" /* do samples in groups of 2 */
- " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
- " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
- SWAP_16 (%%xmm1)
- VOLUME_32x16 (%%xmm1, %%xmm0)
- SWAP_16 (%%xmm0)
- " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
- " add $4, %0 \n\t"
- MOD_ADD ($2, %5)
-
- "4: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
- " test $1, %2 \n\t"
- " je 6f \n\t"
-
- /* FIXME, we can do aligned access of the volume values if we can guarantee
- * that the array is 16 bytes aligned, we probably have to do the odd values
- * after this then. */
- "5: \n\t" /* do samples in groups of 4 */
- " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
- " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
- SWAP_16 (%%xmm1)
- VOLUME_32x16 (%%xmm1, %%xmm0)
- SWAP_16 (%%xmm0)
- " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
- " add $8, %0 \n\t"
- MOD_ADD ($4, %5)
-
- "6: \n\t"
- " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
- " cmp $0, %2 \n\t"
- " je 8f \n\t"
-
- "7: \n\t" /* do samples in groups of 8 */
- " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
- " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
- " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
- " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
- SWAP_16_2 (%%xmm1, %%xmm3)
- VOLUME_32x16 (%%xmm1, %%xmm0)
- VOLUME_32x16 (%%xmm3, %%xmm2)
- SWAP_16_2 (%%xmm0, %%xmm2)
- " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
- " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
- " add $16, %0 \n\t"
- MOD_ADD ($8, %5)
- " dec %2 \n\t"
- " jne 7b \n\t"
- "8: \n\t"
-
- : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
- : "r" ((pa_reg_x86)channels)
- : "cc"
- );
+ pa_reg_x86 channel, temp;
+
+ /* the max number of samples we process at a time, this is also the max amount
+ * we overread the volume array, which should have enough padding. */
+ channels = MAX (8, channels);
+
+ __asm__ __volatile__ (
+ " xor %3, %3 \n\t"
+ " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */
+
+ " test $1, %2 \n\t" /* check for odd samples */
+ " je 2f \n\t"
+
+ " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */
+ " movw (%0), %w4 \n\t" /* .. | p0 | */
+ " rorw $8, %w4 \n\t"
+ " movd %4, %%xmm1 \n\t"
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */
+ " rorw $8, %w4 \n\t"
+ " movw %w4, (%0) \n\t"
+ " add $2, %0 \n\t"
+ MOD_ADD ($1, %5)
+
+ "2: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */
+ " test $1, %2 \n\t"
+ " je 4f \n\t"
+
+ "3: \n\t" /* do samples in groups of 2 */
+ " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */
+ " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */
+ SWAP_16 (%%xmm1)
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ SWAP_16 (%%xmm0)
+ " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */
+ " add $4, %0 \n\t"
+ MOD_ADD ($2, %5)
+
+ "4: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */
+ " test $1, %2 \n\t"
+ " je 6f \n\t"
+
+ /* FIXME, we can do aligned access of the volume values if we can guarantee
+ * that the array is 16 bytes aligned, we probably have to do the odd values
+ * after this then. */
+ "5: \n\t" /* do samples in groups of 4 */
+ " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
+ " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
+ SWAP_16 (%%xmm1)
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ SWAP_16 (%%xmm0)
+ " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
+ " add $8, %0 \n\t"
+ MOD_ADD ($4, %5)
+
+ "6: \n\t"
+ " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */
+ " cmp $0, %2 \n\t"
+ " je 8f \n\t"
+
+ "7: \n\t" /* do samples in groups of 8 */
+ " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */
+ " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */
+ " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */
+ " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */
+ SWAP_16_2 (%%xmm1, %%xmm3)
+ VOLUME_32x16 (%%xmm1, %%xmm0)
+ VOLUME_32x16 (%%xmm3, %%xmm2)
+ SWAP_16_2 (%%xmm0, %%xmm2)
+ " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */
+ " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */
+ " add $16, %0 \n\t"
+ MOD_ADD ($8, %5)
+ " dec %2 \n\t"
+ " jne 7b \n\t"
+ "8: \n\t"
+
+ : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp)
+ : "r" ((pa_reg_x86)channels)
+ : "cc"
+ );
}
#undef RUN_TEST
@@ -251,64 +251,64 @@ pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsi
#define PADDING 16
static void run_test (void) {
- int16_t samples[SAMPLES];
- int16_t samples_ref[SAMPLES];
- int16_t samples_orig[SAMPLES];
- int32_t volumes[CHANNELS + PADDING];
- int i, j, padding;
- pa_do_volume_func_t func;
- struct timeval start, stop;
-
- func = pa_get_volume_func (PA_SAMPLE_S16NE);
-
- printf ("checking SSE %zd\n", sizeof (samples));
-
- pa_random (samples, sizeof (samples));
- memcpy (samples_ref, samples, sizeof (samples));
- memcpy (samples_orig, samples, sizeof (samples));
-
- for (i = 0; i < CHANNELS; i++)
- volumes[i] = rand() >> 1;
- for (padding = 0; padding < PADDING; padding++, i++)
- volumes[i] = volumes[padding];
-
- func (samples_ref, volumes, CHANNELS, sizeof (samples));
- pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
- for (i = 0; i < SAMPLES; i++) {
- if (samples[i] != samples_ref[i]) {
- printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
- samples_orig[i], volumes[i % CHANNELS]);
- }
- }
+ int16_t samples[SAMPLES];
+ int16_t samples_ref[SAMPLES];
+ int16_t samples_orig[SAMPLES];
+ int32_t volumes[CHANNELS + PADDING];
+ int i, j, padding;
+ pa_do_volume_func_t func;
+ struct timeval start, stop;
- pa_gettimeofday(&start);
- for (j = 0; j < TIMES; j++) {
- memcpy (samples, samples_orig, sizeof (samples));
- pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
- }
- pa_gettimeofday(&stop);
- pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+ func = pa_get_volume_func (PA_SAMPLE_S16NE);
+
+ printf ("checking SSE %zd\n", sizeof (samples));
+
+ pa_random (samples, sizeof (samples));
+ memcpy (samples_ref, samples, sizeof (samples));
+ memcpy (samples_orig, samples, sizeof (samples));
+
+ for (i = 0; i < CHANNELS; i++)
+ volumes[i] = rand() >> 1;
+ for (padding = 0; padding < PADDING; padding++, i++)
+ volumes[i] = volumes[padding];
- pa_gettimeofday(&start);
- for (j = 0; j < TIMES; j++) {
- memcpy (samples_ref, samples_orig, sizeof (samples));
func (samples_ref, volumes, CHANNELS, sizeof (samples));
- }
- pa_gettimeofday(&stop);
- pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+ pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
+ for (i = 0; i < SAMPLES; i++) {
+ if (samples[i] != samples_ref[i]) {
+ printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i],
+ samples_orig[i], volumes[i % CHANNELS]);
+ }
+ }
+
+ pa_gettimeofday(&start);
+ for (j = 0; j < TIMES; j++) {
+ memcpy (samples, samples_orig, sizeof (samples));
+ pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples));
+ }
+ pa_gettimeofday(&stop);
+ pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
+
+ pa_gettimeofday(&start);
+ for (j = 0; j < TIMES; j++) {
+ memcpy (samples_ref, samples_orig, sizeof (samples));
+ func (samples_ref, volumes, CHANNELS, sizeof (samples));
+ }
+ pa_gettimeofday(&stop);
+ pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));
}
#endif
#endif /* defined (__i386__) || defined (__amd64__) */
void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) {
#if defined (__i386__) || defined (__amd64__)
- pa_log_info("Initialising SSE optimized functions.");
+ pa_log_info("Initialising SSE optimized functions.");
#ifdef RUN_TEST
- run_test ();
+ run_test ();
#endif
- pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse);
- pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse);
+ pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse);
+ pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse);
#endif /* defined (__i386__) || defined (__amd64__) */
}