From f09b51198f43d79b22cb92b5223d01a7ab339d9f Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Thu, 20 Aug 2009 10:56:20 +0200 Subject: whitespace fixes --- src/pulsecore/cpu-arm.c | 32 ++-- src/pulsecore/cpu-x86.c | 49 +++--- src/pulsecore/resampler.c | 65 ++++--- src/pulsecore/sample-util.c | 11 +- src/pulsecore/svolume_arm.c | 242 +++++++++++++------------- src/pulsecore/svolume_c.c | 330 +++++++++++++++++------------------ src/pulsecore/svolume_mmx.c | 366 +++++++++++++++++++-------------------- src/pulsecore/svolume_sse.c | 410 ++++++++++++++++++++++---------------------- 8 files changed, 765 insertions(+), 740 deletions(-) (limited to 'src/pulsecore') diff --git a/src/pulsecore/cpu-arm.c b/src/pulsecore/cpu-arm.c index 93ad3891..5a994b71 100644 --- a/src/pulsecore/cpu-arm.c +++ b/src/pulsecore/cpu-arm.c @@ -36,14 +36,14 @@ #if defined (__arm__) && defined (__linux__) -#define MAX_BUFFER 4096 +#define MAX_BUFFER 4096 static char * get_cpuinfo_line (char *cpuinfo, const char *tag) { char *line, *end, *colon; if (!(line = strstr (cpuinfo, tag))) return NULL; - + if (!(end = strchr (line, '\n'))) return NULL; @@ -106,20 +106,20 @@ void pa_cpu_init_arm (void) { } /* get the CPU features */ if ((line = get_cpuinfo_line (cpuinfo, "Features"))) { - char *state = NULL, *current; - - while ((current = pa_split_spaces (line, &state))) { - if (!strcmp (current, "vfp")) - flags |= PA_CPU_ARM_VFP; - else if (!strcmp (current, "edsp")) - flags |= PA_CPU_ARM_EDSP; - else if (!strcmp (current, "neon")) - flags |= PA_CPU_ARM_NEON; - else if (!strcmp (current, "vfpv3")) - flags |= PA_CPU_ARM_VFPV3; - - free (current); - } + char *state = NULL, *current; + + while ((current = pa_split_spaces (line, &state))) { + if (!strcmp (current, "vfp")) + flags |= PA_CPU_ARM_VFP; + else if (!strcmp (current, "edsp")) + flags |= PA_CPU_ARM_EDSP; + else if (!strcmp (current, "neon")) + flags |= PA_CPU_ARM_NEON; + else if (!strcmp (current, "vfpv3")) + flags |= PA_CPU_ARM_VFPV3; + + free (current); + } } free (cpuinfo); diff --git a/src/pulsecore/cpu-x86.c b/src/pulsecore/cpu-x86.c index 453ecf5b..0457199d 100644 --- a/src/pulsecore/cpu-x86.c +++ b/src/pulsecore/cpu-x86.c @@ -2,7 +2,7 @@ This file is part of PulseAudio. Copyright 2004-2006 Lennart Poettering - Copyright 2009 Wim Taymans + Copyright 2009 Wim Taymans PulseAudio is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published @@ -34,14 +34,15 @@ static void get_cpuid (uint32_t op, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d) { - __asm__ __volatile__ ( - " push %%"PA_REG_b" \n\t" - " cpuid \n\t" - " mov %%ebx, %%esi \n\t" - " pop %%"PA_REG_b" \n\t" - - : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d) - : "0" (op)); + __asm__ __volatile__ ( + " push %%"PA_REG_b" \n\t" + " cpuid \n\t" + " mov %%ebx, %%esi \n\t" + " pop %%"PA_REG_b" \n\t" + + : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d) + : "0" (op) + ); } #endif @@ -97,23 +98,23 @@ void pa_cpu_init_x86 (void) { } pa_log_info ("CPU flags: %s%s%s%s%s%s%s%s%s%s", - (flags & PA_CPU_X86_MMX) ? "MMX " : "", - (flags & PA_CPU_X86_SSE) ? "SSE " : "", - (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "", - (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "", - (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "", - (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "", - (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "", - (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "", - (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "", - (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : ""); + (flags & PA_CPU_X86_MMX) ? "MMX " : "", + (flags & PA_CPU_X86_SSE) ? "SSE " : "", + (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "", + (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "", + (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "", + (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "", + (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "", + (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "", + (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "", + (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : ""); /* activate various optimisations */ - if (flags & PA_CPU_X86_MMX) { + if (flags & PA_CPU_X86_MMX) pa_volume_func_init_mmx (flags); - } - if (flags & PA_CPU_X86_SSE) { - pa_volume_func_init_sse (flags); - } + + if (flags & PA_CPU_X86_SSE) + pa_volume_func_init_sse (flags); + #endif /* defined (__i386__) || defined (__amd64__) */ } diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c index 43771dc8..5a6c398e 100644 --- a/src/pulsecore/resampler.c +++ b/src/pulsecore/resampler.c @@ -1065,30 +1065,53 @@ static pa_memchunk* convert_to_work_format(pa_resampler *r, pa_memchunk *input) } static void remap_mono_to_stereo(pa_resampler *r, void *dst, const void *src, unsigned n) { - + unsigned i; + switch (r->work_format) { case PA_SAMPLE_FLOAT32NE: { float *d, *s; - d = (float *) dst; - s = (float *) src; + d = (float *) dst; + s = (float *) src; - for (; n > 0; n--, s++, d += 2) - d[0] = d[1] = *s; - break; - } + for (i = n >> 2; i; i--) { + d[0] = d[1] = s[0]; + d[2] = d[3] = s[1]; + d[4] = d[5] = s[2]; + d[6] = d[7] = s[3]; + s += 4; + d += 8; + } + for (i = n & 3; i; i--) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } case PA_SAMPLE_S16NE: { int16_t *d, *s; - d = (int16_t *) dst; - s = (int16_t *) src; + d = (int16_t *) dst; + s = (int16_t *) src; - for (; n > 0; n--, s++, d += 2) - d[0] = d[1] = *s; - break; - } + for (i = n >> 2; i; i--) { + d[0] = d[1] = s[0]; + d[2] = d[3] = s[1]; + d[4] = d[5] = s[2]; + d[6] = d[7] = s[3]; + s += 4; + d += 8; + } + for (i = n & 3; i; i--) { + d[0] = d[1] = s[0]; + s++; + d += 2; + } + break; + } default: pa_assert_not_reached(); } @@ -1114,7 +1137,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src, for (ic = 0; ic < n_ic; ic++) { float vol; - vol = r->map_table_f[oc][ic]; + vol = r->map_table_f[oc][ic]; if (vol <= 0.0) continue; @@ -1122,18 +1145,18 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src, d = (float *)dst + oc; s = (float *)src + ic; - if (vol >= 1.0) { + if (vol >= 1.0) { for (i = n; i > 0; i--, s += n_ic, d += n_oc) *d += *s; - } else { + } else { for (i = n; i > 0; i--, s += n_ic, d += n_oc) *d += *s * vol; - } + } } } break; - } + } case PA_SAMPLE_S16NE: { int16_t *d, *s; @@ -1144,7 +1167,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src, for (ic = 0; ic < n_ic; ic++) { int32_t vol; - vol = r->map_table_i[oc][ic]; + vol = r->map_table_i[oc][ic]; if (vol <= 0) continue; @@ -1158,11 +1181,11 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src, } else { for (i = n; i > 0; i--, s += n_ic, d += n_oc) *d += (int16_t) (((int32_t)*s * vol) >> 16); - } + } } } break; - } + } default: pa_assert_not_reached(); } diff --git a/src/pulsecore/sample-util.c b/src/pulsecore/sample-util.c index 677f914a..6e97e5a9 100644 --- a/src/pulsecore/sample-util.c +++ b/src/pulsecore/sample-util.c @@ -752,12 +752,13 @@ void pa_volume_memchunk( return; } - ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index; - do_volume = pa_get_volume_func (spec->format); pa_assert(do_volume); - + calc_volume_table[spec->format] ((void *)linear, volume); + + ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index; + do_volume (ptr, (void *)linear, spec->channels, c->length); pa_memblock_release(c->memblock); @@ -944,12 +945,12 @@ void pa_sample_clamp(pa_sample_format_t format, void *dst, size_t dstr, const vo for (; n > 0; n--) { float f; - f = *s; + f = *s; *d = PA_CLAMP_UNLIKELY(f, -1.0f, 1.0f); s = (const float*) ((const uint8_t*) s + sstr); d = (float*) ((uint8_t*) d + dstr); - } + } } else { pa_assert(format == PA_SAMPLE_FLOAT32RE); diff --git a/src/pulsecore/svolume_arm.c b/src/pulsecore/svolume_arm.c index 7e25a13c..0d39d105 100644 --- a/src/pulsecore/svolume_arm.c +++ b/src/pulsecore/svolume_arm.c @@ -40,86 +40,86 @@ #define MOD_INC() \ " subs r0, r6, %2 \n\t" \ " addcs r0, %1 \n\t" \ - " movcs r6, r0 \n\t" + " movcs r6, r0 \n\t" static void pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - int32_t *ve; - - channels = MAX (4, channels); - ve = volumes + channels; - - __asm__ __volatile__ ( - " mov r6, %1 \n\t" - " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ - " tst %3, #1 \n\t" /* check for odd samples */ - " beq 2f \n\t" - - "1: \n\t" - " ldr r0, [r6], #4 \n\t" /* odd samples volumes */ - " ldrh r2, [%0] \n\t" - - " smulwb r0, r0, r2 \n\t" - " ssat r0, #16, r0 \n\t" - - " strh r0, [%0], #2 \n\t" - - MOD_INC() - - "2: \n\t" - " mov %3, %3, LSR #1 \n\t" - " tst %3, #1 \n\t" /* check for odd samples */ - " beq 4f \n\t" - - "3: \n\t" - " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */ - " ldr r0, [%0] \n\t" - - " smulwt r2, r2, r0 \n\t" - " smulwb r3, r3, r0 \n\t" - - " ssat r2, #16, r2 \n\t" - " ssat r3, #16, r3 \n\t" - - " pkhbt r0, r3, r2, LSL #16 \n\t" - " str r0, [%0], #4 \n\t" - - MOD_INC() - - "4: \n\t" - " movs %3, %3, LSR #1 \n\t" - " beq 6f \n\t" - - "5: \n\t" - " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */ - " ldrd r4, [r6], #8 \n\t" - " ldrd r0, [%0] \n\t" - - " smulwt r2, r2, r0 \n\t" - " smulwb r3, r3, r0 \n\t" - " smulwt r4, r4, r1 \n\t" - " smulwb r5, r5, r1 \n\t" - - " ssat r2, #16, r2 \n\t" - " ssat r3, #16, r3 \n\t" - " ssat r4, #16, r4 \n\t" - " ssat r5, #16, r5 \n\t" - - " pkhbt r0, r3, r2, LSL #16 \n\t" - " pkhbt r1, r5, r4, LSL #16 \n\t" - " strd r0, [%0], #8 \n\t" - - MOD_INC() - - " subs %3, %3, #1 \n\t" - " bne 5b \n\t" - "6: \n\t" - - : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length) - : - : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" - ); + int32_t *ve; + + channels = MAX (4, channels); + ve = volumes + channels; + + __asm__ __volatile__ ( + " mov r6, %1 \n\t" + " mov %3, %3, LSR #1 \n\t" /* length /= sizeof (int16_t) */ + " tst %3, #1 \n\t" /* check for odd samples */ + " beq 2f \n\t" + + "1: \n\t" + " ldr r0, [r6], #4 \n\t" /* odd samples volumes */ + " ldrh r2, [%0] \n\t" + + " smulwb r0, r0, r2 \n\t" + " ssat r0, #16, r0 \n\t" + + " strh r0, [%0], #2 \n\t" + + MOD_INC() + + "2: \n\t" + " mov %3, %3, LSR #1 \n\t" + " tst %3, #1 \n\t" /* check for odd samples */ + " beq 4f \n\t" + + "3: \n\t" + " ldrd r2, [r6], #8 \n\t" /* 2 samples at a time */ + " ldr r0, [%0] \n\t" + + " smulwt r2, r2, r0 \n\t" + " smulwb r3, r3, r0 \n\t" + + " ssat r2, #16, r2 \n\t" + " ssat r3, #16, r3 \n\t" + + " pkhbt r0, r3, r2, LSL #16 \n\t" + " str r0, [%0], #4 \n\t" + + MOD_INC() + + "4: \n\t" + " movs %3, %3, LSR #1 \n\t" + " beq 6f \n\t" + + "5: \n\t" + " ldrd r2, [r6], #8 \n\t" /* 4 samples at a time */ + " ldrd r4, [r6], #8 \n\t" + " ldrd r0, [%0] \n\t" + + " smulwt r2, r2, r0 \n\t" + " smulwb r3, r3, r0 \n\t" + " smulwt r4, r4, r1 \n\t" + " smulwb r5, r5, r1 \n\t" + + " ssat r2, #16, r2 \n\t" + " ssat r3, #16, r3 \n\t" + " ssat r4, #16, r4 \n\t" + " ssat r5, #16, r5 \n\t" + + " pkhbt r0, r3, r2, LSL #16 \n\t" + " pkhbt r1, r5, r4, LSL #16 \n\t" + " strd r0, [%0], #8 \n\t" + + MOD_INC() + + " subs %3, %3, #1 \n\t" + " bne 5b \n\t" + "6: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length) + : + : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" + ); } #undef RUN_TEST @@ -131,51 +131,51 @@ pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsi #define PADDING 16 static void run_test (void) { - int16_t samples[SAMPLES]; - int16_t samples_ref[SAMPLES]; - int16_t samples_orig[SAMPLES]; - int32_t volumes[CHANNELS + PADDING]; - int i, j, padding; - pa_do_volume_func_t func; - struct timeval start, stop; - - func = pa_get_volume_func (PA_SAMPLE_S16NE); - - printf ("checking ARM %zd\n", sizeof (samples)); - - pa_random (samples, sizeof (samples)); - memcpy (samples_ref, samples, sizeof (samples)); - memcpy (samples_orig, samples, sizeof (samples)); - - for (i = 0; i < CHANNELS; i++) - volumes[i] = rand() >> 1; - for (padding = 0; padding < PADDING; padding++, i++) - volumes[i] = volumes[padding]; - - func (samples_ref, volumes, CHANNELS, sizeof (samples)); - pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); - for (i = 0; i < SAMPLES; i++) { - if (samples[i] != samples_ref[i]) { - printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], - samples_orig[i], volumes[i % CHANNELS]); - } - } + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + int i, j, padding; + pa_do_volume_func_t func; + struct timeval start, stop; - pa_gettimeofday(&start); - for (j = 0; j < TIMES; j++) { - memcpy (samples, samples_orig, sizeof (samples)); - pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); - } - pa_gettimeofday(&stop); - pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + func = pa_get_volume_func (PA_SAMPLE_S16NE); + + printf ("checking ARM %zd\n", sizeof (samples)); + + pa_random (samples, sizeof (samples)); + memcpy (samples_ref, samples, sizeof (samples)); + memcpy (samples_orig, samples, sizeof (samples)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = rand() >> 1; + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; - pa_gettimeofday(&start); - for (j = 0; j < TIMES; j++) { - memcpy (samples_ref, samples_orig, sizeof (samples)); func (samples_ref, volumes, CHANNELS, sizeof (samples)); - } - pa_gettimeofday(&stop); - pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); + for (i = 0; i < SAMPLES; i++) { + if (samples[i] != samples_ref[i]) { + printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], + samples_orig[i], volumes[i % CHANNELS]); + } + } + + pa_gettimeofday(&start); + for (j = 0; j < TIMES; j++) { + memcpy (samples, samples_orig, sizeof (samples)); + pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); + } + pa_gettimeofday(&stop); + pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + + pa_gettimeofday(&start); + for (j = 0; j < TIMES; j++) { + memcpy (samples_ref, samples_orig, sizeof (samples)); + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + } + pa_gettimeofday(&stop); + pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); } #endif @@ -184,12 +184,12 @@ static void run_test (void) { void pa_volume_func_init_arm (pa_cpu_arm_flag_t flags) { #if defined (__arm__) - pa_log_info("Initialising ARM optimized functions."); + pa_log_info("Initialising ARM optimized functions."); #ifdef RUN_TEST - run_test (); + run_test (); #endif - pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm); + pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_arm); #endif /* defined (__arm__) */ } diff --git a/src/pulsecore/svolume_c.c b/src/pulsecore/svolume_c.c index 2148a573..5fc052b8 100644 --- a/src/pulsecore/svolume_c.c +++ b/src/pulsecore/svolume_c.c @@ -35,289 +35,289 @@ static void pa_volume_u8_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - for (channel = 0; length; length--) { - int32_t t, hi, lo; + for (channel = 0; length; length--) { + int32_t t, hi, lo; - hi = volumes[channel] >> 16; - lo = volumes[channel] & 0xFFFF; + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; - t = (int32_t) *samples - 0x80; - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F); - *samples++ = (uint8_t) (t + 0x80); + t = (int32_t) *samples - 0x80; + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F); + *samples++ = (uint8_t) (t + 0x80); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_alaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - for (channel = 0; length; length--) { - int32_t t, hi, lo; + for (channel = 0; length; length--) { + int32_t t, hi, lo; - hi = volumes[channel] >> 16; - lo = volumes[channel] & 0xFFFF; + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; - t = (int32_t) st_alaw2linear16(*samples); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3); + t = (int32_t) st_alaw2linear16(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - for (channel = 0; length; length--) { - int32_t t, hi, lo; + for (channel = 0; length; length--) { + int32_t t, hi, lo; - hi = volumes[channel] >> 16; - lo = volumes[channel] & 0xFFFF; + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; - t = (int32_t) st_ulaw2linear16(*samples); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2); + t = (int32_t) st_ulaw2linear16(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (int16_t); + length /= sizeof (int16_t); - for (channel = 0; length; length--) { - int32_t t, hi, lo; + for (channel = 0; length; length--) { + int32_t t, hi, lo; - /* Multiplying the 32bit volume factor with the 16bit - * sample might result in an 48bit value. We want to - * do without 64 bit integers and hence do the - * multiplication independantly for the HI and LO part - * of the volume. */ + /* Multiplying the 32bit volume factor with the 16bit + * sample might result in an 48bit value. We want to + * do without 64 bit integers and hence do the + * multiplication independantly for the HI and LO part + * of the volume. */ - hi = volumes[channel] >> 16; - lo = volumes[channel] & 0xFFFF; + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; - t = (int32_t)(*samples); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *samples++ = (int16_t) t; + t = (int32_t)(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = (int16_t) t; - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s16re_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (int16_t); + length /= sizeof (int16_t); - for (channel = 0; length; length--) { - int32_t t, hi, lo; + for (channel = 0; length; length--) { + int32_t t, hi, lo; - hi = volumes[channel] >> 16; - lo = volumes[channel] & 0xFFFF; + hi = volumes[channel] >> 16; + lo = volumes[channel] & 0xFFFF; - t = (int32_t) PA_INT16_SWAP(*samples); - t = ((t * lo) >> 16) + (t * hi); - t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); - *samples++ = PA_INT16_SWAP((int16_t) t); + t = (int32_t) PA_INT16_SWAP(*samples); + t = ((t * lo) >> 16) + (t * hi); + t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); + *samples++ = PA_INT16_SWAP((int16_t) t); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_float32ne_c (float *samples, float *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (float); + length /= sizeof (float); - for (channel = 0; length; length--) { - *samples++ *= volumes[channel]; + for (channel = 0; length; length--) { + *samples++ *= volumes[channel]; - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_float32re_c (float *samples, float *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (float); + length /= sizeof (float); - for (channel = 0; length; length--) { - float t; + for (channel = 0; length; length--) { + float t; - t = PA_FLOAT32_SWAP(*samples); - t *= volumes[channel]; - *samples++ = PA_FLOAT32_SWAP(t); + t = PA_FLOAT32_SWAP(*samples); + t *= volumes[channel]; + *samples++ = PA_FLOAT32_SWAP(t); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s32ne_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (int32_t); + length /= sizeof (int32_t); - for (channel = 0; length; length--) { - int64_t t; + for (channel = 0; length; length--) { + int64_t t; - t = (int64_t)(*samples); - t = (t * volumes[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *samples++ = (int32_t) t; + t = (int64_t)(*samples); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = (int32_t) t; - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s32re_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (int32_t); + length /= sizeof (int32_t); - for (channel = 0; length; length--) { - int64_t t; + for (channel = 0; length; length--) { + int64_t t; - t = (int64_t) PA_INT32_SWAP(*samples); - t = (t * volumes[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *samples++ = PA_INT32_SWAP((int32_t) t); + t = (int64_t) PA_INT32_SWAP(*samples); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = PA_INT32_SWAP((int32_t) t); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s24ne_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; - uint8_t *e; + unsigned channel; + uint8_t *e; - e = samples + length; + e = samples + length; - for (channel = 0; samples < e; samples += 3) { - int64_t t; + for (channel = 0; samples < e; samples += 3) { + int64_t t; - t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8)); - t = (t * volumes[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8); + t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s24re_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; - uint8_t *e; + unsigned channel; + uint8_t *e; - e = samples + length; + e = samples + length; - for (channel = 0; samples < e; samples += 3) { - int64_t t; + for (channel = 0; samples < e; samples += 3) { + int64_t t; - t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8)); - t = (t * volumes[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8); + t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s24_32ne_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (uint32_t); + length /= sizeof (uint32_t); - for (channel = 0; length; length--) { - int64_t t; + for (channel = 0; length; length--) { + int64_t t; - t = (int64_t) ((int32_t) (*samples << 8)); - t = (t * volumes[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *samples++ = ((uint32_t) ((int32_t) t)) >> 8; + t = (int64_t) ((int32_t) (*samples << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = ((uint32_t) ((int32_t) t)) >> 8; - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static void pa_volume_s24_32re_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - unsigned channel; + unsigned channel; - length /= sizeof (uint32_t); + length /= sizeof (uint32_t); - for (channel = 0; length; length--) { - int64_t t; + for (channel = 0; length; length--) { + int64_t t; - t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8)); - t = (t * volumes[channel]) >> 16; - t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); - *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8); + t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8)); + t = (t * volumes[channel]) >> 16; + t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); + *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8); - if (PA_UNLIKELY(++channel >= channels)) - channel = 0; - } + if (PA_UNLIKELY(++channel >= channels)) + channel = 0; + } } static pa_do_volume_func_t do_volume_table[] = { - [PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c, - [PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c, - [PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c, - [PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c, - [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c, - [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c, - [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c, - [PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c, - [PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c, - [PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c, - [PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c, - [PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c, - [PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c + [PA_SAMPLE_U8] = (pa_do_volume_func_t) pa_volume_u8_c, + [PA_SAMPLE_ALAW] = (pa_do_volume_func_t) pa_volume_alaw_c, + [PA_SAMPLE_ULAW] = (pa_do_volume_func_t) pa_volume_ulaw_c, + [PA_SAMPLE_S16NE] = (pa_do_volume_func_t) pa_volume_s16ne_c, + [PA_SAMPLE_S16RE] = (pa_do_volume_func_t) pa_volume_s16re_c, + [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c, + [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c, + [PA_SAMPLE_S32NE] = (pa_do_volume_func_t) pa_volume_s32ne_c, + [PA_SAMPLE_S32RE] = (pa_do_volume_func_t) pa_volume_s32re_c, + [PA_SAMPLE_S24NE] = (pa_do_volume_func_t) pa_volume_s24ne_c, + [PA_SAMPLE_S24RE] = (pa_do_volume_func_t) pa_volume_s24re_c, + [PA_SAMPLE_S24_32NE] = (pa_do_volume_func_t) pa_volume_s24_32ne_c, + [PA_SAMPLE_S24_32RE] = (pa_do_volume_func_t) pa_volume_s24_32re_c }; pa_do_volume_func_t pa_get_volume_func(pa_sample_format_t f) { diff --git a/src/pulsecore/svolume_mmx.c b/src/pulsecore/svolume_mmx.c index 86af76d3..7e242684 100644 --- a/src/pulsecore/svolume_mmx.c +++ b/src/pulsecore/svolume_mmx.c @@ -73,7 +73,7 @@ " add "#a", %3 \n\t" \ " mov %3, %4 \n\t" \ " sub "#b", %4 \n\t" \ - " cmovae %4, %3 \n\t" + " cmovae %4, %3 \n\t" /* swap 16 bits */ #define SWAP_16(s) \ @@ -96,147 +96,147 @@ static void pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - pa_reg_x86 channel, temp; - - /* the max number of samples we process at a time, this is also the max amount - * we overread the volume array, which should have enough padding. */ - channels = MAX (4, channels); - - __asm__ __volatile__ ( - " xor %3, %3 \n\t" - " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ - " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ - " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ - " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ - " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ - - " test $1, %2 \n\t" /* check for odd samples */ - " je 2f \n\t" - - " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ - " movw (%0), %w4 \n\t" /* .. | p0 | */ - " movd %4, %%mm1 \n\t" - VOLUME_32x16 (%%mm1, %%mm0) - " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ - " movw %w4, (%0) \n\t" - " add $2, %0 \n\t" - MOD_ADD ($1, %5) - - "2: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ - " test $1, %2 \n\t" /* check for odd samples */ - " je 4f \n\t" - - "3: \n\t" /* do samples in groups of 2 */ - " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ - " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ - VOLUME_32x16 (%%mm1, %%mm0) - " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ - " add $4, %0 \n\t" - MOD_ADD ($2, %5) - - "4: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ - " cmp $0, %2 \n\t" - " je 6f \n\t" - - "5: \n\t" /* do samples in groups of 4 */ - " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ - " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ - " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ - " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ - VOLUME_32x16 (%%mm1, %%mm0) - VOLUME_32x16 (%%mm3, %%mm2) - " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ - " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ - " add $8, %0 \n\t" - MOD_ADD ($4, %5) - " dec %2 \n\t" - " jne 5b \n\t" - - "6: \n\t" - " emms \n\t" - - : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) - : "r" ((pa_reg_x86)channels) - : "cc" - ); + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = MAX (4, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ + " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ + " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ + " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " movd %4, %%mm1 \n\t" + VOLUME_32x16 (%%mm1, %%mm0) + " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" /* check for odd samples */ + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + VOLUME_32x16 (%%mm1, %%mm0) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " cmp $0, %2 \n\t" + " je 6f \n\t" + + "5: \n\t" /* do samples in groups of 4 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ + VOLUME_32x16 (%%mm1, %%mm0) + VOLUME_32x16 (%%mm3, %%mm2) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + " dec %2 \n\t" + " jne 5b \n\t" + + "6: \n\t" + " emms \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); } static void pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - pa_reg_x86 channel, temp; - - /* the max number of samples we process at a time, this is also the max amount - * we overread the volume array, which should have enough padding. */ - channels = MAX (4, channels); - - __asm__ __volatile__ ( - " xor %3, %3 \n\t" - " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ - " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ - " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ - " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ - " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ - - " test $1, %2 \n\t" /* check for odd samples */ - " je 2f \n\t" - - " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ - " movw (%0), %w4 \n\t" /* .. | p0 | */ - " rorw $8, %w4 \n\t" - " movd %4, %%mm1 \n\t" - VOLUME_32x16 (%%mm1, %%mm0) - " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ - " rorw $8, %w4 \n\t" - " movw %w4, (%0) \n\t" - " add $2, %0 \n\t" - MOD_ADD ($1, %5) - - "2: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ - " test $1, %2 \n\t" /* check for odd samples */ - " je 4f \n\t" - - "3: \n\t" /* do samples in groups of 2 */ - " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ - " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ - SWAP_16 (%%mm1) - VOLUME_32x16 (%%mm1, %%mm0) - SWAP_16 (%%mm0) - " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ - " add $4, %0 \n\t" - MOD_ADD ($2, %5) - - "4: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ - " cmp $0, %2 \n\t" - " je 6f \n\t" - - "5: \n\t" /* do samples in groups of 4 */ - " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ - " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ - " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ - " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ - SWAP_16_2 (%%mm1, %%mm3) - VOLUME_32x16 (%%mm1, %%mm0) - VOLUME_32x16 (%%mm3, %%mm2) - SWAP_16_2 (%%mm0, %%mm2) - " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ - " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ - " add $8, %0 \n\t" - MOD_ADD ($4, %5) - " dec %2 \n\t" - " jne 5b \n\t" - - "6: \n\t" - " emms \n\t" - - : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) - : "r" ((pa_reg_x86)channels) - : "cc" - ); + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = MAX (4, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + " pcmpeqw %%mm6, %%mm6 \n\t" /* .. | ffff | ffff | */ + " pcmpeqw %%mm7, %%mm7 \n\t" /* .. | ffff | ffff | */ + " pslld $16, %%mm6 \n\t" /* .. | ffff | 0 | */ + " psrld $31, %%mm7 \n\t" /* .. | 0 | 1 | */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%mm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " rorw $8, %w4 \n\t" + " movd %4, %%mm1 \n\t" + VOLUME_32x16 (%%mm1, %%mm0) + " movd %%mm0, %4 \n\t" /* .. | p0*v0 | */ + " rorw $8, %w4 \n\t" + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" /* check for odd samples */ + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + SWAP_16 (%%mm1) + VOLUME_32x16 (%%mm1, %%mm0) + SWAP_16 (%%mm0) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " cmp $0, %2 \n\t" + " je 6f \n\t" + + "5: \n\t" /* do samples in groups of 4 */ + " movq (%1, %3, 4), %%mm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movq 8(%1, %3, 4), %%mm2 \n\t" /* | v3h | v3l | v2h | v2l | */ + " movd (%0), %%mm1 \n\t" /* .. | p1 | p0 | */ + " movd 4(%0), %%mm3 \n\t" /* .. | p3 | p2 | */ + SWAP_16_2 (%%mm1, %%mm3) + VOLUME_32x16 (%%mm1, %%mm0) + VOLUME_32x16 (%%mm3, %%mm2) + SWAP_16_2 (%%mm0, %%mm2) + " movd %%mm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " movd %%mm2, 4(%0) \n\t" /* .. | p3*v3 | p2*v2 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + " dec %2 \n\t" + " jne 5b \n\t" + + "6: \n\t" + " emms \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); } #undef RUN_TEST @@ -248,51 +248,51 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi #define PADDING 16 static void run_test (void) { - int16_t samples[SAMPLES]; - int16_t samples_ref[SAMPLES]; - int16_t samples_orig[SAMPLES]; - int32_t volumes[CHANNELS + PADDING]; - int i, j, padding; - pa_do_volume_func_t func; - struct timeval start, stop; - - func = pa_get_volume_func (PA_SAMPLE_S16NE); - - printf ("checking MMX %zd\n", sizeof (samples)); - - pa_random (samples, sizeof (samples)); - memcpy (samples_ref, samples, sizeof (samples)); - memcpy (samples_orig, samples, sizeof (samples)); - - for (i = 0; i < CHANNELS; i++) - volumes[i] = rand() >> 1; - for (padding = 0; padding < PADDING; padding++, i++) - volumes[i] = volumes[padding]; - - func (samples_ref, volumes, CHANNELS, sizeof (samples)); - pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); - for (i = 0; i < SAMPLES; i++) { - if (samples[i] != samples_ref[i]) { - printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], - samples_orig[i], volumes[i % CHANNELS]); - } - } + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + int i, j, padding; + pa_do_volume_func_t func; + struct timeval start, stop; - pa_gettimeofday(&start); - for (j = 0; j < TIMES; j++) { - memcpy (samples, samples_orig, sizeof (samples)); - pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); - } - pa_gettimeofday(&stop); - pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + func = pa_get_volume_func (PA_SAMPLE_S16NE); + + printf ("checking MMX %zd\n", sizeof (samples)); + + pa_random (samples, sizeof (samples)); + memcpy (samples_ref, samples, sizeof (samples)); + memcpy (samples_orig, samples, sizeof (samples)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = rand() >> 1; + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; - pa_gettimeofday(&start); - for (j = 0; j < TIMES; j++) { - memcpy (samples_ref, samples_orig, sizeof (samples)); func (samples_ref, volumes, CHANNELS, sizeof (samples)); - } - pa_gettimeofday(&stop); - pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); + for (i = 0; i < SAMPLES; i++) { + if (samples[i] != samples_ref[i]) { + printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], + samples_orig[i], volumes[i % CHANNELS]); + } + } + + pa_gettimeofday(&start); + for (j = 0; j < TIMES; j++) { + memcpy (samples, samples_orig, sizeof (samples)); + pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); + } + pa_gettimeofday(&stop); + pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + + pa_gettimeofday(&start); + for (j = 0; j < TIMES; j++) { + memcpy (samples_ref, samples_orig, sizeof (samples)); + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + } + pa_gettimeofday(&stop); + pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); } #endif @@ -301,13 +301,13 @@ static void run_test (void) { void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) { #if defined (__i386__) || defined (__amd64__) - pa_log_info("Initialising MMX optimized functions."); + pa_log_info("Initialising MMX optimized functions."); #ifdef RUN_TEST - run_test (); + run_test (); #endif - pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx); - pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx); + pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_mmx); + pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_mmx); #endif /* defined (__i386__) || defined (__amd64__) */ } diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c index 5979f7c2..b5e3687f 100644 --- a/src/pulsecore/svolume_sse.c +++ b/src/pulsecore/svolume_sse.c @@ -48,7 +48,7 @@ " psrld $16, "#v" \n\t" /* .. | p0 | 0 | */ \ " pmaddwd %%xmm5, "#v" \n\t" /* .. | p0 * vh | */ \ " paddd "#s", "#v" \n\t" /* .. | p0 * v0 | */ \ - " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */ + " packssdw "#v", "#v" \n\t" /* .. | p1*v1 | p0*v0 | */ #define MOD_ADD(a,b) \ " add "#a", %3 \n\t" /* channel += inc */ \ @@ -77,169 +77,169 @@ static void pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - pa_reg_x86 channel, temp; - - /* the max number of samples we process at a time, this is also the max amount - * we overread the volume array, which should have enough padding. */ - channels = MAX (8, channels); - - __asm__ __volatile__ ( - " xor %3, %3 \n\t" - " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ - - " test $1, %2 \n\t" /* check for odd samples */ - " je 2f \n\t" - - " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ - " movw (%0), %w4 \n\t" /* .. | p0 | */ - " movd %4, %%xmm1 \n\t" - VOLUME_32x16 (%%xmm1, %%xmm0) - " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ - " movw %w4, (%0) \n\t" - " add $2, %0 \n\t" - MOD_ADD ($1, %5) - - "2: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ - " test $1, %2 \n\t" - " je 4f \n\t" - - "3: \n\t" /* do samples in groups of 2 */ - " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ - " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ - VOLUME_32x16 (%%xmm1, %%xmm0) - " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ - " add $4, %0 \n\t" - MOD_ADD ($2, %5) - - "4: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ - " test $1, %2 \n\t" - " je 6f \n\t" - - /* FIXME, we can do aligned access of the volume values if we can guarantee - * that the array is 16 bytes aligned, we probably have to do the odd values - * after this then. */ - "5: \n\t" /* do samples in groups of 4 */ - " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ - " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ - VOLUME_32x16 (%%xmm1, %%xmm0) - " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ - " add $8, %0 \n\t" - MOD_ADD ($4, %5) - - "6: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ - " cmp $0, %2 \n\t" - " je 8f \n\t" - - "7: \n\t" /* do samples in groups of 8 */ - " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ - " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ - " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ - " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ - VOLUME_32x16 (%%xmm1, %%xmm0) - VOLUME_32x16 (%%xmm3, %%xmm2) - " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ - " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ - " add $16, %0 \n\t" - MOD_ADD ($8, %5) - " dec %2 \n\t" - " jne 7b \n\t" - "8: \n\t" - - : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) - : "r" ((pa_reg_x86)channels) - : "cc" - ); + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = MAX (8, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " movd %4, %%xmm1 \n\t" + VOLUME_32x16 (%%xmm1, %%xmm0) + " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ + VOLUME_32x16 (%%xmm1, %%xmm0) + " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " test $1, %2 \n\t" + " je 6f \n\t" + + /* FIXME, we can do aligned access of the volume values if we can guarantee + * that the array is 16 bytes aligned, we probably have to do the odd values + * after this then. */ + "5: \n\t" /* do samples in groups of 4 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + VOLUME_32x16 (%%xmm1, %%xmm0) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + + "6: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ + " cmp $0, %2 \n\t" + " je 8f \n\t" + + "7: \n\t" /* do samples in groups of 8 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ + VOLUME_32x16 (%%xmm1, %%xmm0) + VOLUME_32x16 (%%xmm3, %%xmm2) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ + " add $16, %0 \n\t" + MOD_ADD ($8, %5) + " dec %2 \n\t" + " jne 7b \n\t" + "8: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); } static void pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length) { - pa_reg_x86 channel, temp; - - /* the max number of samples we process at a time, this is also the max amount - * we overread the volume array, which should have enough padding. */ - channels = MAX (8, channels); - - __asm__ __volatile__ ( - " xor %3, %3 \n\t" - " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ - - " test $1, %2 \n\t" /* check for odd samples */ - " je 2f \n\t" - - " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ - " movw (%0), %w4 \n\t" /* .. | p0 | */ - " rorw $8, %w4 \n\t" - " movd %4, %%xmm1 \n\t" - VOLUME_32x16 (%%xmm1, %%xmm0) - " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ - " rorw $8, %w4 \n\t" - " movw %w4, (%0) \n\t" - " add $2, %0 \n\t" - MOD_ADD ($1, %5) - - "2: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ - " test $1, %2 \n\t" - " je 4f \n\t" - - "3: \n\t" /* do samples in groups of 2 */ - " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ - " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ - SWAP_16 (%%xmm1) - VOLUME_32x16 (%%xmm1, %%xmm0) - SWAP_16 (%%xmm0) - " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ - " add $4, %0 \n\t" - MOD_ADD ($2, %5) - - "4: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ - " test $1, %2 \n\t" - " je 6f \n\t" - - /* FIXME, we can do aligned access of the volume values if we can guarantee - * that the array is 16 bytes aligned, we probably have to do the odd values - * after this then. */ - "5: \n\t" /* do samples in groups of 4 */ - " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ - " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ - SWAP_16 (%%xmm1) - VOLUME_32x16 (%%xmm1, %%xmm0) - SWAP_16 (%%xmm0) - " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ - " add $8, %0 \n\t" - MOD_ADD ($4, %5) - - "6: \n\t" - " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ - " cmp $0, %2 \n\t" - " je 8f \n\t" - - "7: \n\t" /* do samples in groups of 8 */ - " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ - " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ - " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ - " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ - SWAP_16_2 (%%xmm1, %%xmm3) - VOLUME_32x16 (%%xmm1, %%xmm0) - VOLUME_32x16 (%%xmm3, %%xmm2) - SWAP_16_2 (%%xmm0, %%xmm2) - " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ - " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ - " add $16, %0 \n\t" - MOD_ADD ($8, %5) - " dec %2 \n\t" - " jne 7b \n\t" - "8: \n\t" - - : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) - : "r" ((pa_reg_x86)channels) - : "cc" - ); + pa_reg_x86 channel, temp; + + /* the max number of samples we process at a time, this is also the max amount + * we overread the volume array, which should have enough padding. */ + channels = MAX (8, channels); + + __asm__ __volatile__ ( + " xor %3, %3 \n\t" + " sar $1, %2 \n\t" /* length /= sizeof (int16_t) */ + + " test $1, %2 \n\t" /* check for odd samples */ + " je 2f \n\t" + + " movd (%1, %3, 4), %%xmm0 \n\t" /* | v0h | v0l | */ + " movw (%0), %w4 \n\t" /* .. | p0 | */ + " rorw $8, %w4 \n\t" + " movd %4, %%xmm1 \n\t" + VOLUME_32x16 (%%xmm1, %%xmm0) + " movd %%xmm0, %4 \n\t" /* .. | p0*v0 | */ + " rorw $8, %w4 \n\t" + " movw %w4, (%0) \n\t" + " add $2, %0 \n\t" + MOD_ADD ($1, %5) + + "2: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 2 samples at a time */ + " test $1, %2 \n\t" + " je 4f \n\t" + + "3: \n\t" /* do samples in groups of 2 */ + " movq (%1, %3, 4), %%xmm0 \n\t" /* | v1h | v1l | v0h | v0l | */ + " movd (%0), %%xmm1 \n\t" /* .. | p1 | p0 | */ + SWAP_16 (%%xmm1) + VOLUME_32x16 (%%xmm1, %%xmm0) + SWAP_16 (%%xmm0) + " movd %%xmm0, (%0) \n\t" /* .. | p1*v1 | p0*v0 | */ + " add $4, %0 \n\t" + MOD_ADD ($2, %5) + + "4: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 4 samples at a time */ + " test $1, %2 \n\t" + " je 6f \n\t" + + /* FIXME, we can do aligned access of the volume values if we can guarantee + * that the array is 16 bytes aligned, we probably have to do the odd values + * after this then. */ + "5: \n\t" /* do samples in groups of 4 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + SWAP_16 (%%xmm1) + VOLUME_32x16 (%%xmm1, %%xmm0) + SWAP_16 (%%xmm0) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " add $8, %0 \n\t" + MOD_ADD ($4, %5) + + "6: \n\t" + " sar $1, %2 \n\t" /* prepare for processing 8 samples at a time */ + " cmp $0, %2 \n\t" + " je 8f \n\t" + + "7: \n\t" /* do samples in groups of 8 */ + " movdqu (%1, %3, 4), %%xmm0 \n\t" /* | v3h | v3l .. v0h | v0l | */ + " movdqu 16(%1, %3, 4), %%xmm2 \n\t" /* | v7h | v7l .. v4h | v4l | */ + " movq (%0), %%xmm1 \n\t" /* .. | p3 .. p0 | */ + " movq 8(%0), %%xmm3 \n\t" /* .. | p7 .. p4 | */ + SWAP_16_2 (%%xmm1, %%xmm3) + VOLUME_32x16 (%%xmm1, %%xmm0) + VOLUME_32x16 (%%xmm3, %%xmm2) + SWAP_16_2 (%%xmm0, %%xmm2) + " movq %%xmm0, (%0) \n\t" /* .. | p3*v3 .. p0*v0 | */ + " movq %%xmm2, 8(%0) \n\t" /* .. | p7*v7 .. p4*v4 | */ + " add $16, %0 \n\t" + MOD_ADD ($8, %5) + " dec %2 \n\t" + " jne 7b \n\t" + "8: \n\t" + + : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) + : "r" ((pa_reg_x86)channels) + : "cc" + ); } #undef RUN_TEST @@ -251,64 +251,64 @@ pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsi #define PADDING 16 static void run_test (void) { - int16_t samples[SAMPLES]; - int16_t samples_ref[SAMPLES]; - int16_t samples_orig[SAMPLES]; - int32_t volumes[CHANNELS + PADDING]; - int i, j, padding; - pa_do_volume_func_t func; - struct timeval start, stop; - - func = pa_get_volume_func (PA_SAMPLE_S16NE); - - printf ("checking SSE %zd\n", sizeof (samples)); - - pa_random (samples, sizeof (samples)); - memcpy (samples_ref, samples, sizeof (samples)); - memcpy (samples_orig, samples, sizeof (samples)); - - for (i = 0; i < CHANNELS; i++) - volumes[i] = rand() >> 1; - for (padding = 0; padding < PADDING; padding++, i++) - volumes[i] = volumes[padding]; - - func (samples_ref, volumes, CHANNELS, sizeof (samples)); - pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); - for (i = 0; i < SAMPLES; i++) { - if (samples[i] != samples_ref[i]) { - printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], - samples_orig[i], volumes[i % CHANNELS]); - } - } + int16_t samples[SAMPLES]; + int16_t samples_ref[SAMPLES]; + int16_t samples_orig[SAMPLES]; + int32_t volumes[CHANNELS + PADDING]; + int i, j, padding; + pa_do_volume_func_t func; + struct timeval start, stop; - pa_gettimeofday(&start); - for (j = 0; j < TIMES; j++) { - memcpy (samples, samples_orig, sizeof (samples)); - pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); - } - pa_gettimeofday(&stop); - pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + func = pa_get_volume_func (PA_SAMPLE_S16NE); + + printf ("checking SSE %zd\n", sizeof (samples)); + + pa_random (samples, sizeof (samples)); + memcpy (samples_ref, samples, sizeof (samples)); + memcpy (samples_orig, samples, sizeof (samples)); + + for (i = 0; i < CHANNELS; i++) + volumes[i] = rand() >> 1; + for (padding = 0; padding < PADDING; padding++, i++) + volumes[i] = volumes[padding]; - pa_gettimeofday(&start); - for (j = 0; j < TIMES; j++) { - memcpy (samples_ref, samples_orig, sizeof (samples)); func (samples_ref, volumes, CHANNELS, sizeof (samples)); - } - pa_gettimeofday(&stop); - pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); + for (i = 0; i < SAMPLES; i++) { + if (samples[i] != samples_ref[i]) { + printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], + samples_orig[i], volumes[i % CHANNELS]); + } + } + + pa_gettimeofday(&start); + for (j = 0; j < TIMES; j++) { + memcpy (samples, samples_orig, sizeof (samples)); + pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); + } + pa_gettimeofday(&stop); + pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + + pa_gettimeofday(&start); + for (j = 0; j < TIMES; j++) { + memcpy (samples_ref, samples_orig, sizeof (samples)); + func (samples_ref, volumes, CHANNELS, sizeof (samples)); + } + pa_gettimeofday(&stop); + pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); } #endif #endif /* defined (__i386__) || defined (__amd64__) */ void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) { #if defined (__i386__) || defined (__amd64__) - pa_log_info("Initialising SSE optimized functions."); + pa_log_info("Initialising SSE optimized functions."); #ifdef RUN_TEST - run_test (); + run_test (); #endif - pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse); - pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse); + pa_set_volume_func (PA_SAMPLE_S16NE, (pa_do_volume_func_t) pa_volume_s16ne_sse); + pa_set_volume_func (PA_SAMPLE_S16RE, (pa_do_volume_func_t) pa_volume_s16re_sse); #endif /* defined (__i386__) || defined (__amd64__) */ } -- cgit