diff options
Diffstat (limited to 'src')
| -rw-r--r-- | src/pulsecore/cpu-arm.c | 32 | ||||
| -rw-r--r-- | src/pulsecore/cpu-x86.c | 49 | ||||
| -rw-r--r-- | src/pulsecore/resampler.c | 65 | ||||
| -rw-r--r-- | src/pulsecore/sample-util.c | 11 | ||||
| -rw-r--r-- | src/pulsecore/svolume_arm.c | 242 | ||||
| -rw-r--r-- | src/pulsecore/svolume_c.c | 330 | ||||
| -rw-r--r-- | src/pulsecore/svolume_mmx.c | 366 | ||||
| -rw-r--r-- | src/pulsecore/svolume_sse.c | 410 | 
8 files changed, 765 insertions, 740 deletions
| diff --git a/src/pulsecore/cpu-arm.c b/src/pulsecore/cpu-arm.c index 93ad3891..5a994b71 100644 --- a/src/pulsecore/cpu-arm.c +++ b/src/pulsecore/cpu-arm.c @@ -36,14 +36,14 @@  #if defined (__arm__) && defined (__linux__) -#define MAX_BUFFER	4096 +#define MAX_BUFFER  4096  static char *  get_cpuinfo_line (char *cpuinfo, const char *tag) {      char *line, *end, *colon;      if (!(line = strstr (cpuinfo, tag)))          return NULL; -      +      if (!(end = strchr (line, '\n')))          return NULL; @@ -106,20 +106,20 @@ void pa_cpu_init_arm (void) {      }      /* get the CPU features */      if ((line = get_cpuinfo_line (cpuinfo, "Features"))) { -	  char *state = NULL, *current; - -	  while ((current = pa_split_spaces (line, &state))) { -              if (!strcmp (current, "vfp"))  -                  flags |= PA_CPU_ARM_VFP; -	      else if (!strcmp (current, "edsp"))  -                  flags |= PA_CPU_ARM_EDSP; -	      else if (!strcmp (current, "neon"))  -                  flags |= PA_CPU_ARM_NEON; -	      else if (!strcmp (current, "vfpv3"))  -                  flags |= PA_CPU_ARM_VFPV3; - -              free (current); -	  } +        char *state = NULL, *current; + +        while ((current = pa_split_spaces (line, &state))) { +            if (!strcmp (current, "vfp")) +                flags |= PA_CPU_ARM_VFP; +            else if (!strcmp (current, "edsp")) +                flags |= PA_CPU_ARM_EDSP; +            else if (!strcmp (current, "neon")) +                flags |= PA_CPU_ARM_NEON; +            else if (!strcmp (current, "vfpv3")) +                flags |= PA_CPU_ARM_VFPV3; + +            free (current); +        }      }      free (cpuinfo); diff --git a/src/pulsecore/cpu-x86.c b/src/pulsecore/cpu-x86.c index 453ecf5b..0457199d 100644 --- a/src/pulsecore/cpu-x86.c +++ b/src/pulsecore/cpu-x86.c @@ -2,7 +2,7 @@    This file is part of PulseAudio.    Copyright 2004-2006 Lennart Poettering -  Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>  +  Copyright 2009 Wim Taymans <wim.taymans@collabora.co.uk>    PulseAudio is free software; you can redistribute it and/or modify    it under the terms of the GNU Lesser General Public License as published @@ -34,14 +34,15 @@  static void  get_cpuid (uint32_t op, uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)  { -  __asm__ __volatile__ ( -      "  push %%"PA_REG_b"   \n\t" -      "  cpuid               \n\t" -      "  mov %%ebx, %%esi    \n\t" -      "  pop %%"PA_REG_b"    \n\t" - -      : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d) -      : "0" (op)); +    __asm__ __volatile__ ( +        "  push %%"PA_REG_b"   \n\t" +        "  cpuid               \n\t" +        "  mov %%ebx, %%esi    \n\t" +        "  pop %%"PA_REG_b"    \n\t" + +        : "=a" (*a), "=S" (*b), "=c" (*c), "=d" (*d) +        : "0" (op) +    );  }  #endif @@ -97,23 +98,23 @@ void pa_cpu_init_x86 (void) {      }      pa_log_info ("CPU flags: %s%s%s%s%s%s%s%s%s%s", -	  (flags & PA_CPU_X86_MMX) ? "MMX " : "", -	  (flags & PA_CPU_X86_SSE) ? "SSE " : "", -	  (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "", -	  (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "", -	  (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "", -	  (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "", -	  (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "", -	  (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "", -	  (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "", -	  (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : ""); +    (flags & PA_CPU_X86_MMX) ? "MMX " : "", +    (flags & PA_CPU_X86_SSE) ? "SSE " : "", +    (flags & PA_CPU_X86_SSE2) ? "SSE2 " : "", +    (flags & PA_CPU_X86_SSE3) ? "SSE3 " : "", +    (flags & PA_CPU_X86_SSSE3) ? "SSSE3 " : "", +    (flags & PA_CPU_X86_SSE4_1) ? "SSE4_1 " : "", +    (flags & PA_CPU_X86_SSE4_2) ? "SSE4_2 " : "", +    (flags & PA_CPU_X86_MMXEXT) ? "MMXEXT " : "", +    (flags & PA_CPU_X86_3DNOW) ? "3DNOW " : "", +    (flags & PA_CPU_X86_3DNOWEXT) ? "3DNOWEXT " : "");      /* activate various optimisations */ -    if (flags & PA_CPU_X86_MMX) { +    if (flags & PA_CPU_X86_MMX)          pa_volume_func_init_mmx (flags); -    } -    if (flags & PA_CPU_X86_SSE) { -	pa_volume_func_init_sse (flags); -    } + +    if (flags & PA_CPU_X86_SSE) +        pa_volume_func_init_sse (flags); +  #endif /* defined (__i386__) || defined (__amd64__) */  } diff --git a/src/pulsecore/resampler.c b/src/pulsecore/resampler.c index 43771dc8..5a6c398e 100644 --- a/src/pulsecore/resampler.c +++ b/src/pulsecore/resampler.c @@ -1065,30 +1065,53 @@ static pa_memchunk* convert_to_work_format(pa_resampler *r, pa_memchunk *input)  }  static void remap_mono_to_stereo(pa_resampler *r, void *dst, const void *src, unsigned n) { -   +    unsigned i; +      switch (r->work_format) {          case PA_SAMPLE_FLOAT32NE:          {              float *d, *s; -	    d = (float *) dst; -	    s = (float *) src; +            d = (float *) dst; +            s = (float *) src; -            for (; n > 0; n--, s++, d += 2) -                d[0] = d[1] = *s; -	    break; -	} +            for (i = n >> 2; i; i--) { +                d[0] = d[1] = s[0]; +                d[2] = d[3] = s[1]; +                d[4] = d[5] = s[2]; +                d[6] = d[7] = s[3]; +                s += 4; +                d += 8; +            } +            for (i = n & 3; i; i--) { +                d[0] = d[1] = s[0]; +                s++; +                d += 2; +            } +            break; +        }          case PA_SAMPLE_S16NE:          {              int16_t *d, *s; -	    d = (int16_t *) dst; -	    s = (int16_t *) src; +            d = (int16_t *) dst; +            s = (int16_t *) src; -            for (; n > 0; n--, s++, d += 2) -                d[0] = d[1] = *s; -	    break; -	} +            for (i = n >> 2; i; i--) { +                d[0] = d[1] = s[0]; +                d[2] = d[3] = s[1]; +                d[4] = d[5] = s[2]; +                d[6] = d[7] = s[3]; +                s += 4; +                d += 8; +            } +            for (i = n & 3; i; i--) { +                d[0] = d[1] = s[0]; +                s++; +                d += 2; +            } +            break; +        }          default:              pa_assert_not_reached();      } @@ -1114,7 +1137,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,                  for (ic = 0; ic < n_ic; ic++) {                      float vol; -		    vol = r->map_table_f[oc][ic]; +                    vol = r->map_table_f[oc][ic];                      if (vol <= 0.0)                          continue; @@ -1122,18 +1145,18 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,                      d = (float *)dst + oc;                      s = (float *)src + ic; -		    if (vol >= 1.0) { +                    if (vol >= 1.0) {                          for (i = n; i > 0; i--, s += n_ic, d += n_oc)                              *d += *s; -		    } else {  +                    } else {                          for (i = n; i > 0; i--, s += n_ic, d += n_oc)                              *d += *s * vol; -		    } +                    }                  }              }              break; -	} +        }          case PA_SAMPLE_S16NE:          {              int16_t *d, *s; @@ -1144,7 +1167,7 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,                  for (ic = 0; ic < n_ic; ic++) {                      int32_t vol; -		    vol = r->map_table_i[oc][ic]; +		                vol = r->map_table_i[oc][ic];                      if (vol <= 0)                          continue; @@ -1158,11 +1181,11 @@ static void remap_channels_matrix (pa_resampler *r, void *dst, const void *src,                      } else {                          for (i = n; i > 0; i--, s += n_ic, d += n_oc)                              *d += (int16_t) (((int32_t)*s * vol) >> 16); -		    } +		                }                  }              }              break; -	} +        }          default:              pa_assert_not_reached();      } diff --git a/src/pulsecore/sample-util.c b/src/pulsecore/sample-util.c index 677f914a..6e97e5a9 100644 --- a/src/pulsecore/sample-util.c +++ b/src/pulsecore/sample-util.c @@ -752,12 +752,13 @@ void pa_volume_memchunk(        return;      } -    ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index; -      do_volume = pa_get_volume_func (spec->format);      pa_assert(do_volume); -     +      calc_volume_table[spec->format] ((void *)linear, volume); + +    ptr = (uint8_t*) pa_memblock_acquire(c->memblock) + c->index; +      do_volume (ptr, (void *)linear, spec->channels, c->length);      pa_memblock_release(c->memblock); @@ -944,12 +945,12 @@ void pa_sample_clamp(pa_sample_format_t format, void *dst, size_t dstr, const vo          for (; n > 0; n--) {              float f; -	    f = *s; +            f = *s;              *d = PA_CLAMP_UNLIKELY(f, -1.0f, 1.0f);              s = (const float*) ((const uint8_t*) s + sstr);              d = (float*) ((uint8_t*) d + dstr); -	} +        }      } else {          pa_assert(format == PA_SAMPLE_FLOAT32RE); diff --git a/src/pulsecore/svolume_arm.c b/src/pulsecore/svolume_arm.c index 7e25a13c..0d39d105 100644 --- a/src/pulsecore/svolume_arm.c +++ b/src/pulsecore/svolume_arm.c @@ -40,86 +40,86 @@  #define MOD_INC() \      " subs  r0, r6, %2              \n\t" \      " addcs r0, %1                  \n\t" \ -    " movcs r6, r0                  \n\t"  +    " movcs r6, r0                  \n\t"  static void  pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  int32_t *ve; - -  channels = MAX (4, channels); -  ve = volumes + channels; - -  __asm__ __volatile__ ( -    " mov r6, %1                      \n\t" -    " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */ -    " tst %3, #1                      \n\t" /* check for odd samples */ -    " beq  2f                         \n\t" - -    "1:                               \n\t" -    " ldr  r0, [r6], #4               \n\t" /* odd samples volumes */ -    " ldrh r2, [%0]                   \n\t"  - -    " smulwb r0, r0, r2               \n\t" -    " ssat r0, #16, r0                \n\t" -     -    " strh r0, [%0], #2               \n\t" - -    MOD_INC() - -    "2:                               \n\t" -    " mov %3, %3, LSR #1              \n\t"  -    " tst %3, #1                      \n\t" /* check for odd samples */ -    " beq  4f                         \n\t"  - -    "3:                               \n\t" -    " ldrd r2, [r6], #8               \n\t" /* 2 samples at a time */ -    " ldr  r0, [%0]                   \n\t" - -    " smulwt r2, r2, r0               \n\t" -    " smulwb r3, r3, r0               \n\t" - -    " ssat r2, #16, r2                \n\t" -    " ssat r3, #16, r3                \n\t" -     -    " pkhbt r0, r3, r2, LSL #16       \n\t" -    " str  r0, [%0], #4               \n\t" - -    MOD_INC() - -    "4:                               \n\t" -    " movs %3, %3, LSR #1             \n\t"  -    " beq  6f                         \n\t"  - -    "5:                               \n\t" -    " ldrd r2, [r6], #8               \n\t" /* 4 samples at a time */ -    " ldrd r4, [r6], #8               \n\t" -    " ldrd r0, [%0]                   \n\t" - -    " smulwt r2, r2, r0               \n\t" -    " smulwb r3, r3, r0               \n\t" -    " smulwt r4, r4, r1               \n\t" -    " smulwb r5, r5, r1               \n\t" - -    " ssat r2, #16, r2                \n\t" -    " ssat r3, #16, r3                \n\t" -    " ssat r4, #16, r4                \n\t" -    " ssat r5, #16, r5                \n\t" -     -    " pkhbt r0, r3, r2, LSL #16       \n\t" -    " pkhbt r1, r5, r4, LSL #16       \n\t" -    " strd  r0, [%0], #8              \n\t" - -    MOD_INC() -     -    " subs %3, %3, #1                 \n\t" -    " bne 5b                          \n\t" -    "6:                               \n\t" - -    : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length) -    : -    : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" -  ); +    int32_t *ve; + +    channels = MAX (4, channels); +    ve = volumes + channels; + +    __asm__ __volatile__ ( +        " mov r6, %1                      \n\t" +        " mov %3, %3, LSR #1              \n\t" /* length /= sizeof (int16_t) */ +        " tst %3, #1                      \n\t" /* check for odd samples */ +        " beq  2f                         \n\t" + +        "1:                               \n\t" +        " ldr  r0, [r6], #4               \n\t" /* odd samples volumes */ +        " ldrh r2, [%0]                   \n\t" + +        " smulwb r0, r0, r2               \n\t" +        " ssat r0, #16, r0                \n\t" + +        " strh r0, [%0], #2               \n\t" + +        MOD_INC() + +        "2:                               \n\t" +        " mov %3, %3, LSR #1              \n\t" +        " tst %3, #1                      \n\t" /* check for odd samples */ +        " beq  4f                         \n\t" + +        "3:                               \n\t" +        " ldrd r2, [r6], #8               \n\t" /* 2 samples at a time */ +        " ldr  r0, [%0]                   \n\t" + +        " smulwt r2, r2, r0               \n\t" +        " smulwb r3, r3, r0               \n\t" + +        " ssat r2, #16, r2                \n\t" +        " ssat r3, #16, r3                \n\t" + +        " pkhbt r0, r3, r2, LSL #16       \n\t" +        " str  r0, [%0], #4               \n\t" + +        MOD_INC() + +        "4:                               \n\t" +        " movs %3, %3, LSR #1             \n\t" +        " beq  6f                         \n\t" + +        "5:                               \n\t" +        " ldrd r2, [r6], #8               \n\t" /* 4 samples at a time */ +        " ldrd r4, [r6], #8               \n\t" +        " ldrd r0, [%0]                   \n\t" + +        " smulwt r2, r2, r0               \n\t" +        " smulwb r3, r3, r0               \n\t" +        " smulwt r4, r4, r1               \n\t" +        " smulwb r5, r5, r1               \n\t" + +        " ssat r2, #16, r2                \n\t" +        " ssat r3, #16, r3                \n\t" +        " ssat r4, #16, r4                \n\t" +        " ssat r5, #16, r5                \n\t" + +        " pkhbt r0, r3, r2, LSL #16       \n\t" +        " pkhbt r1, r5, r4, LSL #16       \n\t" +        " strd  r0, [%0], #8              \n\t" + +        MOD_INC() + +        " subs %3, %3, #1                 \n\t" +        " bne 5b                          \n\t" +        "6:                               \n\t" + +        : "+r" (samples), "+r" (volumes), "+r" (ve), "+r" (length) +        : +        : "r6", "r5", "r4", "r3", "r2", "r1", "r0", "cc" +    );  }  #undef RUN_TEST @@ -131,51 +131,51 @@ pa_volume_s16ne_arm (int16_t *samples, int32_t *volumes, unsigned channels, unsi  #define PADDING 16  static void run_test (void) { -  int16_t samples[SAMPLES]; -  int16_t samples_ref[SAMPLES]; -  int16_t samples_orig[SAMPLES]; -  int32_t volumes[CHANNELS + PADDING]; -  int i, j, padding; -  pa_do_volume_func_t func; -  struct timeval start, stop; - -  func = pa_get_volume_func (PA_SAMPLE_S16NE); - -  printf ("checking ARM %zd\n", sizeof (samples)); - -  pa_random (samples, sizeof (samples)); -  memcpy (samples_ref, samples, sizeof (samples)); -  memcpy (samples_orig, samples, sizeof (samples)); - -  for (i = 0; i < CHANNELS; i++) -    volumes[i] = rand() >> 1; -  for (padding = 0; padding < PADDING; padding++, i++) -    volumes[i] = volumes[padding]; - -  func (samples_ref, volumes, CHANNELS, sizeof (samples)); -  pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); -  for (i = 0; i < SAMPLES; i++) { -    if (samples[i] != samples_ref[i]) { -      printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], -          samples_orig[i], volumes[i % CHANNELS]); -    } -  } +    int16_t samples[SAMPLES]; +    int16_t samples_ref[SAMPLES]; +    int16_t samples_orig[SAMPLES]; +    int32_t volumes[CHANNELS + PADDING]; +    int i, j, padding; +    pa_do_volume_func_t func; +    struct timeval start, stop; -  pa_gettimeofday(&start); -  for (j = 0; j < TIMES; j++) { -    memcpy (samples, samples_orig, sizeof (samples)); -    pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); -  } -  pa_gettimeofday(&stop); -  pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); +    func = pa_get_volume_func (PA_SAMPLE_S16NE); + +    printf ("checking ARM %zd\n", sizeof (samples)); + +    pa_random (samples, sizeof (samples)); +    memcpy (samples_ref, samples, sizeof (samples)); +    memcpy (samples_orig, samples, sizeof (samples)); + +    for (i = 0; i < CHANNELS; i++) +        volumes[i] = rand() >> 1; +    for (padding = 0; padding < PADDING; padding++, i++) +        volumes[i] = volumes[padding]; -  pa_gettimeofday(&start); -  for (j = 0; j < TIMES; j++) { -    memcpy (samples_ref, samples_orig, sizeof (samples));      func (samples_ref, volumes, CHANNELS, sizeof (samples)); -  } -  pa_gettimeofday(&stop); -  pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); +    pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); +    for (i = 0; i < SAMPLES; i++) { +        if (samples[i] != samples_ref[i]) { +            printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], +                  samples_orig[i], volumes[i % CHANNELS]); +        } +    } + +    pa_gettimeofday(&start); +    for (j = 0; j < TIMES; j++) { +        memcpy (samples, samples_orig, sizeof (samples)); +        pa_volume_s16ne_arm (samples, volumes, CHANNELS, sizeof (samples)); +    } +    pa_gettimeofday(&stop); +    pa_log_info("ARM: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + +    pa_gettimeofday(&start); +    for (j = 0; j < TIMES; j++) { +        memcpy (samples_ref, samples_orig, sizeof (samples)); +        func (samples_ref, volumes, CHANNELS, sizeof (samples)); +    } +    pa_gettimeofday(&stop); +    pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));  }  #endif @@ -184,12 +184,12 @@ static void run_test (void) {  void pa_volume_func_init_arm (pa_cpu_arm_flag_t flags) {  #if defined (__arm__) -  pa_log_info("Initialising ARM optimized functions."); +    pa_log_info("Initialising ARM optimized functions.");  #ifdef RUN_TEST -  run_test (); +    run_test ();  #endif -  pa_set_volume_func (PA_SAMPLE_S16NE,     (pa_do_volume_func_t) pa_volume_s16ne_arm); +    pa_set_volume_func (PA_SAMPLE_S16NE,     (pa_do_volume_func_t) pa_volume_s16ne_arm);  #endif /* defined (__arm__) */  } diff --git a/src/pulsecore/svolume_c.c b/src/pulsecore/svolume_c.c index 2148a573..5fc052b8 100644 --- a/src/pulsecore/svolume_c.c +++ b/src/pulsecore/svolume_c.c @@ -35,289 +35,289 @@  static void  pa_volume_u8_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  for (channel = 0; length; length--) { -    int32_t t, hi, lo; +    for (channel = 0; length; length--) { +        int32_t t, hi, lo; -    hi = volumes[channel] >> 16; -    lo = volumes[channel] & 0xFFFF; +        hi = volumes[channel] >> 16; +        lo = volumes[channel] & 0xFFFF; -    t = (int32_t) *samples - 0x80; -    t = ((t * lo) >> 16) + (t * hi); -    t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F); -    *samples++ = (uint8_t) (t + 0x80); +        t = (int32_t) *samples - 0x80; +        t = ((t * lo) >> 16) + (t * hi); +        t = PA_CLAMP_UNLIKELY(t, -0x80, 0x7F); +        *samples++ = (uint8_t) (t + 0x80); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_alaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  for (channel = 0; length; length--) { -    int32_t t, hi, lo; +    for (channel = 0; length; length--) { +        int32_t t, hi, lo; -    hi = volumes[channel] >> 16; -    lo = volumes[channel] & 0xFFFF; +        hi = volumes[channel] >> 16; +        lo = volumes[channel] & 0xFFFF; -    t = (int32_t) st_alaw2linear16(*samples); -    t = ((t * lo) >> 16) + (t * hi); -    t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); -    *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3); +        t = (int32_t) st_alaw2linear16(*samples); +        t = ((t * lo) >> 16) + (t * hi); +        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); +        *samples++ = (uint8_t) st_13linear2alaw((int16_t) t >> 3); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_ulaw_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  for (channel = 0; length; length--) { -    int32_t t, hi, lo; +    for (channel = 0; length; length--) { +        int32_t t, hi, lo; -    hi = volumes[channel] >> 16; -    lo = volumes[channel] & 0xFFFF; +        hi = volumes[channel] >> 16; +        lo = volumes[channel] & 0xFFFF; -    t = (int32_t) st_ulaw2linear16(*samples); -    t = ((t * lo) >> 16) + (t * hi); -    t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); -    *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2); +        t = (int32_t) st_ulaw2linear16(*samples); +        t = ((t * lo) >> 16) + (t * hi); +        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); +        *samples++ = (uint8_t) st_14linear2ulaw((int16_t) t >> 2); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s16ne_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (int16_t); +    length /= sizeof (int16_t); -  for (channel = 0; length; length--) { -    int32_t t, hi, lo; +    for (channel = 0; length; length--) { +        int32_t t, hi, lo; -    /* Multiplying the 32bit volume factor with the 16bit -     * sample might result in an 48bit value. We want to -     * do without 64 bit integers and hence do the -     * multiplication independantly for the HI and LO part -     * of the volume. */ +        /* Multiplying the 32bit volume factor with the 16bit +         * sample might result in an 48bit value. We want to +         * do without 64 bit integers and hence do the +         * multiplication independantly for the HI and LO part +         * of the volume. */ -    hi = volumes[channel] >> 16; -    lo = volumes[channel] & 0xFFFF; +        hi = volumes[channel] >> 16; +        lo = volumes[channel] & 0xFFFF; -    t = (int32_t)(*samples); -    t = ((t * lo) >> 16) + (t * hi); -    t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); -    *samples++ = (int16_t) t; +        t = (int32_t)(*samples); +        t = ((t * lo) >> 16) + (t * hi); +        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); +        *samples++ = (int16_t) t; -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s16re_c (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (int16_t); +    length /= sizeof (int16_t); -  for (channel = 0; length; length--) { -    int32_t t, hi, lo; +    for (channel = 0; length; length--) { +        int32_t t, hi, lo; -    hi = volumes[channel] >> 16; -    lo = volumes[channel] & 0xFFFF; +        hi = volumes[channel] >> 16; +        lo = volumes[channel] & 0xFFFF; -    t = (int32_t) PA_INT16_SWAP(*samples); -    t = ((t * lo) >> 16) + (t * hi); -    t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); -    *samples++ = PA_INT16_SWAP((int16_t) t); +        t = (int32_t) PA_INT16_SWAP(*samples); +        t = ((t * lo) >> 16) + (t * hi); +        t = PA_CLAMP_UNLIKELY(t, -0x8000, 0x7FFF); +        *samples++ = PA_INT16_SWAP((int16_t) t); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_float32ne_c (float *samples, float *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (float); +    length /= sizeof (float); -  for (channel = 0; length; length--) { -    *samples++ *= volumes[channel]; +    for (channel = 0; length; length--) { +        *samples++ *= volumes[channel]; -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_float32re_c (float *samples, float *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (float); +    length /= sizeof (float); -  for (channel = 0; length; length--) { -    float t; +    for (channel = 0; length; length--) { +        float t; -    t = PA_FLOAT32_SWAP(*samples); -    t *= volumes[channel]; -    *samples++ = PA_FLOAT32_SWAP(t); +        t = PA_FLOAT32_SWAP(*samples); +        t *= volumes[channel]; +        *samples++ = PA_FLOAT32_SWAP(t); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s32ne_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (int32_t); +    length /= sizeof (int32_t); -  for (channel = 0; length; length--) { -    int64_t t; +    for (channel = 0; length; length--) { +        int64_t t; -    t = (int64_t)(*samples); -    t = (t * volumes[channel]) >> 16; -    t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); -    *samples++ = (int32_t) t; +        t = (int64_t)(*samples); +        t = (t * volumes[channel]) >> 16; +        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); +        *samples++ = (int32_t) t; -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s32re_c (int32_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (int32_t); +    length /= sizeof (int32_t); -  for (channel = 0; length; length--) { -    int64_t t; +    for (channel = 0; length; length--) { +        int64_t t; -    t = (int64_t) PA_INT32_SWAP(*samples); -    t = (t * volumes[channel]) >> 16; -    t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); -    *samples++ = PA_INT32_SWAP((int32_t) t); +        t = (int64_t) PA_INT32_SWAP(*samples); +        t = (t * volumes[channel]) >> 16; +        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); +        *samples++ = PA_INT32_SWAP((int32_t) t); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s24ne_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; -  uint8_t *e; +    unsigned channel; +    uint8_t *e; -  e = samples + length; +    e = samples + length; -  for (channel = 0; samples < e; samples += 3) { -    int64_t t; +    for (channel = 0; samples < e; samples += 3) { +        int64_t t; -    t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8)); -    t = (t * volumes[channel]) >> 16; -    t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); -    PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8); +        t = (int64_t)((int32_t) (PA_READ24NE(samples) << 8)); +        t = (t * volumes[channel]) >> 16; +        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); +        PA_WRITE24NE(samples, ((uint32_t) (int32_t) t) >> 8); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s24re_c (uint8_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; -  uint8_t *e; +    unsigned channel; +    uint8_t *e; -  e = samples + length; +    e = samples + length; -  for (channel = 0; samples < e; samples += 3) { -    int64_t t; +    for (channel = 0; samples < e; samples += 3) { +        int64_t t; -    t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8)); -    t = (t * volumes[channel]) >> 16; -    t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); -    PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8); +        t = (int64_t)((int32_t) (PA_READ24RE(samples) << 8)); +        t = (t * volumes[channel]) >> 16; +        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); +        PA_WRITE24RE(samples, ((uint32_t) (int32_t) t) >> 8); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s24_32ne_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (uint32_t); +    length /= sizeof (uint32_t); -  for (channel = 0; length; length--) { -    int64_t t; +    for (channel = 0; length; length--) { +        int64_t t; -    t = (int64_t) ((int32_t) (*samples << 8)); -    t = (t * volumes[channel]) >> 16; -    t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); -    *samples++ = ((uint32_t) ((int32_t) t)) >> 8; +        t = (int64_t) ((int32_t) (*samples << 8)); +        t = (t * volumes[channel]) >> 16; +        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); +        *samples++ = ((uint32_t) ((int32_t) t)) >> 8; -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static void  pa_volume_s24_32re_c (uint32_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  unsigned channel; +    unsigned channel; -  length /= sizeof (uint32_t); +    length /= sizeof (uint32_t); -  for (channel = 0; length; length--) { -    int64_t t; +    for (channel = 0; length; length--) { +        int64_t t; -    t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8)); -    t = (t * volumes[channel]) >> 16; -    t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); -    *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8); +        t = (int64_t) ((int32_t) (PA_UINT32_SWAP(*samples) << 8)); +        t = (t * volumes[channel]) >> 16; +        t = PA_CLAMP_UNLIKELY(t, -0x80000000LL, 0x7FFFFFFFLL); +        *samples++ = PA_UINT32_SWAP(((uint32_t) ((int32_t) t)) >> 8); -    if (PA_UNLIKELY(++channel >= channels)) -      channel = 0; -  } +        if (PA_UNLIKELY(++channel >= channels)) +            channel = 0; +    }  }  static pa_do_volume_func_t do_volume_table[] =  { -  [PA_SAMPLE_U8]        = (pa_do_volume_func_t) pa_volume_u8_c, -  [PA_SAMPLE_ALAW]      = (pa_do_volume_func_t) pa_volume_alaw_c, -  [PA_SAMPLE_ULAW]      = (pa_do_volume_func_t) pa_volume_ulaw_c, -  [PA_SAMPLE_S16NE]     = (pa_do_volume_func_t) pa_volume_s16ne_c, -  [PA_SAMPLE_S16RE]     = (pa_do_volume_func_t) pa_volume_s16re_c, -  [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c, -  [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c, -  [PA_SAMPLE_S32NE]     = (pa_do_volume_func_t) pa_volume_s32ne_c, -  [PA_SAMPLE_S32RE]     = (pa_do_volume_func_t) pa_volume_s32re_c, -  [PA_SAMPLE_S24NE]     = (pa_do_volume_func_t) pa_volume_s24ne_c, -  [PA_SAMPLE_S24RE]     = (pa_do_volume_func_t) pa_volume_s24re_c, -  [PA_SAMPLE_S24_32NE]  = (pa_do_volume_func_t) pa_volume_s24_32ne_c, -  [PA_SAMPLE_S24_32RE]  = (pa_do_volume_func_t) pa_volume_s24_32re_c +    [PA_SAMPLE_U8]        = (pa_do_volume_func_t) pa_volume_u8_c, +    [PA_SAMPLE_ALAW]      = (pa_do_volume_func_t) pa_volume_alaw_c, +    [PA_SAMPLE_ULAW]      = (pa_do_volume_func_t) pa_volume_ulaw_c, +    [PA_SAMPLE_S16NE]     = (pa_do_volume_func_t) pa_volume_s16ne_c, +    [PA_SAMPLE_S16RE]     = (pa_do_volume_func_t) pa_volume_s16re_c, +    [PA_SAMPLE_FLOAT32NE] = (pa_do_volume_func_t) pa_volume_float32ne_c, +    [PA_SAMPLE_FLOAT32RE] = (pa_do_volume_func_t) pa_volume_float32re_c, +    [PA_SAMPLE_S32NE]     = (pa_do_volume_func_t) pa_volume_s32ne_c, +    [PA_SAMPLE_S32RE]     = (pa_do_volume_func_t) pa_volume_s32re_c, +    [PA_SAMPLE_S24NE]     = (pa_do_volume_func_t) pa_volume_s24ne_c, +    [PA_SAMPLE_S24RE]     = (pa_do_volume_func_t) pa_volume_s24re_c, +    [PA_SAMPLE_S24_32NE]  = (pa_do_volume_func_t) pa_volume_s24_32ne_c, +    [PA_SAMPLE_S24_32RE]  = (pa_do_volume_func_t) pa_volume_s24_32re_c  };  pa_do_volume_func_t pa_get_volume_func(pa_sample_format_t f) { diff --git a/src/pulsecore/svolume_mmx.c b/src/pulsecore/svolume_mmx.c index 86af76d3..7e242684 100644 --- a/src/pulsecore/svolume_mmx.c +++ b/src/pulsecore/svolume_mmx.c @@ -73,7 +73,7 @@        " add "#a", %3                 \n\t" \        " mov %3, %4                   \n\t" \        " sub "#b", %4                 \n\t" \ -      " cmovae %4, %3                \n\t"  +      " cmovae %4, %3                \n\t"  /* swap 16 bits */  #define SWAP_16(s) \ @@ -96,147 +96,147 @@  static void  pa_volume_s16ne_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  pa_reg_x86 channel, temp; - -  /* the max number of samples we process at a time, this is also the max amount -   * we overread the volume array, which should have enough padding. */ -  channels = MAX (4, channels); - -  __asm__ __volatile__ ( -    " xor %3, %3                    \n\t" -    " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ -    " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */ -    " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */ -    " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */ -    " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */ - -    " test $1, %2                   \n\t" /* check for odd samples */ -    " je 2f                         \n\t"  - -    " movd (%1, %3, 4), %%mm0       \n\t" /* |  v0h  |  v0l  | */ -    " movw (%0), %w4                \n\t" /*     ..  |  p0   | */ -    " movd %4, %%mm1                \n\t"  -    VOLUME_32x16 (%%mm1, %%mm0) -    " movd %%mm0, %4                \n\t" /*     ..  | p0*v0 | */ -    " movw %w4, (%0)                \n\t"  -    " add $2, %0                    \n\t" -    MOD_ADD ($1, %5) - -    "2:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ -    " test $1, %2                   \n\t" /* check for odd samples */ -    " je 4f                         \n\t"  - -    "3:                             \n\t" /* do samples in groups of 2 */ -    " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ -    " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */  -    VOLUME_32x16 (%%mm1, %%mm0) -    " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ -    " add $4, %0                    \n\t" -    MOD_ADD ($2, %5) - -    "4:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ -    " cmp $0, %2                    \n\t" -    " je 6f                         \n\t" - -    "5:                             \n\t" /* do samples in groups of 4 */ -    " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */  -    " movq 8(%1, %3, 4), %%mm2      \n\t" /* |  v3h  |  v3l  |  v2h  |  v2l  | */ -    " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */ -    " movd 4(%0), %%mm3             \n\t" /*              .. |   p3  |  p2   | */ -    VOLUME_32x16 (%%mm1, %%mm0) -    VOLUME_32x16 (%%mm3, %%mm2) -    " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ -    " movd %%mm2, 4(%0)             \n\t" /*              .. | p3*v3 | p2*v2 | */ -    " add $8, %0                    \n\t" -    MOD_ADD ($4, %5) -    " dec %2                        \n\t" -    " jne 5b                        \n\t" - -    "6:                             \n\t" -    " emms                          \n\t" - -    : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) -    : "r" ((pa_reg_x86)channels) -    : "cc" -  ); +    pa_reg_x86 channel, temp; + +    /* the max number of samples we process at a time, this is also the max amount +     * we overread the volume array, which should have enough padding. */ +    channels = MAX (4, channels); + +    __asm__ __volatile__ ( +        " xor %3, %3                    \n\t" +        " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ +        " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */ +        " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */ +        " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */ +        " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */ + +        " test $1, %2                   \n\t" /* check for odd samples */ +        " je 2f                         \n\t" + +        " movd (%1, %3, 4), %%mm0       \n\t" /* |  v0h  |  v0l  | */ +        " movw (%0), %w4                \n\t" /*     ..  |  p0   | */ +        " movd %4, %%mm1                \n\t" +        VOLUME_32x16 (%%mm1, %%mm0) +        " movd %%mm0, %4                \n\t" /*     ..  | p0*v0 | */ +        " movw %w4, (%0)                \n\t" +        " add $2, %0                    \n\t" +        MOD_ADD ($1, %5) + +        "2:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ +        " test $1, %2                   \n\t" /* check for odd samples */ +        " je 4f                         \n\t" + +        "3:                             \n\t" /* do samples in groups of 2 */ +        " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ +        " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */ +        VOLUME_32x16 (%%mm1, %%mm0) +        " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ +        " add $4, %0                    \n\t" +        MOD_ADD ($2, %5) + +        "4:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ +        " cmp $0, %2                    \n\t" +        " je 6f                         \n\t" + +        "5:                             \n\t" /* do samples in groups of 4 */ +        " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ +        " movq 8(%1, %3, 4), %%mm2      \n\t" /* |  v3h  |  v3l  |  v2h  |  v2l  | */ +        " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */ +        " movd 4(%0), %%mm3             \n\t" /*              .. |   p3  |  p2   | */ +        VOLUME_32x16 (%%mm1, %%mm0) +        VOLUME_32x16 (%%mm3, %%mm2) +        " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ +        " movd %%mm2, 4(%0)             \n\t" /*              .. | p3*v3 | p2*v2 | */ +        " add $8, %0                    \n\t" +        MOD_ADD ($4, %5) +        " dec %2                        \n\t" +        " jne 5b                        \n\t" + +        "6:                             \n\t" +        " emms                          \n\t" + +        : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) +        : "r" ((pa_reg_x86)channels) +        : "cc" +    );  }  static void  pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  pa_reg_x86 channel, temp; - -  /* the max number of samples we process at a time, this is also the max amount -   * we overread the volume array, which should have enough padding. */ -  channels = MAX (4, channels); - -  __asm__ __volatile__ ( -    " xor %3, %3                    \n\t" -    " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ -    " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */ -    " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */ -    " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */ -    " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */ - -    " test $1, %2                   \n\t" /* check for odd samples */ -    " je 2f                         \n\t"  - -    " movd (%1, %3, 4), %%mm0       \n\t" /* |  v0h  |  v0l  | */ -    " movw (%0), %w4                \n\t" /*     ..  |  p0   | */ -    " rorw $8, %w4                  \n\t" -    " movd %4, %%mm1                \n\t"  -    VOLUME_32x16 (%%mm1, %%mm0) -    " movd %%mm0, %4                \n\t" /*     ..  | p0*v0 | */ -    " rorw $8, %w4                  \n\t" -    " movw %w4, (%0)                \n\t"  -    " add $2, %0                    \n\t" -    MOD_ADD ($1, %5) - -    "2:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ -    " test $1, %2                   \n\t" /* check for odd samples */ -    " je 4f                         \n\t"  - -    "3:                             \n\t" /* do samples in groups of 2 */ -    " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ -    " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */  -    SWAP_16 (%%mm1) -    VOLUME_32x16 (%%mm1, %%mm0) -    SWAP_16 (%%mm0) -    " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ -    " add $4, %0                    \n\t" -    MOD_ADD ($2, %5) - -    "4:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ -    " cmp $0, %2                    \n\t" -    " je 6f                         \n\t" - -    "5:                             \n\t" /* do samples in groups of 4 */ -    " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */  -    " movq 8(%1, %3, 4), %%mm2      \n\t" /* |  v3h  |  v3l  |  v2h  |  v2l  | */ -    " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */ -    " movd 4(%0), %%mm3             \n\t" /*              .. |   p3  |  p2   | */ -    SWAP_16_2 (%%mm1, %%mm3) -    VOLUME_32x16 (%%mm1, %%mm0) -    VOLUME_32x16 (%%mm3, %%mm2) -    SWAP_16_2 (%%mm0, %%mm2) -    " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ -    " movd %%mm2, 4(%0)             \n\t" /*              .. | p3*v3 | p2*v2 | */ -    " add $8, %0                    \n\t" -    MOD_ADD ($4, %5) -    " dec %2                        \n\t" -    " jne 5b                        \n\t" - -    "6:                             \n\t" -    " emms                          \n\t" - -    : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) -    : "r" ((pa_reg_x86)channels) -    : "cc" -  ); +    pa_reg_x86 channel, temp; + +    /* the max number of samples we process at a time, this is also the max amount +     * we overread the volume array, which should have enough padding. */ +    channels = MAX (4, channels); + +    __asm__ __volatile__ ( +        " xor %3, %3                    \n\t" +        " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ +        " pcmpeqw %%mm6, %%mm6          \n\t" /* .. |  ffff |  ffff | */ +        " pcmpeqw %%mm7, %%mm7          \n\t" /* .. |  ffff |  ffff | */ +        " pslld  $16, %%mm6             \n\t" /* .. |  ffff |     0 | */ +        " psrld  $31, %%mm7             \n\t" /* .. |     0 |     1 | */ + +        " test $1, %2                   \n\t" /* check for odd samples */ +        " je 2f                         \n\t" + +        " movd (%1, %3, 4), %%mm0       \n\t" /* |  v0h  |  v0l  | */ +        " movw (%0), %w4                \n\t" /*     ..  |  p0   | */ +        " rorw $8, %w4                  \n\t" +        " movd %4, %%mm1                \n\t" +        VOLUME_32x16 (%%mm1, %%mm0) +        " movd %%mm0, %4                \n\t" /*     ..  | p0*v0 | */ +        " rorw $8, %w4                  \n\t" +        " movw %w4, (%0)                \n\t" +        " add $2, %0                    \n\t" +        MOD_ADD ($1, %5) + +        "2:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ +        " test $1, %2                   \n\t" /* check for odd samples */ +        " je 4f                         \n\t" + +        "3:                             \n\t" /* do samples in groups of 2 */ +        " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ +        " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */ +        SWAP_16 (%%mm1) +        VOLUME_32x16 (%%mm1, %%mm0) +        SWAP_16 (%%mm0) +        " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ +        " add $4, %0                    \n\t" +        MOD_ADD ($2, %5) + +        "4:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ +        " cmp $0, %2                    \n\t" +        " je 6f                         \n\t" + +        "5:                             \n\t" /* do samples in groups of 4 */ +        " movq (%1, %3, 4), %%mm0       \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ +        " movq 8(%1, %3, 4), %%mm2      \n\t" /* |  v3h  |  v3l  |  v2h  |  v2l  | */ +        " movd (%0), %%mm1              \n\t" /*              .. |   p1  |  p0   | */ +        " movd 4(%0), %%mm3             \n\t" /*              .. |   p3  |  p2   | */ +        SWAP_16_2 (%%mm1, %%mm3) +        VOLUME_32x16 (%%mm1, %%mm0) +        VOLUME_32x16 (%%mm3, %%mm2) +        SWAP_16_2 (%%mm0, %%mm2) +        " movd %%mm0, (%0)              \n\t" /*              .. | p1*v1 | p0*v0 | */ +        " movd %%mm2, 4(%0)             \n\t" /*              .. | p3*v3 | p2*v2 | */ +        " add $8, %0                    \n\t" +        MOD_ADD ($4, %5) +        " dec %2                        \n\t" +        " jne 5b                        \n\t" + +        "6:                             \n\t" +        " emms                          \n\t" + +        : "+r" (samples), "+r" (volumes), "+r" (length), "=D" ((pa_reg_x86)channel), "=&r" (temp) +        : "r" ((pa_reg_x86)channels) +        : "cc" +    );  }  #undef RUN_TEST @@ -248,51 +248,51 @@ pa_volume_s16re_mmx (int16_t *samples, int32_t *volumes, unsigned channels, unsi  #define PADDING 16  static void run_test (void) { -  int16_t samples[SAMPLES]; -  int16_t samples_ref[SAMPLES]; -  int16_t samples_orig[SAMPLES]; -  int32_t volumes[CHANNELS + PADDING]; -  int i, j, padding; -  pa_do_volume_func_t func; -  struct timeval start, stop; - -  func = pa_get_volume_func (PA_SAMPLE_S16NE); - -  printf ("checking MMX %zd\n", sizeof (samples)); - -  pa_random (samples, sizeof (samples)); -  memcpy (samples_ref, samples, sizeof (samples)); -  memcpy (samples_orig, samples, sizeof (samples)); - -  for (i = 0; i < CHANNELS; i++) -    volumes[i] = rand() >> 1; -  for (padding = 0; padding < PADDING; padding++, i++) -    volumes[i] = volumes[padding]; - -  func (samples_ref, volumes, CHANNELS, sizeof (samples)); -  pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); -  for (i = 0; i < SAMPLES; i++) { -    if (samples[i] != samples_ref[i]) { -      printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], -          samples_orig[i], volumes[i % CHANNELS]); -    } -  } +    int16_t samples[SAMPLES]; +    int16_t samples_ref[SAMPLES]; +    int16_t samples_orig[SAMPLES]; +    int32_t volumes[CHANNELS + PADDING]; +    int i, j, padding; +    pa_do_volume_func_t func; +    struct timeval start, stop; -  pa_gettimeofday(&start); -  for (j = 0; j < TIMES; j++) { -    memcpy (samples, samples_orig, sizeof (samples)); -    pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); -  } -  pa_gettimeofday(&stop); -  pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); +    func = pa_get_volume_func (PA_SAMPLE_S16NE); + +    printf ("checking MMX %zd\n", sizeof (samples)); + +    pa_random (samples, sizeof (samples)); +    memcpy (samples_ref, samples, sizeof (samples)); +    memcpy (samples_orig, samples, sizeof (samples)); + +    for (i = 0; i < CHANNELS; i++) +        volumes[i] = rand() >> 1; +    for (padding = 0; padding < PADDING; padding++, i++) +        volumes[i] = volumes[padding]; -  pa_gettimeofday(&start); -  for (j = 0; j < TIMES; j++) { -    memcpy (samples_ref, samples_orig, sizeof (samples));      func (samples_ref, volumes, CHANNELS, sizeof (samples)); -  } -  pa_gettimeofday(&stop); -  pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); +    pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); +    for (i = 0; i < SAMPLES; i++) { +        if (samples[i] != samples_ref[i]) { +            printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], +                  samples_orig[i], volumes[i % CHANNELS]); +        } +    } + +    pa_gettimeofday(&start); +    for (j = 0; j < TIMES; j++) { +        memcpy (samples, samples_orig, sizeof (samples)); +        pa_volume_s16ne_mmx (samples, volumes, CHANNELS, sizeof (samples)); +    } +    pa_gettimeofday(&stop); +    pa_log_info("MMX: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + +    pa_gettimeofday(&start); +    for (j = 0; j < TIMES; j++) { +        memcpy (samples_ref, samples_orig, sizeof (samples)); +        func (samples_ref, volumes, CHANNELS, sizeof (samples)); +    } +    pa_gettimeofday(&stop); +    pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));  }  #endif @@ -301,13 +301,13 @@ static void run_test (void) {  void pa_volume_func_init_mmx (pa_cpu_x86_flag_t flags) {  #if defined (__i386__) || defined (__amd64__) -  pa_log_info("Initialising MMX optimized functions."); +    pa_log_info("Initialising MMX optimized functions.");  #ifdef RUN_TEST -  run_test (); +    run_test ();  #endif -  pa_set_volume_func (PA_SAMPLE_S16NE,     (pa_do_volume_func_t) pa_volume_s16ne_mmx); -  pa_set_volume_func (PA_SAMPLE_S16RE,     (pa_do_volume_func_t) pa_volume_s16re_mmx); +    pa_set_volume_func (PA_SAMPLE_S16NE,     (pa_do_volume_func_t) pa_volume_s16ne_mmx); +    pa_set_volume_func (PA_SAMPLE_S16RE,     (pa_do_volume_func_t) pa_volume_s16re_mmx);  #endif /* defined (__i386__) || defined (__amd64__) */  } diff --git a/src/pulsecore/svolume_sse.c b/src/pulsecore/svolume_sse.c index 5979f7c2..b5e3687f 100644 --- a/src/pulsecore/svolume_sse.c +++ b/src/pulsecore/svolume_sse.c @@ -48,7 +48,7 @@        " psrld $16, "#v"              \n\t" /* .. |   p0  |    0  | */                   \        " pmaddwd %%xmm5, "#v"         \n\t" /* .. |    p0 * vh    | */                   \        " paddd "#s", "#v"             \n\t" /* .. |    p0 * v0    | */                   \ -      " packssdw "#v", "#v"          \n\t" /* .. | p1*v1 | p0*v0 | */          +      " packssdw "#v", "#v"          \n\t" /* .. | p1*v1 | p0*v0 | */  #define MOD_ADD(a,b) \        " add "#a", %3                 \n\t" /* channel += inc           */ \ @@ -77,169 +77,169 @@  static void  pa_volume_s16ne_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  pa_reg_x86 channel, temp; - -  /* the max number of samples we process at a time, this is also the max amount -   * we overread the volume array, which should have enough padding. */ -  channels = MAX (8, channels); - -  __asm__ __volatile__ ( -    " xor %3, %3                    \n\t" -    " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ - -    " test $1, %2                   \n\t" /* check for odd samples */ -    " je 2f                         \n\t"  - -    " movd (%1, %3, 4), %%xmm0      \n\t" /* |  v0h  |  v0l  | */ -    " movw (%0), %w4                \n\t" /*     ..  |   p0  | */ -    " movd %4, %%xmm1               \n\t"  -    VOLUME_32x16 (%%xmm1, %%xmm0) -    " movd %%xmm0, %4               \n\t" /*     ..  | p0*v0 | */ -    " movw %w4, (%0)                \n\t"  -    " add $2, %0                    \n\t" -    MOD_ADD ($1, %5) - -    "2:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ -    " test $1, %2                   \n\t"  -    " je 4f                         \n\t"  - -    "3:                             \n\t" /* do samples in groups of 2 */ -    " movq (%1, %3, 4), %%xmm0      \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ -    " movd (%0), %%xmm1             \n\t" /*              .. |   p1  |  p0   | */ -    VOLUME_32x16 (%%xmm1, %%xmm0) -    " movd %%xmm0, (%0)             \n\t" /*              .. | p1*v1 | p0*v0 | */ -    " add $4, %0                    \n\t" -    MOD_ADD ($2, %5) - -    "4:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ -    " test $1, %2                   \n\t"  -    " je 6f                         \n\t"  - -    /* FIXME, we can do aligned access of the volume values if we can guarantee -     * that the array is 16 bytes aligned, we probably have to do the odd values -     * after this then. */ -    "5:                             \n\t" /* do samples in groups of 4 */ -    " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ -    " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ -    VOLUME_32x16 (%%xmm1, %%xmm0) -    " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ -    " add $8, %0                    \n\t" -    MOD_ADD ($4, %5) - -    "6:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 8 samples at a time */ -    " cmp $0, %2                    \n\t" -    " je 8f                         \n\t" - -    "7:                             \n\t" /* do samples in groups of 8 */ -    " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ -    " movdqu 16(%1, %3, 4), %%xmm2  \n\t" /* |  v7h  |  v7l  ..  v4h  |  v4l  | */ -    " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ -    " movq 8(%0), %%xmm3            \n\t" /*              .. |   p7  ..  p4   | */ -    VOLUME_32x16 (%%xmm1, %%xmm0) -    VOLUME_32x16 (%%xmm3, %%xmm2) -    " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ -    " movq %%xmm2, 8(%0)            \n\t" /*              .. | p7*v7 .. p4*v4 | */ -    " add $16, %0                   \n\t" -    MOD_ADD ($8, %5) -    " dec %2                        \n\t" -    " jne 7b                        \n\t" -    "8:                             \n\t" - -    : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) -    : "r" ((pa_reg_x86)channels) -    : "cc" -  ); +    pa_reg_x86 channel, temp; + +    /* the max number of samples we process at a time, this is also the max amount +     * we overread the volume array, which should have enough padding. */ +    channels = MAX (8, channels); + +    __asm__ __volatile__ ( +        " xor %3, %3                    \n\t" +        " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ + +        " test $1, %2                   \n\t" /* check for odd samples */ +        " je 2f                         \n\t" + +        " movd (%1, %3, 4), %%xmm0      \n\t" /* |  v0h  |  v0l  | */ +        " movw (%0), %w4                \n\t" /*     ..  |   p0  | */ +        " movd %4, %%xmm1               \n\t" +        VOLUME_32x16 (%%xmm1, %%xmm0) +        " movd %%xmm0, %4               \n\t" /*     ..  | p0*v0 | */ +        " movw %w4, (%0)                \n\t" +        " add $2, %0                    \n\t" +        MOD_ADD ($1, %5) + +        "2:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ +        " test $1, %2                   \n\t" +        " je 4f                         \n\t" + +        "3:                             \n\t" /* do samples in groups of 2 */ +        " movq (%1, %3, 4), %%xmm0      \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ +        " movd (%0), %%xmm1             \n\t" /*              .. |   p1  |  p0   | */ +        VOLUME_32x16 (%%xmm1, %%xmm0) +        " movd %%xmm0, (%0)             \n\t" /*              .. | p1*v1 | p0*v0 | */ +        " add $4, %0                    \n\t" +        MOD_ADD ($2, %5) + +        "4:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ +        " test $1, %2                   \n\t" +        " je 6f                         \n\t" + +        /* FIXME, we can do aligned access of the volume values if we can guarantee +         * that the array is 16 bytes aligned, we probably have to do the odd values +         * after this then. */ +        "5:                             \n\t" /* do samples in groups of 4 */ +        " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ +        " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ +        VOLUME_32x16 (%%xmm1, %%xmm0) +        " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ +        " add $8, %0                    \n\t" +        MOD_ADD ($4, %5) + +        "6:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 8 samples at a time */ +        " cmp $0, %2                    \n\t" +        " je 8f                         \n\t" + +        "7:                             \n\t" /* do samples in groups of 8 */ +        " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ +        " movdqu 16(%1, %3, 4), %%xmm2  \n\t" /* |  v7h  |  v7l  ..  v4h  |  v4l  | */ +        " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ +        " movq 8(%0), %%xmm3            \n\t" /*              .. |   p7  ..  p4   | */ +        VOLUME_32x16 (%%xmm1, %%xmm0) +        VOLUME_32x16 (%%xmm3, %%xmm2) +        " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ +        " movq %%xmm2, 8(%0)            \n\t" /*              .. | p7*v7 .. p4*v4 | */ +        " add $16, %0                   \n\t" +        MOD_ADD ($8, %5) +        " dec %2                        \n\t" +        " jne 7b                        \n\t" +        "8:                             \n\t" + +        : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) +        : "r" ((pa_reg_x86)channels) +        : "cc" +    );  }  static void  pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsigned length)  { -  pa_reg_x86 channel, temp; - -  /* the max number of samples we process at a time, this is also the max amount -   * we overread the volume array, which should have enough padding. */ -  channels = MAX (8, channels); - -  __asm__ __volatile__ ( -    " xor %3, %3                    \n\t" -    " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ - -    " test $1, %2                   \n\t" /* check for odd samples */ -    " je 2f                         \n\t"  - -    " movd (%1, %3, 4), %%xmm0      \n\t" /* |  v0h  |  v0l  | */ -    " movw (%0), %w4                \n\t" /*     ..  |   p0  | */ -    " rorw $8, %w4                  \n\t"  -    " movd %4, %%xmm1               \n\t"  -    VOLUME_32x16 (%%xmm1, %%xmm0) -    " movd %%xmm0, %4               \n\t" /*     ..  | p0*v0 | */ -    " rorw $8, %w4                  \n\t"  -    " movw %w4, (%0)                \n\t"  -    " add $2, %0                    \n\t" -    MOD_ADD ($1, %5) - -    "2:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ -    " test $1, %2                   \n\t" -    " je 4f                         \n\t"  - -    "3:                             \n\t" /* do samples in groups of 2 */ -    " movq (%1, %3, 4), %%xmm0      \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ -    " movd (%0), %%xmm1             \n\t" /*              .. |   p1  |  p0   | */ -    SWAP_16 (%%xmm1) -    VOLUME_32x16 (%%xmm1, %%xmm0) -    SWAP_16 (%%xmm0) -    " movd %%xmm0, (%0)             \n\t" /*              .. | p1*v1 | p0*v0 | */ -    " add $4, %0                    \n\t" -    MOD_ADD ($2, %5) - -    "4:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ -    " test $1, %2                   \n\t" -    " je 6f                         \n\t"  - -    /* FIXME, we can do aligned access of the volume values if we can guarantee -     * that the array is 16 bytes aligned, we probably have to do the odd values -     * after this then. */ -    "5:                             \n\t" /* do samples in groups of 4 */ -    " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ -    " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ -    SWAP_16 (%%xmm1) -    VOLUME_32x16 (%%xmm1, %%xmm0) -    SWAP_16 (%%xmm0) -    " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ -    " add $8, %0                    \n\t" -    MOD_ADD ($4, %5) - -    "6:                             \n\t" -    " sar $1, %2                    \n\t" /* prepare for processing 8 samples at a time */ -    " cmp $0, %2                    \n\t" -    " je 8f                         \n\t" - -    "7:                             \n\t" /* do samples in groups of 8 */ -    " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ -    " movdqu 16(%1, %3, 4), %%xmm2  \n\t" /* |  v7h  |  v7l  ..  v4h  |  v4l  | */ -    " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ -    " movq 8(%0), %%xmm3            \n\t" /*              .. |   p7  ..  p4   | */ -    SWAP_16_2 (%%xmm1, %%xmm3) -    VOLUME_32x16 (%%xmm1, %%xmm0) -    VOLUME_32x16 (%%xmm3, %%xmm2) -    SWAP_16_2 (%%xmm0, %%xmm2) -    " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ -    " movq %%xmm2, 8(%0)            \n\t" /*              .. | p7*v7 .. p4*v4 | */ -    " add $16, %0                   \n\t" -    MOD_ADD ($8, %5) -    " dec %2                        \n\t" -    " jne 7b                        \n\t" -    "8:                             \n\t" - -    : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) -    : "r" ((pa_reg_x86)channels) -    : "cc" -  ); +    pa_reg_x86 channel, temp; + +    /* the max number of samples we process at a time, this is also the max amount +     * we overread the volume array, which should have enough padding. */ +    channels = MAX (8, channels); + +    __asm__ __volatile__ ( +        " xor %3, %3                    \n\t" +        " sar $1, %2                    \n\t" /* length /= sizeof (int16_t) */ + +        " test $1, %2                   \n\t" /* check for odd samples */ +        " je 2f                         \n\t" + +        " movd (%1, %3, 4), %%xmm0      \n\t" /* |  v0h  |  v0l  | */ +        " movw (%0), %w4                \n\t" /*     ..  |   p0  | */ +        " rorw $8, %w4                  \n\t" +        " movd %4, %%xmm1               \n\t" +        VOLUME_32x16 (%%xmm1, %%xmm0) +        " movd %%xmm0, %4               \n\t" /*     ..  | p0*v0 | */ +        " rorw $8, %w4                  \n\t" +        " movw %w4, (%0)                \n\t" +        " add $2, %0                    \n\t" +        MOD_ADD ($1, %5) + +        "2:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 2 samples at a time */ +        " test $1, %2                   \n\t" +        " je 4f                         \n\t" + +        "3:                             \n\t" /* do samples in groups of 2 */ +        " movq (%1, %3, 4), %%xmm0      \n\t" /* |  v1h  |  v1l  |  v0h  |  v0l  | */ +        " movd (%0), %%xmm1             \n\t" /*              .. |   p1  |  p0   | */ +        SWAP_16 (%%xmm1) +        VOLUME_32x16 (%%xmm1, %%xmm0) +        SWAP_16 (%%xmm0) +        " movd %%xmm0, (%0)             \n\t" /*              .. | p1*v1 | p0*v0 | */ +        " add $4, %0                    \n\t" +        MOD_ADD ($2, %5) + +        "4:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 4 samples at a time */ +        " test $1, %2                   \n\t" +        " je 6f                         \n\t" + +        /* FIXME, we can do aligned access of the volume values if we can guarantee +         * that the array is 16 bytes aligned, we probably have to do the odd values +         * after this then. */ +        "5:                             \n\t" /* do samples in groups of 4 */ +        " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ +        " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ +        SWAP_16 (%%xmm1) +        VOLUME_32x16 (%%xmm1, %%xmm0) +        SWAP_16 (%%xmm0) +        " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ +        " add $8, %0                    \n\t" +        MOD_ADD ($4, %5) + +        "6:                             \n\t" +        " sar $1, %2                    \n\t" /* prepare for processing 8 samples at a time */ +        " cmp $0, %2                    \n\t" +        " je 8f                         \n\t" + +        "7:                             \n\t" /* do samples in groups of 8 */ +        " movdqu (%1, %3, 4), %%xmm0    \n\t" /* |  v3h  |  v3l  ..  v0h  |  v0l  | */ +        " movdqu 16(%1, %3, 4), %%xmm2  \n\t" /* |  v7h  |  v7l  ..  v4h  |  v4l  | */ +        " movq (%0), %%xmm1             \n\t" /*              .. |   p3  ..  p0   | */ +        " movq 8(%0), %%xmm3            \n\t" /*              .. |   p7  ..  p4   | */ +        SWAP_16_2 (%%xmm1, %%xmm3) +        VOLUME_32x16 (%%xmm1, %%xmm0) +        VOLUME_32x16 (%%xmm3, %%xmm2) +        SWAP_16_2 (%%xmm0, %%xmm2) +        " movq %%xmm0, (%0)             \n\t" /*              .. | p3*v3 .. p0*v0 | */ +        " movq %%xmm2, 8(%0)            \n\t" /*              .. | p7*v7 .. p4*v4 | */ +        " add $16, %0                   \n\t" +        MOD_ADD ($8, %5) +        " dec %2                        \n\t" +        " jne 7b                        \n\t" +        "8:                             \n\t" + +        : "+r" (samples), "+r" (volumes), "+r" (length), "=D" (channel), "=&r" (temp) +        : "r" ((pa_reg_x86)channels) +        : "cc" +    );  }  #undef RUN_TEST @@ -251,64 +251,64 @@ pa_volume_s16re_sse (int16_t *samples, int32_t *volumes, unsigned channels, unsi  #define PADDING 16  static void run_test (void) { -  int16_t samples[SAMPLES]; -  int16_t samples_ref[SAMPLES]; -  int16_t samples_orig[SAMPLES]; -  int32_t volumes[CHANNELS + PADDING]; -  int i, j, padding; -  pa_do_volume_func_t func; -  struct timeval start, stop; - -  func = pa_get_volume_func (PA_SAMPLE_S16NE); - -  printf ("checking SSE %zd\n", sizeof (samples)); - -  pa_random (samples, sizeof (samples)); -  memcpy (samples_ref, samples, sizeof (samples)); -  memcpy (samples_orig, samples, sizeof (samples)); - -  for (i = 0; i < CHANNELS; i++) -    volumes[i] = rand() >> 1; -  for (padding = 0; padding < PADDING; padding++, i++) -    volumes[i] = volumes[padding]; - -  func (samples_ref, volumes, CHANNELS, sizeof (samples)); -  pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); -  for (i = 0; i < SAMPLES; i++) { -    if (samples[i] != samples_ref[i]) { -      printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], -              samples_orig[i], volumes[i % CHANNELS]); -    } -  } +    int16_t samples[SAMPLES]; +    int16_t samples_ref[SAMPLES]; +    int16_t samples_orig[SAMPLES]; +    int32_t volumes[CHANNELS + PADDING]; +    int i, j, padding; +    pa_do_volume_func_t func; +    struct timeval start, stop; -  pa_gettimeofday(&start); -  for (j = 0; j < TIMES; j++) { -    memcpy (samples, samples_orig, sizeof (samples)); -    pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); -  } -  pa_gettimeofday(&stop); -  pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); +    func = pa_get_volume_func (PA_SAMPLE_S16NE); + +    printf ("checking SSE %zd\n", sizeof (samples)); + +    pa_random (samples, sizeof (samples)); +    memcpy (samples_ref, samples, sizeof (samples)); +    memcpy (samples_orig, samples, sizeof (samples)); + +    for (i = 0; i < CHANNELS; i++) +        volumes[i] = rand() >> 1; +    for (padding = 0; padding < PADDING; padding++, i++) +        volumes[i] = volumes[padding]; -  pa_gettimeofday(&start); -  for (j = 0; j < TIMES; j++) { -    memcpy (samples_ref, samples_orig, sizeof (samples));      func (samples_ref, volumes, CHANNELS, sizeof (samples)); -  } -  pa_gettimeofday(&stop); -  pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); +    pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); +    for (i = 0; i < SAMPLES; i++) { +        if (samples[i] != samples_ref[i]) { +            printf ("%d: %04x != %04x (%04x * %04x)\n", i, samples[i], samples_ref[i], +                      samples_orig[i], volumes[i % CHANNELS]); +        } +    } + +    pa_gettimeofday(&start); +    for (j = 0; j < TIMES; j++) { +        memcpy (samples, samples_orig, sizeof (samples)); +        pa_volume_s16ne_sse (samples, volumes, CHANNELS, sizeof (samples)); +    } +    pa_gettimeofday(&stop); +    pa_log_info("SSE: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start)); + +    pa_gettimeofday(&start); +    for (j = 0; j < TIMES; j++) { +        memcpy (samples_ref, samples_orig, sizeof (samples)); +        func (samples_ref, volumes, CHANNELS, sizeof (samples)); +    } +    pa_gettimeofday(&stop); +    pa_log_info("ref: %llu usec.", (long long unsigned int)pa_timeval_diff (&stop, &start));  }  #endif  #endif /* defined (__i386__) || defined (__amd64__) */  void pa_volume_func_init_sse (pa_cpu_x86_flag_t flags) {  #if defined (__i386__) || defined (__amd64__) -  pa_log_info("Initialising SSE optimized functions."); +    pa_log_info("Initialising SSE optimized functions.");  #ifdef RUN_TEST -  run_test (); +    run_test ();  #endif -  pa_set_volume_func (PA_SAMPLE_S16NE,     (pa_do_volume_func_t) pa_volume_s16ne_sse); -  pa_set_volume_func (PA_SAMPLE_S16RE,     (pa_do_volume_func_t) pa_volume_s16re_sse); +    pa_set_volume_func (PA_SAMPLE_S16NE,     (pa_do_volume_func_t) pa_volume_s16ne_sse); +    pa_set_volume_func (PA_SAMPLE_S16RE,     (pa_do_volume_func_t) pa_volume_s16re_sse);  #endif /* defined (__i386__) || defined (__amd64__) */  } | 
