From dc221f204b89fca85c0125e55f3afea4a807ffa7 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Sat, 29 Aug 2009 12:22:42 +0200 Subject: remap: fix counters for mmx and sse remap Take the size of the sample into account when calculating the amount of samples we process in parallel. --- src/pulsecore/remap_sse.c | 34 +++++++++++++++++++++------------- 1 file changed, 21 insertions(+), 13 deletions(-) (limited to 'src/pulsecore/remap_sse.c') diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c index 97f2476e..d6003571 100644 --- a/src/pulsecore/remap_sse.c +++ b/src/pulsecore/remap_sse.c @@ -65,16 +65,24 @@ " add $64, %1 \n\t" \ " add $128, %0 \n\t" -#define HANDLE_SINGLE(s) \ +#define HANDLE_SINGLE_dq() \ " movd (%1), %%xmm0 \n\t" \ - " punpckl"#s" %%xmm0, %%xmm0 \n\t" \ + " punpckldq %%xmm0, %%xmm0 \n\t" \ " movq %%xmm0, (%0) \n\t" \ " add $4, %1 \n\t" \ " add $8, %0 \n\t" -#define MONO_TO_STEREO(s) \ - " mov %3, %2 \n\t" \ - " sar $4, %2 \n\t" \ +#define HANDLE_SINGLE_wd() \ + " movw (%1), %w3 \n\t" \ + " movd %3, %%xmm0 \n\t" \ + " punpcklwd %%xmm0, %%xmm0 \n\t" \ + " movd %%xmm0, (%0) \n\t" \ + " add $2, %1 \n\t" \ + " add $4, %0 \n\t" + +#define MONO_TO_STEREO(s,shift,mask) \ + " mov %4, %2 \n\t" \ + " sar $"#shift", %2 \n\t" \ " cmp $0, %2 \n\t" \ " je 2f \n\t" \ "1: \n\t" \ @@ -84,24 +92,24 @@ " dec %2 \n\t" \ " jne 1b \n\t" \ "2: \n\t" \ - " mov %3, %2 \n\t" \ - " and $15, %2 \n\t" \ + " mov %4, %2 \n\t" \ + " and $"#mask", %2 \n\t" \ " je 4f \n\t" \ "3: \n\t" \ - HANDLE_SINGLE(s) \ + HANDLE_SINGLE_##s() \ " dec %2 \n\t" \ " jne 3b \n\t" \ "4: \n\t" static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, unsigned n) { - pa_reg_x86 temp; + pa_reg_x86 temp, temp2; switch (*m->format) { case PA_SAMPLE_FLOAT32NE: { __asm__ __volatile__ ( - MONO_TO_STEREO(dq) /* do doubles to quads */ - : "+r" (dst), "+r" (src), "=&r" (temp) + MONO_TO_STEREO(dq,3,7) /* do doubles to quads */ + : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" 
(temp2) : "r" ((pa_reg_x86)n) : "cc" ); @@ -110,8 +118,8 @@ static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, case PA_SAMPLE_S16NE: { __asm__ __volatile__ ( - MONO_TO_STEREO(wd) /* do words to doubles */ - : "+r" (dst), "+r" (src), "=&r" (temp) + MONO_TO_STEREO(wd,4,15) /* do words to doubles */ + : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2) : "r" ((pa_reg_x86)n) : "cc" ); -- cgit From 51423cae52333f604de198691d487c7de65cd096 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 3 Sep 2009 00:13:21 +0200 Subject: remap_sse: reindent macro so that diff to MMX is nicer Completely useless, but diff -u remap_mmx.c remap_sse.c is much nicer this way. --- src/pulsecore/remap_sse.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) (limited to 'src/pulsecore/remap_sse.c') diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c index d6003571..fa21c6c8 100644 --- a/src/pulsecore/remap_sse.c +++ b/src/pulsecore/remap_sse.c @@ -80,25 +80,25 @@ " add $2, %1 \n\t" \ " add $4, %0 \n\t" -#define MONO_TO_STEREO(s,shift,mask) \ - " mov %4, %2 \n\t" \ - " sar $"#shift", %2 \n\t" \ - " cmp $0, %2 \n\t" \ - " je 2f \n\t" \ - "1: \n\t" \ - LOAD_SAMPLES \ - UNPACK_SAMPLES(s) \ - STORE_SAMPLES \ - " dec %2 \n\t" \ - " jne 1b \n\t" \ - "2: \n\t" \ - " mov %4, %2 \n\t" \ - " and $"#mask", %2 \n\t" \ - " je 4f \n\t" \ - "3: \n\t" \ - HANDLE_SINGLE_##s() \ - " dec %2 \n\t" \ - " jne 3b \n\t" \ +#define MONO_TO_STEREO(s,shift,mask) \ + " mov %4, %2 \n\t" \ + " sar $"#shift", %2 \n\t" \ + " cmp $0, %2 \n\t" \ + " je 2f \n\t" \ + "1: \n\t" \ + LOAD_SAMPLES \ + UNPACK_SAMPLES(s) \ + STORE_SAMPLES \ + " dec %2 \n\t" \ + " jne 1b \n\t" \ + "2: \n\t" \ + " mov %4, %2 \n\t" \ + " and $"#mask", %2 \n\t" \ + " je 4f \n\t" \ + "3: \n\t" \ + HANDLE_SINGLE_##s() \ + " dec %2 \n\t" \ + " jne 3b \n\t" \ "4: \n\t" static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, unsigned n) { 
-- cgit From 9f4f374a19e808ba4f7d4bb04266526bf5ed428b Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 3 Sep 2009 00:17:28 +0200 Subject: remap_sse: fix inner loop increment on SSE In each iteration we can process 2^4 F32NE samples and 2^5 S16NE samples, that's twice as many as in MMX, hence correct the increments. --- src/pulsecore/remap_sse.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/pulsecore/remap_sse.c') diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c index fa21c6c8..368a3196 100644 --- a/src/pulsecore/remap_sse.c +++ b/src/pulsecore/remap_sse.c @@ -108,7 +108,7 @@ static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, case PA_SAMPLE_FLOAT32NE: { __asm__ __volatile__ ( - MONO_TO_STEREO(dq,3,7) /* do doubles to quads */ + MONO_TO_STEREO(dq, 4, 15) /* do doubles to quads */ : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2) : "r" ((pa_reg_x86)n) : "cc" @@ -118,7 +118,7 @@ static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, case PA_SAMPLE_S16NE: { __asm__ __volatile__ ( - MONO_TO_STEREO(wd,4,15) /* do words to doubles */ + MONO_TO_STEREO(wd, 5, 31) /* do words to doubles */ : "+r" (dst), "+r" (src), "=&r" (temp), "=&r" (temp2) : "r" ((pa_reg_x86)n) : "cc" -- cgit From 6f396c89ab48c4897ebbcb846c5914b06a8225fb Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Thu, 3 Sep 2009 01:48:30 +0200 Subject: remap: build sse code only on x86 --- src/pulsecore/remap_sse.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'src/pulsecore/remap_sse.c') diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c index 368a3196..bf22df7c 100644 --- a/src/pulsecore/remap_sse.c +++ b/src/pulsecore/remap_sse.c @@ -101,6 +101,7 @@ " jne 3b \n\t" \ "4: \n\t" +#if defined (__i386__) || defined (__amd64__) static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, unsigned n) { pa_reg_x86 temp, temp2; @@ -144,6 +145,7 @@ static void 
init_remap_sse (pa_remap_t *m) { pa_log_info("Using SSE mono to stereo remapping"); } } +#endif /* defined (__i386__) || defined (__amd64__) */ void pa_remap_func_init_sse (pa_cpu_x86_flag_t flags) { #if defined (__i386__) || defined (__amd64__) -- cgit From b5ac3839e18524524fa3e0da7ec68dbce16e8203 Mon Sep 17 00:00:00 2001 From: Wim Taymans Date: Mon, 7 Sep 2009 17:21:21 +0200 Subject: x86: only install some functions when SSE2 Remap and volume functions use SSE2 instructions so only install them when SSE2 is present. --- src/pulsecore/remap_sse.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'src/pulsecore/remap_sse.c') diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c index bf22df7c..dac072ec 100644 --- a/src/pulsecore/remap_sse.c +++ b/src/pulsecore/remap_sse.c @@ -102,7 +102,7 @@ "4: \n\t" #if defined (__i386__) || defined (__amd64__) -static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, unsigned n) { +static void remap_mono_to_stereo_sse2 (pa_remap_t *m, void *dst, const void *src, unsigned n) { pa_reg_x86 temp, temp2; switch (*m->format) { @@ -132,7 +132,7 @@ static void remap_mono_to_stereo_sse (pa_remap_t *m, void *dst, const void *src, } /* set the function that will execute the remapping based on the matrices */ -static void init_remap_sse (pa_remap_t *m) { +static void init_remap_sse2 (pa_remap_t *m) { unsigned n_oc, n_ic; n_oc = m->o_ss->channels; @@ -141,7 +141,7 @@ static void init_remap_sse (pa_remap_t *m) { /* find some common channel remappings, fall back to full matrix operation. 
*/ if (n_ic == 1 && n_oc == 2 && m->map_table_f[0][0] >= 1.0 && m->map_table_f[1][0] >= 1.0) { - m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_sse; + m->do_remap = (pa_do_remap_func_t) remap_mono_to_stereo_sse2; pa_log_info("Using SSE mono to stereo remapping"); } } @@ -151,6 +151,7 @@ void pa_remap_func_init_sse (pa_cpu_x86_flag_t flags) { #if defined (__i386__) || defined (__amd64__) pa_log_info("Initialising SSE optimized remappers."); - pa_set_init_remap_func ((pa_init_remap_func_t) init_remap_sse); + if (flags & PA_CPU_X86_SSE2) + pa_set_init_remap_func ((pa_init_remap_func_t) init_remap_sse2); #endif /* defined (__i386__) || defined (__amd64__) */ } -- cgit From 71e066c873e5bd31bd446ac0f8d0e97cc0b12ace Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Wed, 9 Sep 2009 04:28:22 +0200 Subject: simd: be more precise which SIMD optimizations we activate --- src/pulsecore/remap_sse.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'src/pulsecore/remap_sse.c') diff --git a/src/pulsecore/remap_sse.c b/src/pulsecore/remap_sse.c index dac072ec..0ccf3161 100644 --- a/src/pulsecore/remap_sse.c +++ b/src/pulsecore/remap_sse.c @@ -149,9 +149,11 @@ static void init_remap_sse2 (pa_remap_t *m) { void pa_remap_func_init_sse (pa_cpu_x86_flag_t flags) { #if defined (__i386__) || defined (__amd64__) - pa_log_info("Initialising SSE optimized remappers."); - if (flags & PA_CPU_X86_SSE2) - pa_set_init_remap_func ((pa_init_remap_func_t) init_remap_sse2); + if (flags & PA_CPU_X86_SSE2) { + pa_log_info("Initialising SSE2 optimized remappers."); + pa_set_init_remap_func ((pa_init_remap_func_t) init_remap_sse2); + } + #endif /* defined (__i386__) || defined (__amd64__) */ } -- cgit