diff options
Diffstat (limited to 'src/modules/module-equalizer-sink.c')
-rw-r--r--[-rwxr-xr-x] | src/modules/module-equalizer-sink.c | 341 |
1 files changed, 194 insertions, 147 deletions
diff --git a/src/modules/module-equalizer-sink.c b/src/modules/module-equalizer-sink.c index 3a28b497..0a2860b0 100755..100644 --- a/src/modules/module-equalizer-sink.c +++ b/src/modules/module-equalizer-sink.c @@ -113,8 +113,11 @@ struct userdata { float **Xs; float ***Hs;//thread updatable copies of the freq response filters (magintude based) pa_aupdate **a_H; - pa_memchunk conv_buffer; pa_memblockq *input_q; + char *output_buffer; + size_t output_buffer_length; + size_t output_buffer_max_length; + pa_memblockq *output_q; pa_bool_t first_iteration; pa_dbus_protocol *dbus_protocol; @@ -250,10 +253,11 @@ static int sink_process_msg_cb(pa_msgobject *o, int code, void *data, int64_t of pa_sink_get_latency_within_thread(u->sink_input->sink) + /* Add the latency internal to our sink input on top */ - pa_bytes_to_usec(pa_memblockq_get_length(u->sink_input->thread_info.render_memblockq), &u->sink_input->sink->sample_spec); + pa_bytes_to_usec(pa_memblockq_get_length(u->output_q), &u->sink_input->sink->sample_spec) + + pa_bytes_to_usec(pa_memblockq_get_length(u->sink_input->thread_info.render_memblockq), &u->sink_input->sink->sample_spec) + + pa_bytes_to_usec(pa_memblockq_get_length(u->input_q), &u->sink_input->sink->sample_spec); // pa_bytes_to_usec(u->samples_gathered * fs, &u->sink->sample_spec); //+ pa_bytes_to_usec(u->latency * fs, ss) - //+ pa_bytes_to_usec(pa_memblockq_get_length(u->input_q), ss); return 0; } } @@ -337,7 +341,7 @@ static void sink_set_mute_cb(pa_sink *s) { pa_sink_input_set_mute(u->sink_input, s->muted, s->save_muted); } - +#if 1 //reference implementation static void dsp_logic( float * restrict dst,//used as a temp array too, needs to be fft_length! @@ -351,12 +355,12 @@ static void dsp_logic( fftwf_complex * restrict output_window,//The transformed window'd src struct userdata *u){ //use a linear-phase sliding STFT and overlap-add method (for each channel) - //zero padd the data - memset(dst + u->window_size, 0, (u->fft_size - u->window_size) * sizeof(float)); //window the data for(size_t j = 0; j < u->window_size; ++j){ dst[j] = X * W[j] * src[j]; } + //zero padd the the remaining fft window + memset(dst + u->window_size, 0, (u->fft_size - u->window_size) * sizeof(float)); //Processing is done here! //do fft fftwf_execute_dft_r2c(u->forward_plan, dst, output_window); @@ -390,125 +394,141 @@ static void dsp_logic( (u->samples_gathered - u->R) * sizeof(float) ); } - +#else typedef float v4sf __attribute__ ((__aligned__(v_size * sizeof(float)))); typedef union float_vector { float f[v_size]; v4sf v; -#ifdef __SSE2__ __m128 m; -#endif } float_vector_t; -////regardless of sse enabled, the loops in here assume -////16 byte aligned addresses and memory allocations divisible by v_size -//void dsp_logic( -// float * restrict dst,//used as a temp array too, needs to be fft_length! -// float * restrict src,/*input data w/ overlap at start, -// *automatically cycled in routine -// */ -// float * restrict overlap,//The size of the overlap -// const float X,//multipliar -// const float * restrict H,//The freq. magnitude scalers filter -// const float * restrict W,//The windowing function -// fftwf_complex * restrict output_window,//The transformed window'd src -// struct userdata *u){//Collection of constants - //float_vector_t x = {X, X, X, X}; -// const size_t window_size = PA_ROUND_UP(u->window_size,v_size); -// const size_t fft_h = PA_ROUND_UP(FILTER_SIZE, v_size / 2); -// //const size_t R = PA_ROUND_UP(u->R, v_size); -// const size_t overlap_size = PA_ROUND_UP(u->overlap_size, v_size); -// overlap_size = PA_ROUND_UP(u->overlap_size, v_size); -// -// //assert(u->samples_gathered >= u->R); -// //zero out the bit beyond the real overlap so we don't add garbage -// for(size_t j = overlap_size; j > u->overlap_size; --j){ -// overlap[j-1] = 0; -// } -// //use a linear-phase sliding STFT and overlap-add method -// //zero padd the data -// memset(dst + u->window_size, 0, (u->fft_size - u->window_size)*sizeof(float)); -// //window the data -// for(size_t j = 0; j < window_size; j += v_size){ -// //dst[j] = W[j]*src[j]; -// float_vector_t *d = (float_vector_t*) (dst+j); -// float_vector_t *w = (float_vector_t*) (W+j); -// float_vector_t *s = (float_vector_t*) (src+j); +//regardless of sse enabled, the loops in here assume +//16 byte aligned addresses and memory allocations divisible by v_size +static void dsp_logic( + float * restrict dst,//used as a temp array too, needs to be fft_length! + float * restrict src,/*input data w/ overlap at start, + *automatically cycled in routine + */ + float * restrict overlap,//The size of the overlap + const float X,//multipliar + const float * restrict H,//The freq. magnitude scalers filter + const float * restrict W,//The windowing function + fftwf_complex * restrict output_window,//The transformed window'd src + struct userdata *u){//Collection of constants + const size_t overlap_size = PA_ROUND_UP(u->overlap_size, v_size); + float_vector_t x; + x.f[0] = x.f[1] = x.f[2] = x.f[3] = X; + + //assert(u->samples_gathered >= u->R); + //use a linear-phase sliding STFT and overlap-add method + for(size_t j = 0; j < u->window_size; j += v_size){ + //dst[j] = W[j] * src[j]; + float_vector_t *d = (float_vector_t*) (dst + j); + float_vector_t *w = (float_vector_t*) (W + j); + float_vector_t *s = (float_vector_t*) (src + j); //#if __SSE2__ -// d->m = _mm_mul_ps(x->m, _mm_mul_ps(w->m, s->m)); -//#else + d->m = _mm_mul_ps(x.m, _mm_mul_ps(w->m, s->m)); // d->v = x->v * w->v * s->v; //#endif -// } -// //Processing is done here! -// //do fft -// fftwf_execute_dft_r2c(u->forward_plan, dst, output_window); -// -// -// //perform filtering - purely magnitude based -// for(size_t j = 0;j < fft_h; j+=v_size/2){ -// //output_window[j][0]*=H[j]; -// //output_window[j][1]*=H[j]; -// float_vector_t *d = (float_vector_t*)(output_window+j); -// float_vector_t h; -// h.f[0] = h.f[1] = H[j]; -// h.f[2] = h.f[3] = H[j+1]; + } + //zero padd the the remaining fft window + memset(dst + u->window_size, 0, (u->fft_size - u->window_size) * sizeof(float)); + + //Processing is done here! + //do fft + fftwf_execute_dft_r2c(u->forward_plan, dst, output_window); + //perform filtering - purely magnitude based + for(size_t j = 0; j < FILTER_SIZE; j += v_size / 2){ + //output_window[j][0]*=H[j]; + //output_window[j][1]*=H[j]; + float_vector_t *d = (float_vector_t*)( ((float *) output_window) + 2 * j); + float_vector_t h; + h.f[0] = h.f[1] = H[j]; + h.f[2] = h.f[3] = H[j + 1]; //#if __SSE2__ -// d->m = _mm_mul_ps(d->m, h.m); + d->m = _mm_mul_ps(d->m, h.m); //#else -// d->v = d->v*h->v; +// d->v = d->v * h.v; //#endif -// } -// //inverse fft -// fftwf_execute_dft_c2r(u->inverse_plan, output_window, dst); -// -// ////debug: tests overlaping add -// ////and negates ALL PREVIOUS processing -// ////yields a perfect reconstruction if COLA is held -// //for(size_t j = 0; j < u->window_size; ++j){ -// // dst[j] = W[j]*src[j]; -// //} -// -// //overlap add and preserve overlap component from this window (linear phase) -// for(size_t j = 0; j < overlap_size; j+=v_size){ -// //dst[j]+=overlap[j]; -// //overlap[j]+=dst[j+R]; -// float_vector_t *d = (float_vector_t*)(dst+j); -// float_vector_t *o = (float_vector_t*)(overlap+j); + } + + //inverse fft + fftwf_execute_dft_c2r(u->inverse_plan, output_window, dst); + + ////debug: tests overlaping add + ////and negates ALL PREVIOUS processing + ////yields a perfect reconstruction if COLA is held + //for(size_t j = 0; j < u->window_size; ++j){ + // dst[j] = W[j] * src[j]; + //} + + //overlap add and preserve overlap component from this window (linear phase) + for(size_t j = 0; j < overlap_size; j += v_size){ + //dst[j]+=overlap[j]; + //overlap[j]+=dst[j+R]; + float_vector_t *d = (float_vector_t*)(dst + j); + float_vector_t *o = (float_vector_t*)(overlap + j); //#if __SSE2__ -// d->m = _mm_add_ps(d->m, o->m); -// o->m = ((float_vector_t*)(dst+u->R+j))->m; + d->m = _mm_add_ps(d->m, o->m); + o->m = ((float_vector_t*)(dst + u->R + j))->m; //#else -// d->v = d->v+o->v; -// o->v = ((float_vector_t*)(dst+u->R+j))->v; +// d->v = d->v + o->v; +// o->v = ((float_vector_t*)(dst + u->R + j))->v; //#endif -// } -// //memcpy(overlap, dst+u->R, u->overlap_size*sizeof(float)); -// -// //////debug: tests if basic buffering works -// //////shouldn't modify the signal AT ALL (beyond roundoff) -// //for(size_t j = 0; j < u->window_size; ++j){ -// // dst[j] = src[j]; -// //} -// -// //preseve the needed input for the next window's overlap -// memmove(src, src + u->R, -// u->overlap_size * sizeof(float) -// ); -//} - -static void process_samples(struct userdata *u, pa_memchunk *tchunk){ + } + //memcpy(overlap, dst+u->R, u->overlap_size * sizeof(float)); //overlap preserve (debug) + //zero out the bit beyond the real overlap so we don't add garbage next iteration + memset(overlap + u->overlap_size, 0, overlap_size - u->overlap_size); + + ////debug: tests if basic buffering works + ////shouldn't modify the signal AT ALL (beyond roundoff) + //for(size_t j = 0; j < u->window_size; ++j){ + // dst[j] = src[j]; + //} + + //preseve the needed input for the next window's overlap + memmove(src, src + u->R, + (u->samples_gathered - u->R) * sizeof(float) + ); +} +#endif + +static void flatten_to_memblockq(struct userdata *u){ + size_t mbs = pa_mempool_block_size_max(u->sink->core->mempool); + pa_memchunk tchunk; + char *dst; + size_t i = 0; + while(i < u->output_buffer_length){ + tchunk.index = 0; + tchunk.length = PA_MIN((u->output_buffer_length - i), mbs); + tchunk.memblock = pa_memblock_new(u->sink->core->mempool, tchunk.length); + //pa_log_debug("pushing %ld into the q", tchunk.length); + dst = pa_memblock_acquire(tchunk.memblock); + memcpy(dst, u->output_buffer + i, tchunk.length); + pa_memblock_release(tchunk.memblock); + pa_memblockq_push(u->output_q, &tchunk); + pa_memblock_unref(tchunk.memblock); + i += tchunk.length; + } +} + +static void process_samples(struct userdata *u){ size_t fs = pa_frame_size(&(u->sink->sample_spec)); - float *dst; unsigned a_i; float *H, X; size_t iterations, offset; pa_assert(u->samples_gathered >= u->window_size); iterations = (u->samples_gathered - u->overlap_size) / u->R; - tchunk->index = 0; - tchunk->length = iterations * u->R * fs; - tchunk->memblock = pa_memblock_new(u->sink->core->mempool, tchunk->length); - dst = ((float*) pa_memblock_acquire(tchunk->memblock)); + //make sure there is enough buffer memory allocated + if(iterations * u->R * fs > u->output_buffer_max_length){ + u->output_buffer_max_length = iterations * u->R * fs; + if(u->output_buffer){ + pa_xfree(u->output_buffer); + } + u->output_buffer = pa_xmalloc(u->output_buffer_max_length); + } + u->output_buffer_length = iterations * u->R * fs; + for(size_t iter = 0; iter < iterations; ++iter){ offset = iter * u->R * fs; for(size_t c = 0;c < u->channels; c++) { @@ -534,14 +554,14 @@ static void process_samples(struct userdata *u, pa_memchunk *tchunk){ u->work_buffer[i] = u->W[i] <= FLT_EPSILON ? u->work_buffer[i] : u->work_buffer[i] / u->W[i]; } } - pa_sample_clamp(PA_SAMPLE_FLOAT32NE, (uint8_t *) (dst + c) + offset, fs, u->work_buffer, sizeof(float), u->R); + pa_sample_clamp(PA_SAMPLE_FLOAT32NE, (uint8_t *) (((float *)u->output_buffer) + c) + offset, fs, u->work_buffer, sizeof(float), u->R); } if(u->first_iteration){ u->first_iteration = FALSE; } u->samples_gathered -= u->R; } - pa_memblock_release(tchunk->memblock); + flatten_to_memblockq(u); } static void input_buffer(struct userdata *u, pa_memchunk *in){ @@ -565,36 +585,49 @@ static void input_buffer(struct userdata *u, pa_memchunk *in){ static int sink_input_pop_cb(pa_sink_input *i, size_t nbytes, pa_memchunk *chunk) { struct userdata *u; size_t fs, target_samples; - struct timeval start, end; + size_t mbs; + //struct timeval start, end; pa_memchunk tchunk; pa_sink_input_assert_ref(i); pa_assert_se(u = i->userdata); pa_assert(chunk); pa_assert(u->sink); fs = pa_frame_size(&(u->sink->sample_spec)); + mbs = pa_mempool_block_size_max(u->sink->core->mempool); + if(pa_memblockq_get_length(u->output_q) > 0){ + //pa_log_debug("qsize is %ld", pa_memblockq_get_length(u->output_q)); + goto END; + } + //nbytes = PA_MIN(nbytes, pa_mempool_block_size_max(u->sink->core->mempool)); target_samples = PA_ROUND_UP(nbytes / fs, u->R); + ////pa_log_debug("vanilla mbs = %ld",mbs); + //mbs = PA_ROUND_DOWN(mbs / fs, u->R); + //mbs = PA_MAX(mbs, u->R); + //target_samples = PA_MAX(target_samples, mbs); + //pa_log_debug("target samples: %ld", target_samples); if(u->first_iteration){ //allocate request_size target_samples = PA_MAX(target_samples, u->window_size); }else{ //allocate request_size + overlap target_samples += u->overlap_size; - alloc_input_buffers(u, target_samples); } alloc_input_buffers(u, target_samples); + //pa_log_debug("post target samples: %ld", target_samples); chunk->memblock = NULL; /* Hmm, process any rewind request that might be queued up */ pa_sink_process_rewind(u->sink, 0); //pa_log_debug("start output-buffered %ld, input-buffered %ld, requested %ld",buffered_samples,u->samples_gathered,samples_requested); - pa_rtclock_get(&start); + //pa_rtclock_get(&start); do{ size_t input_remaining = target_samples - u->samples_gathered; + // pa_log_debug("input remaining %ld samples", input_remaining); pa_assert(input_remaining > 0); while(pa_memblockq_peek(u->input_q, &tchunk) < 0){ //pa_sink_render(u->sink, input_remaining * fs, &tchunk); - pa_sink_render_full(u->sink, input_remaining * fs, &tchunk); + pa_sink_render_full(u->sink, PA_MIN(input_remaining * fs, mbs), &tchunk); pa_assert(tchunk.memblock); pa_memblockq_push(u->input_q, &tchunk); pa_memblock_unref(tchunk.memblock); @@ -605,25 +638,27 @@ static int sink_input_pop_cb(pa_sink_input *i, size_t nbytes, pa_memchunk *chunk //pa_log_debug("asked for %ld input samples, got %ld samples",input_remaining,buffer->length/fs); /* copy new input */ //pa_rtclock_get(start); + // pa_log_debug("buffering %ld bytes", tchunk.length); input_buffer(u, &tchunk); //pa_rtclock_get(&end); //pa_log_debug("Took %0.5f seconds to setup", pa_timeval_diff(end, start) / (double) PA_USEC_PER_SEC); pa_memblock_unref(tchunk.memblock); }while(u->samples_gathered < target_samples); - pa_rtclock_get(&end); - pa_log_debug("Took %0.6f seconds to get data", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC); + //pa_rtclock_get(&end); + //pa_log_debug("Took %0.6f seconds to get data", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC); pa_assert(u->fft_size >= u->window_size); pa_assert(u->R < u->window_size); - /* set the H filter */ - pa_rtclock_get(&start); + //pa_rtclock_get(&start); /* process a block */ - process_samples(u, chunk); - pa_rtclock_get(&end); - pa_log_debug("Took %0.6f seconds to process", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC); - + process_samples(u); + //pa_rtclock_get(&end); + //pa_log_debug("Took %0.6f seconds to process", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC); +END: + pa_assert_se(pa_memblockq_peek(u->output_q, chunk) >= 0); pa_assert(chunk->memblock); + pa_memblockq_drop(u->output_q, chunk->length); //pa_log_debug("gave %ld", chunk->length/fs); //pa_log_debug("end pop"); return 0; @@ -685,7 +720,7 @@ static void sink_input_process_rewind_cb(pa_sink_input *i, size_t nbytes) { //invalidate the output q pa_memblockq_seek(u->input_q, - (int64_t) amount, PA_SEEK_RELATIVE, TRUE); pa_log("Resetting filter"); - reset_filter(u); + //reset_filter(u); //this is the "proper" thing to do... } } @@ -814,33 +849,35 @@ static void sink_input_state_change_cb(pa_sink_input *i, pa_sink_input_state_t s static void pack(char **strs, size_t len, char **packed, size_t *length){ size_t t_len = 0; size_t headers = (1+len) * sizeof(uint16_t); - size_t offset = sizeof(uint16_t); + char *p; for(size_t i = 0; i < len; ++i){ t_len += strlen(strs[i]); } *length = headers + t_len; - *packed = pa_xmalloc0(*length); - ((uint16_t *) *packed)[0] = (uint16_t) len; + p = *packed = pa_xmalloc0(*length); + *((uint16_t *) p) = (uint16_t) len; + p += sizeof(uint16_t); for(size_t i = 0; i < len; ++i){ uint16_t l = strlen(strs[i]); - *((uint16_t *)(*packed + offset)) = l; - offset += sizeof(uint16_t); - memcpy(*packed + offset, strs[i], l); - offset += l; + *((uint16_t *) p) = (uint16_t) l; + p += sizeof(uint16_t); + memcpy(p, strs[i], l); + p += l; } } static void unpack(char *str, size_t length, char ***strs, size_t *len){ - size_t offset = sizeof(uint16_t); - *len = ((uint16_t *)str)[0]; + char *p = str; + *len = *((uint16_t *) p); + p += sizeof(uint16_t); *strs = pa_xnew(char *, *len); + for(size_t i = 0; i < *len; ++i){ - size_t l = *((uint16_t *)(str+offset)); - size_t e = PA_MIN(offset + l, length) - offset; - offset = PA_MIN(offset + sizeof(uint16_t), length); - (*strs)[i] = pa_xnew(char, e + 1); - memcpy((*strs)[i], str + offset, e); - (*strs)[i][e] = '\0'; - offset += l; + size_t l = *((uint16_t *) p); + p += sizeof(uint16_t); + (*strs)[i] = pa_xnew(char, l + 1); + memcpy((*strs)[i], p, l); + (*strs)[i][l] = '\0'; + p += l; } } static void save_profile(struct userdata *u, size_t channel, char *name){ @@ -885,17 +922,17 @@ static void save_state(struct userdata *u){ pack(u->base_profiles, u->channels, &packed, &packed_length); state = (float *) pa_xmalloc0(filter_state_size + packed_length); + memcpy(state + FILTER_STATE_SIZE, packed, packed_length); + pa_xfree(packed); for(size_t c = 0; c < u->channels; ++c){ a_i = pa_aupdate_read_begin(u->a_H[c]); - state[c * CHANNEL_PROFILE_SIZE] = u->Xs[a_i][c]; + state[c * CHANNEL_PROFILE_SIZE] = u->Xs[c][a_i]; H = u->Hs[c][a_i]; - H_n = state + c * CHANNEL_PROFILE_SIZE + 1; + H_n = &state[c * CHANNEL_PROFILE_SIZE + 1]; memcpy(H_n, H, FILTER_SIZE * sizeof(float)); pa_aupdate_read_end(u->a_H[c]); } - memcpy(((char *)state) + filter_state_size, packed, packed_length); - pa_xfree(packed); key.data = state_name; key.size = strlen(key.data); @@ -978,13 +1015,13 @@ static void load_state(struct userdata *u){ memcpy(u->Hs[c][a_i], H, FILTER_SIZE * sizeof(float)); pa_aupdate_write_end(u->a_H[c]); } - //unpack(((char *)value.data) + FILTER_STATE_SIZE, value.size - FILTER_STATE_SIZE, &names, &n_profs); - //n_profs = PA_MIN(n_profs, u->channels); - //for(size_t c = 0; c < n_profs; ++c){ - // pa_xfree(u->base_profiles[c]); - // u->base_profiles[c] = names[c]; - //} - //pa_xfree(names); + unpack(((char *)value.data) + FILTER_STATE_SIZE * sizeof(float), value.size - FILTER_STATE_SIZE * sizeof(float), &names, &n_profs); + n_profs = PA_MIN(n_profs, u->channels); + for(size_t c = 0; c < n_profs; ++c){ + pa_xfree(u->base_profiles[c]); + u->base_profiles[c] = names[c]; + } + pa_xfree(names); } pa_datum_free(&value); }else{ @@ -1062,9 +1099,12 @@ int pa__init(pa_module*m) { pa_modargs_get_value_boolean(ma, "set_default", &u->set_default); u->channels = ss.channels; - u->fft_size = pow(2, ceil(log(ss.rate)/log(2)));//probably unstable near corner cases of powers of 2 + u->fft_size = pow(2, ceil(log(ss.rate) / log(2)));//probably unstable near corner cases of powers of 2 pa_log_debug("fft size: %ld", u->fft_size); u->window_size = 15999; + if(u->window_size % 2 == 0){ + u->window_size--; + } u->R = (u->window_size + 1) / 2; u->overlap_size = u->window_size - u->R; u->samples_gathered = 0; @@ -1088,7 +1128,6 @@ int pa__init(pa_module*m) { u->a_H[c] = pa_aupdate_new(); u->input[c] = NULL; u->overlap_accum[c] = alloc(u->overlap_size, sizeof(float)); - memset(u->overlap_accum[c], 0, u->overlap_size*sizeof(float)); } u->output_window = alloc((FILTER_SIZE), sizeof(fftwf_complex)); u->forward_plan = fftwf_plan_dft_r2c_1d(u->fft_size, u->work_buffer, u->output_window, FFTW_ESTIMATE); @@ -1139,6 +1178,10 @@ int pa__init(pa_module*m) { u->sink->set_mute = sink_set_mute_cb; u->sink->userdata = u; u->input_q = pa_memblockq_new(0, MEMBLOCKQ_MAXLENGTH, 0, fs, 1, 1, 0, &u->sink->silence); + u->output_q = pa_memblockq_new(0, MEMBLOCKQ_MAXLENGTH, 0, fs, 1, 1, 0, NULL); + u->output_buffer = NULL; + u->output_buffer_length = 0; + u->output_buffer_max_length = 0; pa_sink_set_asyncmsgq(u->sink, master->asyncmsgq); //pa_sink_set_fixed_latency(u->sink, pa_bytes_to_usec(u->R*fs, &ss)); @@ -1251,6 +1294,10 @@ void pa__done(pa_module*m) { if (u->sink) pa_sink_unref(u->sink); + if(u->output_buffer){ + pa_xfree(u->output_buffer); + } + pa_memblockq_free(u->output_q); pa_memblockq_free(u->input_q); fftwf_destroy_plan(u->inverse_plan); |