1 files changed, 194 insertions, 147 deletions
diff --git a/src/modules/module-equalizer-sink.c b/src/modules/module-equalizer-sink.c
index 3a28b497..0a2860b0 100755..100644
--- a/src/modules/module-equalizer-sink.c
+++ b/src/modules/module-equalizer-sink.c
@@ -113,8 +113,11 @@ struct userdata {
     float **Xs;
     float ***Hs;//thread updatable copies of the freq response filters (magintude based)
     pa_aupdate **a_H;
-    pa_memchunk conv_buffer;
     pa_memblockq *input_q;
+    char *output_buffer;
+    size_t output_buffer_length;
+    size_t output_buffer_max_length;
+    pa_memblockq *output_q;
     pa_bool_t first_iteration;
 
     pa_dbus_protocol *dbus_protocol;
@@ -250,10 +253,11 @@ static int sink_process_msg_cb(pa_msgobject *o, int code, void *data, int64_t of
                 pa_sink_get_latency_within_thread(u->sink_input->sink) +
 
                 /* Add the latency internal to our sink input on top */
-                pa_bytes_to_usec(pa_memblockq_get_length(u->sink_input->thread_info.render_memblockq), &u->sink_input->sink->sample_spec);
+                pa_bytes_to_usec(pa_memblockq_get_length(u->output_q), &u->sink_input->sink->sample_spec) +
+                pa_bytes_to_usec(pa_memblockq_get_length(u->sink_input->thread_info.render_memblockq), &u->sink_input->sink->sample_spec) +
+                pa_bytes_to_usec(pa_memblockq_get_length(u->input_q), &u->sink_input->sink->sample_spec);
             //    pa_bytes_to_usec(u->samples_gathered * fs, &u->sink->sample_spec);
             //+ pa_bytes_to_usec(u->latency * fs, ss)
-            //+ pa_bytes_to_usec(pa_memblockq_get_length(u->input_q), ss);
             return 0;
         }
     }
@@ -337,7 +341,7 @@ static void sink_set_mute_cb(pa_sink *s) {
     pa_sink_input_set_mute(u->sink_input, s->muted, s->save_muted);
 }
 
-
+#if 1
 //reference implementation
 static void dsp_logic(
     float * restrict dst,//used as a temp array too, needs to be fft_length!
@@ -351,12 +355,12 @@ static void dsp_logic(
     fftwf_complex * restrict output_window,//The transformed window'd src
     struct userdata *u){
     //use a linear-phase sliding STFT and overlap-add method (for each channel)
-    //zero padd the data
-    memset(dst + u->window_size, 0, (u->fft_size - u->window_size) * sizeof(float));
     //window the data
     for(size_t j = 0; j < u->window_size; ++j){
         dst[j] = X * W[j] * src[j];
     }
+    //zero padd the the remaining fft window
+    memset(dst + u->window_size, 0, (u->fft_size - u->window_size) * sizeof(float));
     //Processing is done here!
     //do fft
     fftwf_execute_dft_r2c(u->forward_plan, dst, output_window);
@@ -390,125 +394,141 @@ static void dsp_logic(
         (u->samples_gathered - u->R) * sizeof(float)
     );
 }
-
+#else
 typedef float v4sf __attribute__ ((__aligned__(v_size * sizeof(float))));
 typedef union float_vector {
     float f[v_size];
     v4sf v;
-#ifdef __SSE2__
     __m128 m;
-#endif
 } float_vector_t;
 
-////regardless of sse enabled, the loops in here assume
-////16 byte aligned addresses and memory allocations divisible by v_size
-//void dsp_logic(
-//    float * restrict dst,//used as a temp array too, needs to be fft_length!
-//    float * restrict src,/*input data w/ overlap at start,
-//                               *automatically cycled in routine
-//                               */
-//    float * restrict overlap,//The size of the overlap
-//    const float X,//multipliar
-//    const float * restrict H,//The freq. magnitude scalers filter
-//    const float * restrict W,//The windowing function
-//    fftwf_complex * restrict output_window,//The transformed window'd src
-//    struct userdata *u){//Collection of constants
-      //float_vector_t x = {X, X, X, X};
-//    const size_t window_size = PA_ROUND_UP(u->window_size,v_size);
-//    const size_t fft_h = PA_ROUND_UP(FILTER_SIZE, v_size / 2);
-//    //const size_t R = PA_ROUND_UP(u->R, v_size);
-//    const size_t overlap_size = PA_ROUND_UP(u->overlap_size, v_size);
-//     overlap_size = PA_ROUND_UP(u->overlap_size, v_size);
-//
-//    //assert(u->samples_gathered >= u->R);
-//    //zero out the bit beyond the real overlap so we don't add garbage
-//    for(size_t j = overlap_size; j > u->overlap_size; --j){
-//       overlap[j-1] = 0;
-//    }
-//    //use a linear-phase sliding STFT and overlap-add method
-//    //zero padd the data
-//    memset(dst + u->window_size, 0, (u->fft_size - u->window_size)*sizeof(float));
-//    //window the data
-//    for(size_t j = 0; j < window_size; j += v_size){
-//        //dst[j] = W[j]*src[j];
-//        float_vector_t *d = (float_vector_t*) (dst+j);
-//        float_vector_t *w = (float_vector_t*) (W+j);
-//        float_vector_t *s = (float_vector_t*) (src+j);
+//regardless of sse enabled, the loops in here assume
+//16 byte aligned addresses and memory allocations divisible by v_size
+static void dsp_logic(
+    float * restrict dst,//used as a temp array too, needs to be fft_length!
+    float * restrict src,/*input data w/ overlap at start,
+                               *automatically cycled in routine
+                               */
+    float * restrict overlap,//The size of the overlap
+    const float X,//multipliar
+    const float * restrict H,//The freq. magnitude scalers filter
+    const float * restrict W,//The windowing function
+    fftwf_complex * restrict output_window,//The transformed window'd src
+    struct userdata *u){//Collection of constants
+    const size_t overlap_size = PA_ROUND_UP(u->overlap_size, v_size);
+    float_vector_t x;
+    x.f[0] = x.f[1] = x.f[2] = x.f[3] = X;
+
+    //assert(u->samples_gathered >= u->R);
+    //use a linear-phase sliding STFT and overlap-add method
+    for(size_t j = 0; j < u->window_size; j += v_size){
+        //dst[j] = W[j] * src[j];
+        float_vector_t *d = (float_vector_t*) (dst + j);
+        float_vector_t *w = (float_vector_t*) (W + j);
+        float_vector_t *s = (float_vector_t*) (src + j);
 //#if __SSE2__
-//        d->m = _mm_mul_ps(x->m, _mm_mul_ps(w->m, s->m));
-//#else
+        d->m = _mm_mul_ps(x.m, _mm_mul_ps(w->m, s->m));
 //        d->v = x->v * w->v * s->v;
 //#endif
-//    }
-//    //Processing is done here!
-//    //do fft
-//    fftwf_execute_dft_r2c(u->forward_plan, dst, output_window);
-//
-//
-//    //perform filtering - purely magnitude based
-//    for(size_t j = 0;j < fft_h; j+=v_size/2){
-//        //output_window[j][0]*=H[j];
-//        //output_window[j][1]*=H[j];
-//        float_vector_t *d = (float_vector_t*)(output_window+j);
-//        float_vector_t h;
-//        h.f[0] = h.f[1] = H[j];
-//        h.f[2] = h.f[3] = H[j+1];
+    }
+    //zero padd the the remaining fft window
+    memset(dst + u->window_size, 0, (u->fft_size - u->window_size) * sizeof(float));
+
+    //Processing is done here!
+    //do fft
+    fftwf_execute_dft_r2c(u->forward_plan, dst, output_window);
+    //perform filtering - purely magnitude based
+    for(size_t j = 0; j < FILTER_SIZE; j += v_size / 2){
+        //output_window[j][0]*=H[j];
+        //output_window[j][1]*=H[j];
+        float_vector_t *d = (float_vector_t*)( ((float *) output_window) + 2 * j);
+        float_vector_t h;
+        h.f[0] = h.f[1] = H[j];
+        h.f[2] = h.f[3] = H[j + 1];
 //#if __SSE2__
-//        d->m = _mm_mul_ps(d->m, h.m);
+        d->m = _mm_mul_ps(d->m, h.m);
 //#else
-//        d->v = d->v*h->v;
+//        d->v = d->v * h.v;
 //#endif
-//    }
-//    //inverse fft
-//    fftwf_execute_dft_c2r(u->inverse_plan, output_window, dst);
-//
-//    ////debug: tests overlaping add
-//    ////and negates ALL PREVIOUS processing
-//    ////yields a perfect reconstruction if COLA is held
-//    //for(size_t j = 0; j < u->window_size; ++j){
-//    //    dst[j] = W[j]*src[j];
-//    //}
-//
-//    //overlap add and preserve overlap component from this window (linear phase)
-//    for(size_t j = 0; j < overlap_size; j+=v_size){
-//        //dst[j]+=overlap[j];
-//        //overlap[j]+=dst[j+R];
-//        float_vector_t *d = (float_vector_t*)(dst+j);
-//        float_vector_t *o = (float_vector_t*)(overlap+j);
+    }
+
+    //inverse fft
+    fftwf_execute_dft_c2r(u->inverse_plan, output_window, dst);
+
+    ////debug: tests overlaping add
+    ////and negates ALL PREVIOUS processing
+    ////yields a perfect reconstruction if COLA is held
+    //for(size_t j = 0; j < u->window_size; ++j){
+    //    dst[j] = W[j] * src[j];
+    //}
+
+    //overlap add and preserve overlap component from this window (linear phase)
+    for(size_t j = 0; j < overlap_size; j += v_size){
+        //dst[j]+=overlap[j];
+        //overlap[j]+=dst[j+R];
+        float_vector_t *d = (float_vector_t*)(dst + j);
+        float_vector_t *o = (float_vector_t*)(overlap + j);
 //#if __SSE2__
-//        d->m = _mm_add_ps(d->m, o->m);
-//        o->m = ((float_vector_t*)(dst+u->R+j))->m;
+        d->m = _mm_add_ps(d->m, o->m);
+        o->m = ((float_vector_t*)(dst + u->R + j))->m;
 //#else
-//        d->v = d->v+o->v;
-//        o->v = ((float_vector_t*)(dst+u->R+j))->v;
+//        d->v = d->v + o->v;
+//        o->v = ((float_vector_t*)(dst + u->R + j))->v;
 //#endif
-//    }
-//    //memcpy(overlap, dst+u->R, u->overlap_size*sizeof(float));
-//
-//    //////debug: tests if basic buffering works
-//    //////shouldn't modify the signal AT ALL (beyond roundoff)
-//    //for(size_t j = 0; j < u->window_size; ++j){
-//    //    dst[j] = src[j];
-//    //}
-//
-//    //preseve the needed input for the next window's overlap
-//    memmove(src, src + u->R,
-//        u->overlap_size * sizeof(float)
-//    );
-//}
-
-static void process_samples(struct userdata *u, pa_memchunk *tchunk){
+    }
+    //memcpy(overlap, dst+u->R, u->overlap_size * sizeof(float)); //overlap preserve (debug)
+    //zero out the bit beyond the real overlap so we don't add garbage next iteration
+    memset(overlap + u->overlap_size, 0, overlap_size - u->overlap_size);
+
+    ////debug: tests if basic buffering works
+    ////shouldn't modify the signal AT ALL (beyond roundoff)
+    //for(size_t j = 0; j < u->window_size; ++j){
+    //    dst[j] = src[j];
+    //}
+
+    //preseve the needed input for the next window's overlap
+    memmove(src, src + u->R,
+        (u->samples_gathered - u->R) * sizeof(float)
+    );
+}
+#endif
+
+static void flatten_to_memblockq(struct userdata *u){
+    size_t mbs = pa_mempool_block_size_max(u->sink->core->mempool);
+    pa_memchunk tchunk;
+    char *dst;
+    size_t i = 0;
+    while(i < u->output_buffer_length){
+        tchunk.index = 0;
+        tchunk.length = PA_MIN((u->output_buffer_length - i), mbs);
+        tchunk.memblock = pa_memblock_new(u->sink->core->mempool, tchunk.length);
+        //pa_log_debug("pushing %ld into the q", tchunk.length);
+        dst = pa_memblock_acquire(tchunk.memblock);
+        memcpy(dst, u->output_buffer + i, tchunk.length);
+        pa_memblock_release(tchunk.memblock);
+        pa_memblockq_push(u->output_q, &tchunk);
+        pa_memblock_unref(tchunk.memblock);
+        i += tchunk.length;
+    }
+}
+
+static void process_samples(struct userdata *u){
     size_t fs = pa_frame_size(&(u->sink->sample_spec));
-    float *dst;
     unsigned a_i;
     float *H, X;
     size_t iterations, offset;
     pa_assert(u->samples_gathered >= u->window_size);
     iterations = (u->samples_gathered - u->overlap_size) / u->R;
-    tchunk->index = 0;
-    tchunk->length = iterations * u->R * fs;
-    tchunk->memblock = pa_memblock_new(u->sink->core->mempool, tchunk->length);
-    dst = ((float*) pa_memblock_acquire(tchunk->memblock));
+    //make sure there is enough buffer memory allocated
+    if(iterations * u->R * fs > u->output_buffer_max_length){
+        u->output_buffer_max_length = iterations * u->R * fs;
+        if(u->output_buffer){
+            pa_xfree(u->output_buffer);
+        }
+        u->output_buffer = pa_xmalloc(u->output_buffer_max_length);
+    }
+    u->output_buffer_length = iterations * u->R * fs;
+
     for(size_t iter = 0; iter < iterations; ++iter){
         offset = iter * u->R * fs;
         for(size_t c = 0;c < u->channels; c++) {
@@ -534,14 +554,14 @@ static void process_samples(struct userdata *u, pa_memchunk *tchunk){
                     u->work_buffer[i] = u->W[i] <= FLT_EPSILON ? u->work_buffer[i] : u->work_buffer[i] / u->W[i];
                 }
             }
-            pa_sample_clamp(PA_SAMPLE_FLOAT32NE, (uint8_t *) (dst + c) + offset, fs, u->work_buffer, sizeof(float), u->R);
+            pa_sample_clamp(PA_SAMPLE_FLOAT32NE, (uint8_t *) (((float *)u->output_buffer) + c) + offset, fs, u->work_buffer, sizeof(float), u->R);
         }
         if(u->first_iteration){
             u->first_iteration = FALSE;
         }
         u->samples_gathered -= u->R;
     }
-    pa_memblock_release(tchunk->memblock);
+    flatten_to_memblockq(u);
 }
 
 static void input_buffer(struct userdata *u, pa_memchunk *in){
@@ -565,36 +585,49 @@ static void input_buffer(struct userdata *u, pa_memchunk *in){
 static int sink_input_pop_cb(pa_sink_input *i, size_t nbytes, pa_memchunk *chunk) {
     struct userdata *u;
     size_t fs, target_samples;
-    struct timeval start, end;
+    size_t mbs;
+    //struct timeval start, end;
     pa_memchunk tchunk;
     pa_sink_input_assert_ref(i);
     pa_assert_se(u = i->userdata);
     pa_assert(chunk);
     pa_assert(u->sink);
     fs = pa_frame_size(&(u->sink->sample_spec));
+    mbs = pa_mempool_block_size_max(u->sink->core->mempool);
+    if(pa_memblockq_get_length(u->output_q) > 0){
+        //pa_log_debug("qsize is %ld", pa_memblockq_get_length(u->output_q));
+        goto END;
+    }
+    //nbytes = PA_MIN(nbytes, pa_mempool_block_size_max(u->sink->core->mempool));
     target_samples = PA_ROUND_UP(nbytes / fs, u->R);
+    ////pa_log_debug("vanilla mbs = %ld",mbs);
+    //mbs = PA_ROUND_DOWN(mbs / fs, u->R);
+    //mbs = PA_MAX(mbs, u->R);
+    //target_samples = PA_MAX(target_samples, mbs);
+    //pa_log_debug("target samples: %ld", target_samples);
     if(u->first_iteration){
         //allocate request_size
         target_samples = PA_MAX(target_samples, u->window_size);
     }else{
         //allocate request_size + overlap
         target_samples += u->overlap_size;
-        alloc_input_buffers(u, target_samples);
     }
     alloc_input_buffers(u, target_samples);
+    //pa_log_debug("post target samples: %ld", target_samples);
     chunk->memblock = NULL;
 
     /* Hmm, process any rewind request that might be queued up */
     pa_sink_process_rewind(u->sink, 0);
 
     //pa_log_debug("start output-buffered %ld, input-buffered %ld, requested %ld",buffered_samples,u->samples_gathered,samples_requested);
-    pa_rtclock_get(&start);
+    //pa_rtclock_get(&start);
     do{
         size_t input_remaining = target_samples - u->samples_gathered;
+       // pa_log_debug("input remaining %ld samples", input_remaining);
         pa_assert(input_remaining > 0);
         while(pa_memblockq_peek(u->input_q, &tchunk) < 0){
             //pa_sink_render(u->sink, input_remaining * fs, &tchunk);
-            pa_sink_render_full(u->sink, input_remaining * fs, &tchunk);
+            pa_sink_render_full(u->sink, PA_MIN(input_remaining * fs, mbs), &tchunk);
             pa_assert(tchunk.memblock);
             pa_memblockq_push(u->input_q, &tchunk);
             pa_memblock_unref(tchunk.memblock);
@@ -605,25 +638,27 @@ static int sink_input_pop_cb(pa_sink_input *i, size_t nbytes, pa_memchunk *chunk
         //pa_log_debug("asked for %ld input samples, got %ld samples",input_remaining,buffer->length/fs);
         /* copy new input */
         //pa_rtclock_get(start);
+       // pa_log_debug("buffering %ld bytes", tchunk.length);
         input_buffer(u, &tchunk);
         //pa_rtclock_get(&end);
         //pa_log_debug("Took %0.5f seconds to setup", pa_timeval_diff(end, start) / (double) PA_USEC_PER_SEC);
         pa_memblock_unref(tchunk.memblock);
     }while(u->samples_gathered < target_samples);
 
-    pa_rtclock_get(&end);
-    pa_log_debug("Took %0.6f seconds to get data", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC);
+    //pa_rtclock_get(&end);
+    //pa_log_debug("Took %0.6f seconds to get data", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC);
 
     pa_assert(u->fft_size >= u->window_size);
     pa_assert(u->R < u->window_size);
-    /* set the H filter */
-    pa_rtclock_get(&start);
+    //pa_rtclock_get(&start);
     /* process a block */
-    process_samples(u, chunk);
-    pa_rtclock_get(&end);
-    pa_log_debug("Took %0.6f seconds to process", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC);
-
+    process_samples(u);
+    //pa_rtclock_get(&end);
+    //pa_log_debug("Took %0.6f seconds to process", (double) pa_timeval_diff(&end, &start) / PA_USEC_PER_SEC);
+END:
+    pa_assert_se(pa_memblockq_peek(u->output_q, chunk) >= 0);
     pa_assert(chunk->memblock);
+    pa_memblockq_drop(u->output_q, chunk->length);
     //pa_log_debug("gave %ld", chunk->length/fs);
     //pa_log_debug("end pop");
     return 0;
@@ -685,7 +720,7 @@ static void sink_input_process_rewind_cb(pa_sink_input *i, size_t nbytes) {
             //invalidate the output q
             pa_memblockq_seek(u->input_q, - (int64_t) amount, PA_SEEK_RELATIVE, TRUE);
             pa_log("Resetting filter");
-            reset_filter(u);
+            //reset_filter(u); //this is the "proper" thing to do...
         }
     }
 
@@ -814,33 +849,35 @@ static void sink_input_state_change_cb(pa_sink_input *i, pa_sink_input_state_t s
 static void pack(char **strs, size_t len, char **packed, size_t *length){
     size_t t_len = 0;
     size_t headers = (1+len) * sizeof(uint16_t);
-    size_t offset = sizeof(uint16_t);
+    char *p;
     for(size_t i = 0; i < len; ++i){
         t_len += strlen(strs[i]);
     }
     *length = headers + t_len;
-    *packed = pa_xmalloc0(*length);
-    ((uint16_t *) *packed)[0] = (uint16_t) len;
+    p = *packed = pa_xmalloc0(*length);
+    *((uint16_t *) p) = (uint16_t) len;
+    p += sizeof(uint16_t);
     for(size_t i = 0; i < len; ++i){
         uint16_t l = strlen(strs[i]);
-        *((uint16_t *)(*packed + offset)) = l;
-        offset += sizeof(uint16_t);
-        memcpy(*packed + offset, strs[i], l);
-        offset += l;
+        *((uint16_t *) p) = (uint16_t) l;
+        p += sizeof(uint16_t);
+        memcpy(p, strs[i], l);
+        p += l;
     }
 }
 static void unpack(char *str, size_t length, char ***strs, size_t *len){
-    size_t offset = sizeof(uint16_t);
-    *len = ((uint16_t *)str)[0];
+    char *p = str;
+    *len = *((uint16_t *) p);
+    p += sizeof(uint16_t);
     *strs = pa_xnew(char *, *len);
+
     for(size_t i = 0; i < *len; ++i){
-        size_t l = *((uint16_t *)(str+offset));
-        size_t e = PA_MIN(offset + l, length) - offset;
-        offset = PA_MIN(offset + sizeof(uint16_t), length);
-        (*strs)[i] = pa_xnew(char, e + 1);
-        memcpy((*strs)[i], str + offset, e);
-        (*strs)[i][e] = '\0';
-        offset += l;
+        size_t l = *((uint16_t *) p);
+        p += sizeof(uint16_t);
+        (*strs)[i] = pa_xnew(char, l + 1);
+        memcpy((*strs)[i], p, l);
+        (*strs)[i][l] = '\0';
+        p += l;
     }
 }
 static void save_profile(struct userdata *u, size_t channel, char *name){
@@ -885,17 +922,17 @@ static void save_state(struct userdata *u){
 
     pack(u->base_profiles, u->channels, &packed, &packed_length);
     state = (float *) pa_xmalloc0(filter_state_size + packed_length);
+    memcpy(state + FILTER_STATE_SIZE, packed, packed_length);
+    pa_xfree(packed);
 
     for(size_t c = 0; c < u->channels; ++c){
         a_i = pa_aupdate_read_begin(u->a_H[c]);
-        state[c * CHANNEL_PROFILE_SIZE] = u->Xs[a_i][c];
+        state[c * CHANNEL_PROFILE_SIZE] = u->Xs[c][a_i];
         H = u->Hs[c][a_i];
-        H_n = state + c * CHANNEL_PROFILE_SIZE + 1;
+        H_n = &state[c * CHANNEL_PROFILE_SIZE + 1];
         memcpy(H_n, H, FILTER_SIZE * sizeof(float));
         pa_aupdate_read_end(u->a_H[c]);
     }
-    memcpy(((char *)state) + filter_state_size, packed, packed_length);
-    pa_xfree(packed);
 
     key.data = state_name;
     key.size = strlen(key.data);
@@ -978,13 +1015,13 @@ static void load_state(struct userdata *u){
                 memcpy(u->Hs[c][a_i], H, FILTER_SIZE * sizeof(float));
                 pa_aupdate_write_end(u->a_H[c]);
             }
-            //unpack(((char *)value.data) + FILTER_STATE_SIZE, value.size - FILTER_STATE_SIZE, &names, &n_profs);
-            //n_profs = PA_MIN(n_profs, u->channels);
-            //for(size_t c = 0; c < n_profs; ++c){
-            //    pa_xfree(u->base_profiles[c]);
-            //    u->base_profiles[c] = names[c];
-            //}
-            //pa_xfree(names);
+            unpack(((char *)value.data) + FILTER_STATE_SIZE * sizeof(float), value.size - FILTER_STATE_SIZE * sizeof(float), &names, &n_profs);
+            n_profs = PA_MIN(n_profs, u->channels);
+            for(size_t c = 0; c < n_profs; ++c){
+                pa_xfree(u->base_profiles[c]);
+                u->base_profiles[c] = names[c];
+            }
+            pa_xfree(names);
         }
         pa_datum_free(&value);
     }else{
@@ -1062,9 +1099,12 @@ int pa__init(pa_module*m) {
     pa_modargs_get_value_boolean(ma, "set_default", &u->set_default);
 
     u->channels = ss.channels;
-    u->fft_size = pow(2, ceil(log(ss.rate)/log(2)));//probably unstable near corner cases of powers of 2
+    u->fft_size = pow(2, ceil(log(ss.rate) / log(2)));//probably unstable near corner cases of powers of 2
     pa_log_debug("fft size: %ld", u->fft_size);
     u->window_size = 15999;
+    if(u->window_size % 2 == 0){
+        u->window_size--;
+    }
     u->R = (u->window_size + 1) / 2;
     u->overlap_size = u->window_size - u->R;
     u->samples_gathered = 0;
@@ -1088,7 +1128,6 @@ int pa__init(pa_module*m) {
         u->a_H[c] = pa_aupdate_new();
         u->input[c] = NULL;
         u->overlap_accum[c] = alloc(u->overlap_size, sizeof(float));
-        memset(u->overlap_accum[c], 0, u->overlap_size*sizeof(float));
     }
     u->output_window = alloc((FILTER_SIZE), sizeof(fftwf_complex));
     u->forward_plan = fftwf_plan_dft_r2c_1d(u->fft_size, u->work_buffer, u->output_window, FFTW_ESTIMATE);
@@ -1139,6 +1178,10 @@ int pa__init(pa_module*m) {
     u->sink->set_mute = sink_set_mute_cb;
     u->sink->userdata = u;
     u->input_q = pa_memblockq_new(0,  MEMBLOCKQ_MAXLENGTH, 0, fs, 1, 1, 0, &u->sink->silence);
+    u->output_q = pa_memblockq_new(0,  MEMBLOCKQ_MAXLENGTH, 0, fs, 1, 1, 0, NULL);
+    u->output_buffer = NULL;
+    u->output_buffer_length = 0;
+    u->output_buffer_max_length = 0;
 
     pa_sink_set_asyncmsgq(u->sink, master->asyncmsgq);
     //pa_sink_set_fixed_latency(u->sink, pa_bytes_to_usec(u->R*fs, &ss));
@@ -1251,6 +1294,10 @@ void pa__done(pa_module*m) {
     if (u->sink)
         pa_sink_unref(u->sink);
 
+    if(u->output_buffer){
+        pa_xfree(u->output_buffer);
+    }
+    pa_memblockq_free(u->output_q);
     pa_memblockq_free(u->input_q);
 
     fftwf_destroy_plan(u->inverse_plan);