gst/monoscope/convolve.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349

/* Karatsuba convolution
 *
 *  Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * 
 * Note: 7th December 2004: This file used to be licensed under the GPL,
 *       but we got permission from Ralp Loader to relicense it to LGPL.
 * 
 *  $Id$
 *
 */

/* The algorithm is based on the following.  For the convolution of a pair
 * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
 * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
 * b.d.  A similar relation enables us to compute a 2n by 2n convolution
 * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
 * multiplications (as opposed to the 4^n that the quadratic algorithm
 * takes. */

/* For large n, this is slower than the O(n log n) that the FFT method
 * takes, but we avoid using complex numbers, and we only have to compute
 * one convolution, as opposed to 3 FFTs.  We have good locality-of-
 * reference as well, which will help on CPUs with tiny caches.  */

/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
 * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
 * algorithm.  We actually want 257 outputs of a 256 x 512 convolution;
 * that doesn't appear to give an easy advantage for the FFT algorithm, but
 * for the Karatsuba algorithm, it's easy to use two 256 x 256
 * convolutions, taking 2 x 3^8 = 12312 multiplications.  [This difference
 * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
 * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
 * - 1]. */

/* There's a big lie above, actually... for a 4x4 convolution, it's quicker
 * to do it using 16 multiplications than the more complex Karatsuba
 * algorithm...  So the recursion bottoms out at 4x4s.  This increases the
 * number of multiplications by a factor of 16/9, but reduces the overheads
 * dramatically. */

/* The convolution algorithm is implemented as a stack machine.  We have a
 * stack of commands, each in one of the forms "do a 2^n x 2^n
 * convolution", or "combine these three length 2^n outputs into one
 * 2^{n+1} output." */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include "convolve.h"

typedef union stack_entry_s
{
  struct
  {
    const double *left, *right;
    double *out;
  }
  v;
  struct
  {
    double *main, *null;
  }
  b;

}
stack_entry;

#define STACK_SIZE (CONVOLVE_DEPTH * 3)

struct _struct_convolve_state
{
  double left[CONVOLVE_BIG];
  double right[CONVOLVE_SMALL * 3];
  double scratch[CONVOLVE_SMALL * 3];
  stack_entry stack[STACK_SIZE];
};

/*
 * Initialisation routine - sets up tables and space to work in.
 * Returns a pointer to internal state, to be used when performing calls.
 * On error, returns NULL.
 * The pointer should be freed when it is finished with, by convolve_close().
 */
convolve_state *
convolve_init (void)
{
  return (convolve_state *) malloc (sizeof (convolve_state));
}

/*
 * Free the state allocated with convolve_init().
 */
void
convolve_close (convolve_state * state)
{
  if (state)
    free (state);
}

static void
convolve_4 (double *out, const double *left, const double *right)
/* This does a 4x4 -> 7 convolution.  For what it's worth, the slightly odd
 * ordering gives about a 1% speed up on my Pentium II. */
{
  double l0, l1, l2, l3, r0, r1, r2, r3;
  double a;

  l0 = left[0];
  r0 = right[0];
  a = l0 * r0;
  l1 = left[1];
  r1 = right[1];
  out[0] = a;
  a = (l0 * r1) + (l1 * r0);
  l2 = left[2];
  r2 = right[2];
  out[1] = a;
  a = (l0 * r2) + (l1 * r1) + (l2 * r0);
  l3 = left[3];
  r3 = right[3];
  out[2] = a;

  out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
  out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
  out[5] = (l2 * r3) + (l3 * r2);
  out[6] = l3 * r3;
}

static void
convolve_run (stack_entry * top, unsigned size, double *scratch)
/* Interpret a stack of commands.  The stack starts with two entries; the
 * convolution to do, and an illegal entry used to mark the stack top.  The
 * size is the number of entries in each input, and must be a power of 2,
 * and at least 8.  It is OK to have out equal to left and/or right.
 * scratch must have length 3*size.  The number of stack entries needed is
 * 3n-4 where size=2^n. */
{
  do {
    const double *left;
    const double *right;
    double *out;

    /* When we get here, the stack top is always a convolve,
     * with size > 4.  So we will split it.  We repeatedly split
     * the top entry until we get to size = 4. */

    left = top->v.left;
    right = top->v.right;
    out = top->v.out;
    top++;

    do {
      double *s_left, *s_right;
      int i;

      /* Halve the size. */
      size >>= 1;

      /* Allocate the scratch areas. */
      s_left = scratch + size * 3;
      /* s_right is a length 2*size buffer also used for
       * intermediate output. */
      s_right = scratch + size * 4;

      /* Create the intermediate factors. */
      for (i = 0; i < size; i++) {
        double l = left[i] + left[i + size];
        double r = right[i] + right[i + size];

        s_left[i + size] = r;
        s_left[i] = l;
      }

      /* Push the combine entry onto the stack. */
      top -= 3;
      top[2].b.main = out;
      top[2].b.null = NULL;

      /* Push the low entry onto the stack.  This must be
       * the last of the three sub-convolutions, because
       * it may overwrite the arguments. */
      top[1].v.left = left;
      top[1].v.right = right;
      top[1].v.out = out;

      /* Push the mid entry onto the stack. */
      top[0].v.left = s_left;
      top[0].v.right = s_right;
      top[0].v.out = s_right;

      /* Leave the high entry in variables. */
      left += size;
      right += size;
      out += size * 2;

    } while (size > 4);

    /* When we get here, the stack top is a group of 3
     * convolves, with size = 4, followed by some combines.  */
    convolve_4 (out, left, right);
    convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
    convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
    top += 2;

    /* Now process combines. */
    do {
      /* b.main is the output buffer, mid is the middle
       * part which needs to be adjusted in place, and
       * then folded back into the output.  We do this in
       * a slightly strange way, so as to avoid having
       * two loops. */
      double *out = top->b.main;
      double *mid = scratch + size * 4;
      unsigned int i;

      top++;
      out[size * 2 - 1] = 0;
      for (i = 0; i < size - 1; i++) {
        double lo;
        double hi;

        lo = mid[0] - (out[0] + out[2 * size]) + out[size];
        hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
        out[size] = lo;
        out[2 * size] = hi;
        out++;
        mid++;
      }
      size <<= 1;
    } while (top->b.null == NULL);
  } while (top->b.main != NULL);
}

int
convolve_match (const int *lastchoice,
    const short *input, convolve_state * state)
/* lastchoice is a 256 sized array.  input is a 512 array.  We find the
 * contiguous length 256 sub-array of input that best matches lastchoice.
 * A measure of how good a sub-array is compared with the lastchoice is
 * given by the sum of the products of each pair of entries.  We maximise
 * that, by taking an appropriate convolution, and then finding the maximum
 * entry in the convolutions.  state is a (non-NULL) pointer returned by
 * convolve_init.  */
{
  double avg;
  double best;
  int p = 0;
  int i;
  double *left = state->left;
  double *right = state->right;
  double *scratch = state->scratch;
  stack_entry *top = state->stack + STACK_SIZE - 1;

#if 1
  for (i = 0; i < 512; i++)
    left[i] = input[i];

  avg = 0;
  for (i = 0; i < 256; i++) {
    double a = lastchoice[255 - i];

    right[i] = a;
    avg += a;
  }
#endif
  /* We adjust the smaller of the two input arrays to have average
   * value 0.  This makes the eventual result insensitive to both
   * constant offsets and positive multipliers of the inputs. */
  avg /= 256;
  for (i = 0; i < 256; i++)
    right[i] -= avg;
  /* End-of-stack marker. */
#if     0                       /* The following line produces a CRASH, need to figure out why?!! */
  top[1].b.null = scratch;
#endif
  top[1].b.main = NULL;
  /* The low 256x256, of which we want the high 256 outputs. */
  top->v.left = left;
  top->v.right = right;
  top->v.out = right + 256;
  convolve_run (top, 256, scratch);

  /* The high 256x256, of which we want the low 256 outputs. */
  top->v.left = left + 256;
  top->v.right = right;
  top->v.out = right;
  convolve_run (top, 256, scratch);

  /* Now find the best position amoungs this.  Apart from the first
   * and last, the required convolution outputs are formed by adding
   * outputs from the two convolutions above. */
  best = right[511];
  right[767] = 0;
  p = -1;
  for (i = 0; i < 256; i++) {
    double a = right[i] + right[i + 512];

    if (a > best) {
      best = a;
      p = i;
    }
  }
  p++;

#if 0
  {
    /* This is some debugging code... */
    int bad = 0;

    best = 0;
    for (i = 0; i < 256; i++)
      best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);

    for (i = 0; i < 257; i++) {
      double tot = 0;
      unsigned int j;

      for (j = 0; j < 256; j++)
        tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
      if (tot > best)
        printf ("(%i)", i);
      if (tot != left[i + 255])
        printf ("!");
    }

    printf ("%i\n", p);
  }
#endif

  return p;
}