/* Karatsuba convolution * * Copyright (C) 1999 Ralph Loader * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Library General Public * License as published by the Free Software Foundation; either * version 2 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Library General Public License for more details. * * You should have received a copy of the GNU Library General Public * License along with this library; if not, write to the * Free Software Foundation, Inc., 59 Temple Place - Suite 330, * Boston, MA 02111-1307, USA. * * * Note: 7th December 2004: This file used to be licensed under the GPL, * but we got permission from Ralp Loader to relicense it to LGPL. * * $Id$ * */ /* The algorithm is based on the following. For the convolution of a pair * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c - * b.d. A similar relation enables us to compute a 2n by 2n convolution * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n * multiplications (as opposed to the 4^n that the quadratic algorithm * takes. */ /* For large n, this is slower than the O(n log n) that the FFT method * takes, but we avoid using complex numbers, and we only have to compute * one convolution, as opposed to 3 FFTs. We have good locality-of- * reference as well, which will help on CPUs with tiny caches. */ /* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160 * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba * algorithm. We actually want 257 outputs of a 256 x 512 convolution; * that doesn't appear to give an easy advantage for the FFT algorithm, but * for the Karatsuba algorithm, it's easy to use two 256 x 256 * convolutions, taking 2 x 3^8 = 12312 multiplications. [This difference * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n, * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n * - 1]. */ /* There's a big lie above, actually... for a 4x4 convolution, it's quicker * to do it using 16 multiplications than the more complex Karatsuba * algorithm... So the recursion bottoms out at 4x4s. This increases the * number of multiplications by a factor of 16/9, but reduces the overheads * dramatically. */ /* The convolution algorithm is implemented as a stack machine. We have a * stack of commands, each in one of the forms "do a 2^n x 2^n * convolution", or "combine these three length 2^n outputs into one * 2^{n+1} output." */ #ifdef HAVE_CONFIG_H #include "config.h" #endif #include #include "convolve.h" typedef union stack_entry_s { struct { const double *left, *right; double *out; } v; struct { double *main, *null; } b; } stack_entry; #define STACK_SIZE (CONVOLVE_DEPTH * 3) struct _struct_convolve_state { double left[CONVOLVE_BIG]; double right[CONVOLVE_SMALL * 3]; double scratch[CONVOLVE_SMALL * 3]; stack_entry stack[STACK_SIZE]; }; /* * Initialisation routine - sets up tables and space to work in. * Returns a pointer to internal state, to be used when performing calls. * On error, returns NULL. * The pointer should be freed when it is finished with, by convolve_close(). */ convolve_state * convolve_init (void) { return (convolve_state *) malloc (sizeof (convolve_state)); } /* * Free the state allocated with convolve_init(). */ void convolve_close (convolve_state * state) { if (state) free (state); } static void convolve_4 (double *out, const double *left, const double *right) /* This does a 4x4 -> 7 convolution. For what it's worth, the slightly odd * ordering gives about a 1% speed up on my Pentium II. */ { double l0, l1, l2, l3, r0, r1, r2, r3; double a; l0 = left[0]; r0 = right[0]; a = l0 * r0; l1 = left[1]; r1 = right[1]; out[0] = a; a = (l0 * r1) + (l1 * r0); l2 = left[2]; r2 = right[2]; out[1] = a; a = (l0 * r2) + (l1 * r1) + (l2 * r0); l3 = left[3]; r3 = right[3]; out[2] = a; out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0); out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1); out[5] = (l2 * r3) + (l3 * r2); out[6] = l3 * r3; } static void convolve_run (stack_entry * top, unsigned size, double *scratch) /* Interpret a stack of commands. The stack starts with two entries; the * convolution to do, and an illegal entry used to mark the stack top. The * size is the number of entries in each input, and must be a power of 2, * and at least 8. It is OK to have out equal to left and/or right. * scratch must have length 3*size. The number of stack entries needed is * 3n-4 where size=2^n. */ { do { const double *left; const double *right; double *out; /* When we get here, the stack top is always a convolve, * with size > 4. So we will split it. We repeatedly split * the top entry until we get to size = 4. */ left = top->v.left; right = top->v.right; out = top->v.out; top++; do { double *s_left, *s_right; int i; /* Halve the size. */ size >>= 1; /* Allocate the scratch areas. */ s_left = scratch + size * 3; /* s_right is a length 2*size buffer also used for * intermediate output. */ s_right = scratch + size * 4; /* Create the intermediate factors. */ for (i = 0; i < size; i++) { double l = left[i] + left[i + size]; double r = right[i] + right[i + size]; s_left[i + size] = r; s_left[i] = l; } /* Push the combine entry onto the stack. */ top -= 3; top[2].b.main = out; top[2].b.null = NULL; /* Push the low entry onto the stack. This must be * the last of the three sub-convolutions, because * it may overwrite the arguments. */ top[1].v.left = left; top[1].v.right = right; top[1].v.out = out; /* Push the mid entry onto the stack. */ top[0].v.left = s_left; top[0].v.right = s_right; top[0].v.out = s_right; /* Leave the high entry in variables. */ left += size; right += size; out += size * 2; } while (size > 4); /* When we get here, the stack top is a group of 3 * convolves, with size = 4, followed by some combines. */ convolve_4 (out, left, right); convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right); convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right); top += 2; /* Now process combines. */ do { /* b.main is the output buffer, mid is the middle * part which needs to be adjusted in place, and * then folded back into the output. We do this in * a slightly strange way, so as to avoid having * two loops. */ double *out = top->b.main; double *mid = scratch + size * 4; unsigned int i; top++; out[size * 2 - 1] = 0; for (i = 0; i < size - 1; i++) { double lo; double hi; lo = mid[0] - (out[0] + out[2 * size]) + out[size]; hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size]; out[size] = lo; out[2 * size] = hi; out++; mid++; } size <<= 1; } while (top->b.null == NULL); } while (top->b.main != NULL); } int convolve_match (const int *lastchoice, const short *input, convolve_state * state) /* lastchoice is a 256 sized array. input is a 512 array. We find the * contiguous length 256 sub-array of input that best matches lastchoice. * A measure of how good a sub-array is compared with the lastchoice is * given by the sum of the products of each pair of entries. We maximise * that, by taking an appropriate convolution, and then finding the maximum * entry in the convolutions. state is a (non-NULL) pointer returned by * convolve_init. */ { double avg; double best; int p = 0; int i; double *left = state->left; double *right = state->right; double *scratch = state->scratch; stack_entry *top = state->stack + STACK_SIZE - 1; #if 1 for (i = 0; i < 512; i++) left[i] = input[i]; avg = 0; for (i = 0; i < 256; i++) { double a = lastchoice[255 - i]; right[i] = a; avg += a; } #endif /* We adjust the smaller of the two input arrays to have average * value 0. This makes the eventual result insensitive to both * constant offsets and positive multipliers of the inputs. */ avg /= 256; for (i = 0; i < 256; i++) right[i] -= avg; /* End-of-stack marker. */ #if 0 /* The following line produces a CRASH, need to figure out why?!! */ top[1].b.null = scratch; #endif top[1].b.main = NULL; /* The low 256x256, of which we want the high 256 outputs. */ top->v.left = left; top->v.right = right; top->v.out = right + 256; convolve_run (top, 256, scratch); /* The high 256x256, of which we want the low 256 outputs. */ top->v.left = left + 256; top->v.right = right; top->v.out = right; convolve_run (top, 256, scratch); /* Now find the best position amoungs this. Apart from the first * and last, the required convolution outputs are formed by adding * outputs from the two convolutions above. */ best = right[511]; right[767] = 0; p = -1; for (i = 0; i < 256; i++) { double a = right[i] + right[i + 512]; if (a > best) { best = a; p = i; } } p++; #if 0 { /* This is some debugging code... */ int bad = 0; best = 0; for (i = 0; i < 256; i++) best += ((double) input[i + p]) * ((double) lastchoice[i] - avg); for (i = 0; i < 257; i++) { double tot = 0; unsigned int j; for (j = 0; j < 256; j++) tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg); if (tot > best) printf ("(%i)", i); if (tot != left[i + 255]) printf ("!"); } printf ("%i\n", p); } #endif return p; }