/* Karatsuba convolution
 *
 *  Copyright (C) 1999 Ralph Loader <suckfish@ihug.co.nz>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Library General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Library General Public License for more details.
 *
 * You should have received a copy of the GNU Library General Public
 * License along with this library; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 02111-1307, USA.
 *
 * 
 * Note: 7th December 2004: This file used to be licensed under the GPL,
 *       but we got permission from Ralp Loader to relicense it to LGPL.
 * 
 *  $Id$
 *
 */

/* The algorithm is based on the following.  For the convolution of a pair
 * of pairs, (a,b) * (c,d) = (0, a.c, a.d+b.c, b.d), we can reduce the four
 * multiplications to three, by the formulae a.d+b.c = (a+b).(c+d) - a.c -
 * b.d.  A similar relation enables us to compute a 2n by 2n convolution
 * using 3 n by n convolutions, and thus a 2^n by 2^n convolution using 3^n
 * multiplications (as opposed to the 4^n that the quadratic algorithm
 * takes. */

/* For large n, this is slower than the O(n log n) that the FFT method
 * takes, but we avoid using complex numbers, and we only have to compute
 * one convolution, as opposed to 3 FFTs.  We have good locality-of-
 * reference as well, which will help on CPUs with tiny caches.  */

/* E.g., for a 512 x 512 convolution, the FFT method takes 55 * 512 = 28160
 * (real) multiplications, as opposed to 3^9 = 19683 for the Karatsuba
 * algorithm.  We actually want 257 outputs of a 256 x 512 convolution;
 * that doesn't appear to give an easy advantage for the FFT algorithm, but
 * for the Karatsuba algorithm, it's easy to use two 256 x 256
 * convolutions, taking 2 x 3^8 = 12312 multiplications.  [This difference
 * is that the FFT method "wraps" the arrays, doing a 2^n x 2^n -> 2^n,
 * while the Karatsuba algorithm pads with zeros, doing 2^n x 2^n -> 2.2^n
 * - 1]. */

/* There's a big lie above, actually... for a 4x4 convolution, it's quicker
 * to do it using 16 multiplications than the more complex Karatsuba
 * algorithm...  So the recursion bottoms out at 4x4s.  This increases the
 * number of multiplications by a factor of 16/9, but reduces the overheads
 * dramatically. */

/* The convolution algorithm is implemented as a stack machine.  We have a
 * stack of commands, each in one of the forms "do a 2^n x 2^n
 * convolution", or "combine these three length 2^n outputs into one
 * 2^{n+1} output." */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <stdlib.h>
#include "convolve.h"

typedef union stack_entry_s
{
  struct
  {
    const double *left, *right;
    double *out;
  }
  v;
  struct
  {
    double *main, *null;
  }
  b;

}
stack_entry;

#define STACK_SIZE (CONVOLVE_DEPTH * 3)

struct _struct_convolve_state
{
  double left[CONVOLVE_BIG];
  double right[CONVOLVE_SMALL * 3];
  double scratch[CONVOLVE_SMALL * 3];
  stack_entry stack[STACK_SIZE];
};

/*
 * Initialisation routine - sets up tables and space to work in.
 * Returns a pointer to internal state, to be used when performing calls.
 * On error, returns NULL.
 * The pointer should be freed when it is finished with, by convolve_close().
 */
convolve_state *
convolve_init (void)
{
  return (convolve_state *) malloc (sizeof (convolve_state));
}

/*
 * Free the state allocated with convolve_init().
 */
void
convolve_close (convolve_state * state)
{
  if (state)
    free (state);
}

static void
convolve_4 (double *out, const double *left, const double *right)
/* This does a 4x4 -> 7 convolution.  For what it's worth, the slightly odd
 * ordering gives about a 1% speed up on my Pentium II. */
{
  double l0, l1, l2, l3, r0, r1, r2, r3;
  double a;

  l0 = left[0];
  r0 = right[0];
  a = l0 * r0;
  l1 = left[1];
  r1 = right[1];
  out[0] = a;
  a = (l0 * r1) + (l1 * r0);
  l2 = left[2];
  r2 = right[2];
  out[1] = a;
  a = (l0 * r2) + (l1 * r1) + (l2 * r0);
  l3 = left[3];
  r3 = right[3];
  out[2] = a;

  out[3] = (l0 * r3) + (l1 * r2) + (l2 * r1) + (l3 * r0);
  out[4] = (l1 * r3) + (l2 * r2) + (l3 * r1);
  out[5] = (l2 * r3) + (l3 * r2);
  out[6] = l3 * r3;
}

static void
convolve_run (stack_entry * top, unsigned size, double *scratch)
/* Interpret a stack of commands.  The stack starts with two entries; the
 * convolution to do, and an illegal entry used to mark the stack top.  The
 * size is the number of entries in each input, and must be a power of 2,
 * and at least 8.  It is OK to have out equal to left and/or right.
 * scratch must have length 3*size.  The number of stack entries needed is
 * 3n-4 where size=2^n. */
{
  do {
    const double *left;
    const double *right;
    double *out;

    /* When we get here, the stack top is always a convolve,
     * with size > 4.  So we will split it.  We repeatedly split
     * the top entry until we get to size = 4. */

    left = top->v.left;
    right = top->v.right;
    out = top->v.out;
    top++;

    do {
      double *s_left, *s_right;
      int i;

      /* Halve the size. */
      size >>= 1;

      /* Allocate the scratch areas. */
      s_left = scratch + size * 3;
      /* s_right is a length 2*size buffer also used for
       * intermediate output. */
      s_right = scratch + size * 4;

      /* Create the intermediate factors. */
      for (i = 0; i < size; i++) {
        double l = left[i] + left[i + size];
        double r = right[i] + right[i + size];

        s_left[i + size] = r;
        s_left[i] = l;
      }

      /* Push the combine entry onto the stack. */
      top -= 3;
      top[2].b.main = out;
      top[2].b.null = NULL;

      /* Push the low entry onto the stack.  This must be
       * the last of the three sub-convolutions, because
       * it may overwrite the arguments. */
      top[1].v.left = left;
      top[1].v.right = right;
      top[1].v.out = out;

      /* Push the mid entry onto the stack. */
      top[0].v.left = s_left;
      top[0].v.right = s_right;
      top[0].v.out = s_right;

      /* Leave the high entry in variables. */
      left += size;
      right += size;
      out += size * 2;

    } while (size > 4);

    /* When we get here, the stack top is a group of 3
     * convolves, with size = 4, followed by some combines.  */
    convolve_4 (out, left, right);
    convolve_4 (top[0].v.out, top[0].v.left, top[0].v.right);
    convolve_4 (top[1].v.out, top[1].v.left, top[1].v.right);
    top += 2;

    /* Now process combines. */
    do {
      /* b.main is the output buffer, mid is the middle
       * part which needs to be adjusted in place, and
       * then folded back into the output.  We do this in
       * a slightly strange way, so as to avoid having
       * two loops. */
      double *out = top->b.main;
      double *mid = scratch + size * 4;
      unsigned int i;

      top++;
      out[size * 2 - 1] = 0;
      for (i = 0; i < size - 1; i++) {
        double lo;
        double hi;

        lo = mid[0] - (out[0] + out[2 * size]) + out[size];
        hi = mid[size] - (out[size] + out[3 * size]) + out[2 * size];
        out[size] = lo;
        out[2 * size] = hi;
        out++;
        mid++;
      }
      size <<= 1;
    } while (top->b.null == NULL);
  } while (top->b.main != NULL);
}

int
convolve_match (const int *lastchoice,
    const short *input, convolve_state * state)
/* lastchoice is a 256 sized array.  input is a 512 array.  We find the
 * contiguous length 256 sub-array of input that best matches lastchoice.
 * A measure of how good a sub-array is compared with the lastchoice is
 * given by the sum of the products of each pair of entries.  We maximise
 * that, by taking an appropriate convolution, and then finding the maximum
 * entry in the convolutions.  state is a (non-NULL) pointer returned by
 * convolve_init.  */
{
  double avg;
  double best;
  int p = 0;
  int i;
  double *left = state->left;
  double *right = state->right;
  double *scratch = state->scratch;
  stack_entry *top = state->stack + STACK_SIZE - 1;

#if 1
  for (i = 0; i < 512; i++)
    left[i] = input[i];

  avg = 0;
  for (i = 0; i < 256; i++) {
    double a = lastchoice[255 - i];

    right[i] = a;
    avg += a;
  }
#endif
  /* We adjust the smaller of the two input arrays to have average
   * value 0.  This makes the eventual result insensitive to both
   * constant offsets and positive multipliers of the inputs. */
  avg /= 256;
  for (i = 0; i < 256; i++)
    right[i] -= avg;
  /* End-of-stack marker. */
#if     0                       /* The following line produces a CRASH, need to figure out why?!! */
  top[1].b.null = scratch;
#endif
  top[1].b.main = NULL;
  /* The low 256x256, of which we want the high 256 outputs. */
  top->v.left = left;
  top->v.right = right;
  top->v.out = right + 256;
  convolve_run (top, 256, scratch);

  /* The high 256x256, of which we want the low 256 outputs. */
  top->v.left = left + 256;
  top->v.right = right;
  top->v.out = right;
  convolve_run (top, 256, scratch);

  /* Now find the best position amoungs this.  Apart from the first
   * and last, the required convolution outputs are formed by adding
   * outputs from the two convolutions above. */
  best = right[511];
  right[767] = 0;
  p = -1;
  for (i = 0; i < 256; i++) {
    double a = right[i] + right[i + 512];

    if (a > best) {
      best = a;
      p = i;
    }
  }
  p++;

#if 0
  {
    /* This is some debugging code... */
    int bad = 0;

    best = 0;
    for (i = 0; i < 256; i++)
      best += ((double) input[i + p]) * ((double) lastchoice[i] - avg);

    for (i = 0; i < 257; i++) {
      double tot = 0;
      unsigned int j;

      for (j = 0; j < 256; j++)
        tot += ((double) input[i + j]) * ((double) lastchoice[j] - avg);
      if (tot > best)
        printf ("(%i)", i);
      if (tot != left[i + 255])
        printf ("!");
    }

    printf ("%i\n", p);
  }
#endif

  return p;
}