/* Copyright (C) 2003 Reliable Software Group
 *                    - University of California, Santa Barbara
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

/* CVS $Id: numerical_utilities.cpp,v 1.9 2004/10/20 00:34:58 dhm Exp $ */

#include <assert.h>
#include <math.h>
#include <anomaly.h>

// Absolute tolerance used by fpEqual() when comparing two doubles.
const double numericalUtilities::_epsilon = 1E-6;
// When true, doubleHistogram() and test() print extra diagnostics to cout.
const bool   numericalUtilities::_debug = false;


/** Arithmetic mean of a list of NumericalItem elements.
 * @param l: list whose elements are all expected to be NumericalItem
 * @return the sum of the element values divided by the element count,
 *         or 0.0 when the list is empty
 */
double numericalUtilities::mean(ListCollection *l) {
  if (l->size() == 0)
    return 0.0;

  double total = 0.0;
  ListCollection::iterator it = l->begin();

  while (it != l->end()) {
    NumericalItem *item = dynamic_cast<NumericalItem *>(*it);
    assert(item);  // a failed cast means the element is not a NumericalItem
    total += item->value();
    ++it;
  }

  return total / l->size();
}


/** Arithmetic mean of a list of DoubleItem elements.
 *
 *  Unlike mean(), an element of the wrong type is reported on cerr and
 *  terminates the process with exit(1), regardless of assert settings.
 * @param l: list whose elements are all expected to be DoubleItem
 * @return the sum of the element values divided by the element count,
 *         or 0.0 when the list is empty
 */
double numericalUtilities::doubleMean(ListCollection *l) {
  if (l->size() == 0)
    return 0.0;

  double total = 0.0;
  unsigned int position = 0;

  for (ListCollection::iterator it = l->begin(); it != l->end();
       ++it, ++position) {
    DoubleItem *item = dynamic_cast<DoubleItem *>(*it);

    if (!item) {
      cerr << "numericalUtilities::doubleMean: x[" << position << "] "
           << "is not of type (DoubleItem *)! exiting." << endl;
      exit(1);
    }

    total += item->get_value();
  }

  return total / l->size();
}

/** Largest value in a list of NumericalItem elements.
 * @param l: list whose elements are all expected to be NumericalItem
 * @return the greatest element value, or 0.0 when the list is empty
 */
double numericalUtilities::max(ListCollection *l) {
  if (l->size() == 0)
    return 0.0;

  ListCollection::iterator it = l->begin();
  NumericalItem *head = dynamic_cast<NumericalItem *>(*it);
  assert(head);
  double largest = head->value();

  // The scan restarts at the first element, so it is compared with itself once.
  for (it = l->begin(); it != l->end(); ++it) {
    NumericalItem *item = dynamic_cast<NumericalItem *>(*it);
    assert(item);

    const double candidate = item->value();
    if (candidate > largest)
      largest = candidate;
  }

  return largest;
}


/** Sample variance of a list of NumericalItem elements.
 * @param l: list whose elements are all expected to be NumericalItem
 * @return the sum of squared deviations from mean(l) divided by (size - 1),
 *         or 0.0 when the list holds fewer than two elements
 */
double numericalUtilities::variance(ListCollection *l) {
  if (l->size() <= 1)
    return 0.0;

  const double average = numericalUtilities::mean(l);
  double squaredDeviations = 0.0;

  for (ListCollection::iterator it = l->begin(); it != l->end(); ++it) {
    NumericalItem *item = dynamic_cast<NumericalItem *>(*it);
    assert(item);

    const double deviation = item->value() - average;
    squaredDeviations += deviation * deviation;
  }

  return squaredDeviations / (l->size() - 1);
}

/** Sample variance of a list of DoubleItem elements.
 * @param l: list whose elements are all expected to be DoubleItem
 * @return the sum of squared deviations from doubleMean(l) divided by
 *         (size - 1), or 0.0 when the list holds fewer than two elements
 */
double numericalUtilities::doubleVariance(ListCollection *l) {
  if (l->size() <= 1)
    return 0.0;

  const double average = numericalUtilities::doubleMean(l);
  double squaredDeviations = 0.0;

  for (ListCollection::iterator it = l->begin(); it != l->end(); ++it) {
    DoubleItem *item = dynamic_cast<DoubleItem *>(*it);
    assert(item);

    const double deviation = item->get_value() - average;
    squaredDeviations += deviation * deviation;
  }

  return squaredDeviations / (l->size() - 1);
}


/** Standard deviation of a list of NumericalItem elements.
 * @param l: list handed unchanged to variance()
 * @return the square root of variance(l)
 */
double numericalUtilities::standardDeviation(ListCollection *l) {
  return sqrt(numericalUtilities::variance(l));
}


/** Standard deviation of a list of DoubleItem elements.
 * @param l: list handed unchanged to doubleVariance()
 * @return the square root of doubleVariance(l)
 */
double numericalUtilities::doubleStandardDeviation(ListCollection *l) {
  return sqrt(numericalUtilities::doubleVariance(l));
}

/** Compute the covariance of two equally long sequences of DoubleItem.
 *
 *  The value is not accumulated directly; it is derived from the
 *  correlation as cov(X,Y) = cor(X,Y) * sd(X) * sd(Y), using
 *  pearsonProductMomentCorrelation() and doubleStandardDeviation().
 * @param sequenceX: first sequence
 * @param sequenceY: second sequence (asserted to match sequenceX in length)
 * @return the correlation multiplied by both standard deviations
 */
double numericalUtilities::doubleCovariance(ListCollection *sequenceX,
                                            ListCollection *sequenceY) {
  assert(sequenceX->size() == sequenceY->size());

  const double sdX = doubleStandardDeviation(sequenceX);
  const double sdY = doubleStandardDeviation(sequenceY);
  const double correlation =
    pearsonProductMomentCorrelation(sequenceX, sequenceY);

  return correlation * sdX * sdY;
}

/** Compute the Pearson product-moment sample correlation between two
 *  sequences of DoubleItem, using the running-sums form
 *
 *    r = (N*Sxy - Sx*Sy) / sqrt((N*Sxx - Sx^2) * (N*Syy - Sy^2))
 *
 *  Length of sequences must match. When the denominator is within the
 *  fpEqual() tolerance of zero (e.g. zero variance in either sequence)
 *  the result is defined as zero correlation.
 * @param X1: first sequence
 * @param X2: second sequence
 * @return the correlation coefficient, or 0.0 for a (near-)zero denominator
 * @url: http://ww2.mcgill.ca/course/204204B01/applets/explanations/corexpl.html
 */
double
numericalUtilities::pearsonProductMomentCorrelation(ListCollection *X1,
                                                    ListCollection *X2) {
  assert(X1->size() == X2->size());

  const double count = static_cast<double>(X1->size());

  double total1 = 0.0;
  double total2 = 0.0;
  double squares1 = 0.0;
  double squares2 = 0.0;
  double products = 0.0;

  ListCollection::iterator it1 = X1->begin();
  ListCollection::iterator it2 = X2->begin();

  // Only X1's end is tested; the size assertion above keeps it2 in range.
  while (it1 != X1->end()) {
    DoubleItem *item1 = dynamic_cast<DoubleItem *>(*it1);
    assert(item1);
    DoubleItem *item2 = dynamic_cast<DoubleItem *>(*it2);
    assert(item2);

    const double a = item1->get_value();
    const double b = item2->get_value();

    total1 += a;
    total2 += b;
    squares1 += a * a;
    squares2 += b * b;
    products += a * b;

    ++it1;
    ++it2;
  }

  const double numerator = (count * products) - (total1 * total2);
  const double spread1 = count * squares1 - (total1 * total1);
  const double spread2 = count * squares2 - (total2 * total2);
  const double denominator = sqrt(spread1 * spread2);

  if (fpEqual(denominator, 0.0))
    return 0.0;

  return numerator / denominator;
}


/** Approximate equality of two doubles using an absolute tolerance.
 * @param a: first value
 * @param b: second value
 * @return false when |a - b| exceeds _epsilon, true otherwise
 */
bool numericalUtilities::fpEqual(double a, double b) {
  const double gap = fabs(a - b);

  // Written as a negated '>' on purpose: under IEEE comparison rules a NaN
  // gap makes the '>' test false, so such inputs are reported as equal.
  return !(gap > _epsilon);
}

/** Print every element of a DoubleItem list to cout, one per line, in the
 *  form "x[<index>] = <value>". Nothing is printed for an empty list.
 * @param l: list whose elements are all expected to be DoubleItem
 */
void numericalUtilities::doublePrintValues(ListCollection *l) {
  if (l->size() == 0)
    return;

  unsigned int position = 0;
  ListCollection::iterator it = l->begin();

  while (it != l->end()) {
    DoubleItem *item = dynamic_cast<DoubleItem *>(*it);
    assert(item);
    cout << "x[" << position << "] = " << item->get_value() << endl;
    ++it;
    ++position;
  }
}

/** Print a histogram of a DoubleItem list, deriving the value range from
 *  the data itself.
 *
 *  The smallest and largest element values are located and passed as the
 *  low/high bounds to the four-argument doubleHistogram() overload. Nothing
 *  happens for an empty list.
 *
 *  NOTE(review): when all elements share one value (including a
 *  single-element list) the derived bounds are equal, which trips the
 *  `high > low` assertion in the four-argument overload.
 * @param l: list whose elements are all expected to be DoubleItem
 * @param bucketSize: width of each histogram bucket
 */
void numericalUtilities::doubleHistogram(ListCollection *l,
                                         double bucketSize) {
  if (l->size() == 0)
    return;

  // Seed both bounds from the first element. The cast result is checked
  // before it is dereferenced, like every other element access in this file.
  ListCollection::iterator i = l->begin();
  DoubleItem *first = dynamic_cast<DoubleItem *>(*i);
  assert(first);

  double lowest = first->get_value();
  double highest = lowest;

  // The first element already seeded the bounds, so continue from the second.
  for (++i; i != l->end(); ++i) {
    DoubleItem *ni = dynamic_cast<DoubleItem *>(*i);
    assert(ni);

    const double value = ni->get_value();

    if (lowest > value)
      lowest = value;

    if (highest < value)
      highest = value;
  }

  numericalUtilities::doubleHistogram(l, lowest, highest, bucketSize);
}

/** Print a histogram of a DoubleItem list over the interval [low, high].
 *
 *  Each element inside the interval is counted in the bucket with index
 *  (value - low) / bucketSize, truncated. Elements outside the interval are
 *  skipped with a warning on cerr. One line per bucket is then written to
 *  cout as "<bucket lower bound> <count>". Nothing is printed for an empty
 *  list.
 * @param l: list whose elements are all expected to be DoubleItem
 * @param low: lower bound of the histogram (asserted to be below high)
 * @param high: upper bound of the histogram
 * @param bucketSize: bucket width; asserted to be positive and smaller
 *        than (high - low)
 */
void numericalUtilities::doubleHistogram(ListCollection *l,
                                         double low,
                                         double high,
                                         double bucketSize) {
  ListCollection::iterator i;
  assert(high > low);
  // A zero or negative width would make the index computation below divide
  // by zero or convert a negative quotient to unsigned.
  assert(bucketSize > 0.0);
  assert(bucketSize < (high - low));

  if (l->size() == 0)
    return;

  // A value equal to `high` maps to index numBuckets - 2, so every index
  // computed below stays inside the array.
  unsigned int numBuckets = ((unsigned int) ((high - low) / bucketSize)) + 2;
  unsigned int *buckets = new unsigned int[numBuckets];
  unsigned int counter, index, j;

  if (_debug) {
    cout << "numericalUtilities::doubleHistogram: " << endl;
    cout << "  creating hist in interval (" << low << "," << high
         << ") with " << numBuckets << " buckets and bucketSize="
         << bucketSize << endl;
  }

  for (j = 0; j < numBuckets; j++)
    buckets[j] = 0;

  for (i = l->begin(), counter = 0; i != l->end(); i++, counter++) {
    DoubleItem *ni = dynamic_cast<DoubleItem *>(*i);
    assert(ni);

    if ((ni->get_value() < low) || (ni->get_value() > high)) {
      cerr << "numericalUtilities::doubleHistogram: warning: skipped x["
           << counter << "]" << endl;
      continue;
    }

    index = (unsigned int) ((ni->get_value() - low) / bucketSize);
    buckets[index]++;

    if (_debug) {
      cout << "inserted x[" << counter << "] = " << ni->get_value() << endl;
    }
  }

  for (j = 0; j < numBuckets; j++)
    cout << (low + j * bucketSize) << " " << buckets[j] << endl;

  // Release the bucket array; it was previously leaked on every call.
  delete [] buckets;
}

/** Self-test of the DoubleItem statistics routines.
 *
 *  Builds three sequences of 100 values: X = 0..99, Y = 2*X, and Z drawn
 *  from rand() scaled to (0,100) with a fixed seed so that runs are
 *  repeatable. It then checks that cor(X,Y) equals 1 within the fpEqual()
 *  tolerance and that |cor(X,Z)| does not exceed a small limit.
 * @param verbose: when true, print the computed means, standard deviations,
 *        covariances and correlations to cout
 * @return true when both checks pass
 * @throws LibAnomalyException when either check fails
 */
bool numericalUtilities::test(bool verbose) {
  ListCollection sequenceX;
  ListCollection sequenceY;
  ListCollection sequenceZ;
  unsigned int i;
  unsigned int sequenceLength = 100;
  double randomLinearCorrelationLimit = 0.036;

  // Fixed seed: the bound on |cor(X,Z)| below is checked against this
  // particular pseudo-random sequence.
  srand(123456);

  // NOTE(review): the DoubleItem objects below are allocated with new and
  // never deleted in this function; presumably ListCollection takes
  // ownership of its elements -- confirm.
  for (i = 0; i < sequenceLength; i++) {
    sequenceX.push_back(new DoubleItem((double) i));
  }

  for (i = 0; i < sequenceLength; i++) {
    sequenceY.push_back(new DoubleItem((double) i * 2));
  }

  // random uniform (0,1) distribution scaled up to (0,100)
  for (i = 0; i < sequenceLength; i++) {
    sequenceZ.push_back(new DoubleItem(rand() / (double) RAND_MAX * 100.0));
  }

  if (_debug) {
    cout << "X = " << sequenceX << endl;
    cout << "Y = " << sequenceY << endl;
    cout << "Z = " << sequenceZ << endl;
  }

  double meanX = numericalUtilities::doubleMean(&sequenceX);
  double meanY = numericalUtilities::doubleMean(&sequenceY);
  double meanZ = numericalUtilities::doubleMean(&sequenceZ);

  double sdX = numericalUtilities::doubleStandardDeviation(&sequenceX);
  double sdY = numericalUtilities::doubleStandardDeviation(&sequenceY);
  double sdZ = numericalUtilities::doubleStandardDeviation(&sequenceZ);

  double covXY =
    numericalUtilities::doubleCovariance(&sequenceX, &sequenceY);
  double ppmCorXY =
    numericalUtilities::pearsonProductMomentCorrelation(&sequenceX, &sequenceY);

  double covXZ =
    numericalUtilities::doubleCovariance(&sequenceX, &sequenceZ);
  double ppmCorXZ =
    numericalUtilities::pearsonProductMomentCorrelation(&sequenceX, &sequenceZ);

  if (verbose) {
    cout << "mu_X     = " << meanX << endl;
    cout << "mu_Y     = " << meanY << endl;
    cout << "mu_Z     = " << meanZ << endl;

    cout << "sd(X)    = " << sdX << endl;
    cout << "sd(Y)    = " << sdY << endl;
    cout << "sd(Z)    = " << sdZ << endl;

    cout << "cov(X,Y) = " << covXY << endl;
    cout << "cor(X,Y) = " << ppmCorXY << endl;

    cout << "cov(X,Z) = " << covXZ << endl;
    cout << "cor(X,Z) = " << ppmCorXZ << endl;
  }

  // Y is an exact linear function of X, so the correlation must be 1.
  if (!fpEqual(ppmCorXY, 1.0)) {
    throw LibAnomalyException("numericalUtilities::test: "
                              "cor(X,Y) doesn't look right\n");
  }

  // Z is pseudo-random, so its linear correlation with X should be small.
  // The message names cor(X,Z) because the correlation is what is tested.
  if (fabs(ppmCorXZ) > randomLinearCorrelationLimit) {
    throw LibAnomalyException("numericalUtilities::test: "
                              "cor(X,Z) doesn't look right\n");
  }

  return true;
}
