#ifndef MODEL_H__
#define MODEL_H__

/* Declares models that are used to evaluate the anomaly of 
 * features of input elements. Also declares the factories used
 * to instantiate models properly.
 * Copyright (C) 2003 Reliable Software Group
 *                    - University of California, Santa Barbara
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 */

/* CVS $Id: model.h,v 1.15 2003/10/06 23:17:16 dhm Exp $ */

#include <anomaly.h>


/** \brief The abstract model root class defines the minimal functionality 
 * that each model must support.
 *
 * A model supports two basic modes of operation. In one mode (learning), it is
 * used to learn the properties of normal instances of the feature
 * of input elements. In the other mode (detection), it is used to compare input
 * elements to the previously established profile and report the deviation (as a
 * double value in the interval [0,1]). When the deviation is significant, a value
 * close to zero is returned, otherwise a value close to one.
 *
 * insert_item(), switch_mode() and check_item() are pure virtual, so this
 * class cannot be instantiated directly; concrete models must implement
 * all three.
 */
class Model {
 public:

/** \enum ModelMode
 * Defines the different modes (i.e. states) that a model can be in.
 */ 
enum ModelMode { 
  Training, /**< The model is in learning mode (insert_item) */
  Detection /**< The model is in detection mode (check_item) */
};

 public:

  /** Default constructor.
   */
  Model(void);

  /** Constructor that assigns a name to this model.
   * The constructor is not declared explicit, so a string converts
   * implicitly to a Model wherever such a conversion is considered.
   * @param name: a string that names this model
   */
  Model(string name);

  /** Virtual destructor, so that derived models can be destroyed through
   * a pointer to Model.
   */
  virtual ~Model(void);

  /** Insert an item into the model during the training phase.
   * @param item: a pointer to the object that is added to the model
   */
  virtual void insert_item(Item *item) = 0;
  
  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   */
  virtual void switch_mode(ModelMode mode) = 0;

  /** Check a new item for accordance with the model profile and return
   * its anomaly score.
   * @param item: a pointer to the item that should be checked
   * @return a double value in the interval [0,1] where 0 is totally abnormal 
   * and 1 is perfectly normal
   */
  virtual double check_item(Item *item) = 0;

  /** Return the confidence that the model has in its anomaly scores
   * (as returned by check_item). This function is not pure virtual; derived
   * models may override it.
   * @return a double value in the interval [0,1]. 0 expresses that the
   * value returned by check_item is useless and 1 expresses that the model is
   * absolutely sure about the results of check_item.
   */
  virtual double get_confidence(void);

  /** Set the name of this model.
   * @param name: a string which sets the name of this model 
   */
  virtual void set_name(string name);

  /** Return the name of this model (default is empty string).
   * @return a string with the name of this model (returned by value)
   */
  virtual string get_name(void);

  /** Turn on/off debugging for this model.
   *  @param debug: new value for debug flag
   */
  virtual void set_debug(bool debug);

 protected:
  /** The current mode of the model (such as learning or detection).
   * Protected, so derived models can read and update it directly.
   */
  ModelMode _mode;

  /** The current debug mode of the model (on/off).
   */
  bool      _debug;

 private:

  /** The name of this model (could be used for debugging, identification, etc.).
   * Private: derived classes have to go through set_name() / get_name().
   */
  string _name;
};




/** \brief The abstract model factory root class defines the minimal functionality 
 * that each model factory must support.
 *
 * A model factory is a wrapper around a model. It is used to
 * instantiate its corresponding model and can provide basic
 * initialization. It also allows a user to encapsulate different
 * implementations of a certain model interface in the same factory
 * class. It is also useful for serialization as it can be used to
 * construct models from their serialized form.
 *
 * A factory is itself an Item (it derives from Item) and is abstract
 * because instance() is pure virtual.
 */
class ModelFactory : public Item {
 public:

  /** Virtual destructor.
   */
  virtual ~ModelFactory(void);

  /** Instantiate the encapsulated model.
   * @return a pointer to the instantiated model, typed as the Model base
   * class
   */
  virtual Model *instance(void) = 0;

  /** Return a hash value for this Item. This value is zero per
   * default (bad hash function), i.e. all factories that do not override
   * this function share the same hash value.
   * @return opaque hash_value of type size_t 
   */
  virtual size_t hash_value(void) const;
};




/** \brief The purpose of the token finder model is to determine whether the
 * values of a certain feature are drawn from a limited set
 * of possible alternatives (i.e., they are tokens or elements of an
 * enumeration).
 */
class TokenFinder : public Model {
 public:
  /** Default constructor.
   */
  TokenFinder(void);

  /** Virtual destructor.
   */
  virtual ~TokenFinder(void);

  /** Insert an item into the TokenFinder during the training phase.
   * @param item: a pointer to the object that is added to the model
   */
  virtual void insert_item(Item *item);
  
  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   * @throws ModelException: an illegal state transition has been
   * requested. Currently, TokenFinder can only be switched
   * to Detection (Model::ModelMode)
   */
  virtual void switch_mode(ModelMode mode) throw (ModelException);

  /** Check a new item for accordance with the model profile and return
   * its anomaly score.
   * @param item: a pointer to the item that should be checked
   * @return a double value in the interval [0,1]. When the TokenFinder
   * has concluded that the items are unique identifiers, it always returns
   * 1. When it stores an enumeration and the item is a member of this
   * enumeration, it returns 1, and 0 otherwise.
   * @throws ModelException: when the TokenFinder is not in state
   * Detection (Model::ModelMode)
   */
  virtual double check_item(Item *item) throw (ModelException);

  /** Module test function.
   * @param verbose: when set to true, output for each test is written to stderr, 
   * otherwise no output is generated
   * @return true, if all tests succeeded, false otherwise
   */
  static bool test(bool verbose);

 private:

  /** Internal function used in the process to determine if items are unique
   * or members of an enumeration.
   * This function calculates the mean of the integers in the argument list.
   * @param list: list of integers (note that the parameter name shadows
   * the list type inside the function)
   * @return mean of integers in list
   * @throws ModelException: when the list has fewer than one
   * element
   */
  double get_int_mean(list<int> &list) throw (ModelException);

  /** Internal function used in the process to determine if items are unique
   * or members of an enumeration.
   * This function calculates the variance of the integers in the argument list.
   * @param list: list of integers
   * @return variance of integers in list
   * @throws ModelException: when the list has fewer than two
   * elements
   */
  double get_int_variance(list<int> &list) throw (ModelException);

  /** Internal function used in the process to determine if items are unique
   * or members of an enumeration.
   * This function calculates the covariance between the integers in
   * the argument lists. Both lists must contain at least two elements
   * and must have the same length.
   * @param list1: first list of integers
   * @param list2: second list of integers
   * @return covariance of integers in first and second list
   * @throws ModelException: when the lists have different lengths
   * (i.e., number of elements) or fewer than two elements
   */
  double get_int_covariance(list<int> &list1, list<int> &list2) throw (ModelException);

  /** Internal function used in the process to determine if items are unique
   * or members of an enumeration.
   * This function determines if the integers in two lists are
   * correlated, and if so, whether they are positively or negatively correlated.
   * @param list1: first list of integers
   * @param list2: second list of integers
   * @param positive: return argument. When the lists are positively
   * correlated, the value is set to true. When the lists are negatively
   * correlated, the value is set to false. When the lists are not
   * correlated at all, this value is undefined.
   * @return true, if integers in first and second list are
   * correlated, false otherwise (see third argument).
   */
  bool is_correlated(list<int> &list1, list<int> &list2, bool &positive);

  /** Return the confidence that the TokenFinder has in its anomaly
   * scores (as returned by check_item).
   * NOTE(review): this override is declared private although
   * Model::get_confidence() is public. It is still reachable through a
   * Model pointer or reference, but not through a TokenFinder one --
   * confirm that this is intended.
   * @return 0, when the TokenFinder has determined that all items
   * are unique, 1 when they are elements of an enumeration
   */
  virtual double get_confidence(void);

private:
  /** Map to store tokens together with their number of occurrences.
   * The key type is Item *; how keys are hashed and compared depends on
   * hash_map specializations that are not declared in this header.
   */
  __gnu_cxx::hash_map<Item *, unsigned long> _elements;

  /** Store total number of elements.
   */
  unsigned long _element_count;
};




/** \brief Factory that produces TokenFinder models.
 */
class TokenFinderFactory : public ModelFactory {
 public:
  /** Create the model wrapped by this factory. Implements the pure
   * virtual ModelFactory::instance().
   * @return a pointer to the instantiated TokenFinder
   */
  virtual Model *instance();
};




/* Forward Declarations.
 * Histogram is referenced by pointer in PdfFunction::validate_data() before
 * the Histogram class itself is defined further down in this header.
 * BayesianNetwork is only declared here; it is presumably defined and used
 * elsewhere.
 */
class Histogram;
class BayesianNetwork;

/** \brief A model that represents the profile as a probability
 * density function.
 *
 * The probability density function (pdf) model requires that the
 * input items can be mapped to numerical values (have the type
 * NumericalItem or be derived from it). This is necessary to assign
 * probabilities to each distinct numerical value. The sum of the
 * probabilities for all numerical values (for the complete domain)
 * has to be 1 (i.e., a well-defined pdf).
 *
 * The class is abstract: f(), F(), get_mean() and get_variance() are pure
 * virtual, and insert_item() and switch_mode() inherited from Model are
 * not implemented here.
 */
class PdfFunction : public Model {
 public:
  /** Default constructor.
   */
  PdfFunction(void);

  /** Virtual destructor.
   */
  virtual ~PdfFunction(void);

  /** Get pdf function value (i.e., probability) at position x. 
   * @param x: point where the pdf should be evaluated (note that a
   * NumericalItem can be mapped to a size_t which can be converted
   * into a double)
   * @return the pdf value at point x
   */
  virtual double f(double x) = 0;

  /** Get probability that a value is smaller than x - this is based
   * on the cdf (which can be derived from the pdf). This is identical
   * to the sum of the pdf values in the interval [-inf, x]. 
   * @param x: specifies the right border of the interval [-inf, x] 
   * @return the sum of the pdf values in the interval [-inf, x] 
   */ 
  virtual double F(double x) = 0;

  /** Get the mean (expected value) E[x] of the pdf. 
   * @return the mean (expected value) E[x] of the pdf 
   */
  virtual double get_mean(void) = 0;

  /** Get the variance E[(x-E(x))^2] of the pdf. 
   * @return the variance of the pdf
   */
  virtual double get_variance(void) = 0;

  /** Check a new item for accordance with the model profile and return
   * its anomaly score. The item must be of type NumericalItem (or
   * derived) or a ListCollection of NumericalItem.
   * @param item: a pointer to the item that should be checked. The
   * item must be a NumericalItem (or derived) or a ListCollection of
   * NumericalItem.
   * @return a double value in the interval [0,1]. When item is a
   * single NumericalItem (with a value of x), this return
   * value is calculated as the probability that a value drawn from the
   * model distribution pdf deviates at least as much as x (defined as the
   * numerical value of the item) from the mean of the pdf. This reflects
   * the intuition that items with values that are close to the mean of the pdf are
   * regular (i.e., receive a high probability) while others that are far
   * out are anomalous (i.e., have a low probability to occur).
   * The exact value is calculated as twice the area below the
   * distribution's pdf in the interval restricted by x and
   * either negative or positive infinity (when x is less than or greater
   * than the mean, respectively). The result is 1 when x is identical to
   * the mean and gradually decreases to 0 with x getting smaller or
   * bigger. When a list of NumericalItem is passed, they are
   * transformed into a histogram (by frequency counting) and validate_data()
   * is called.
   * @throws ModelInputException: when item is not of type
   * NumericalItem or ListCollection of NumericalItem
   * @throws ModelException: when the PdfFunction is not in state
   * Detection (Model::ModelMode)
   */
  virtual double check_item(Item *item) throw (ModelInputException, ModelException);

  /** Get the probability that the data in the histogram is derived
   * from the underlying probability density function. This probability
   * is calculated by a statistical test (a variant of the Pearson
   * chi^2-test as a `goodness-of-fit' test). The probability is close
   * to 1 if the pdf and the histogram correspond, close to 0 if this
   * is not the case.
   * @param data: numerical values with their corresponding
   * frequencies (i.e., a histogram)
   * @return the probability that this histogram corresponds to the pdf
   */
  virtual double validate_data(Histogram *data);

  /** Get the probability that the data in the histogram is derived
   * from the underlying probability density function as explained for
   * validate_data(Histogram*). This function takes a list of
   * integers that can influence the buckets of the statistical test.
   * @param data: numerical values with their corresponding
   * frequencies (i.e., a histogram)
   * @param intervals: a list of integers that divide the domain into
   * intervals (buckets). The list must be sorted in ascending
   * order. E.g. the list (1,11,42) would divide the domain into the
   * intervals [1,11[ and [11,42[ for the statistical test. The list is
   * passed by value, i.e. it is copied on every call.
   * @return the probability that this histogram corresponds to the
   * pdf
   * @throws ModelException: when the list of intervals does not specify
   * at least a single interval (i.e., does not have at least two elements
   * (bounds))
   */
  virtual double validate_data(Histogram *data, list<int> intervals) throw (ModelException);

  /** Function to add interval bounds to the private intervals
   *   list. The list is always sorted in ascending order and
   *   duplicates are suppressed.
   * @param bound: the new interval bound to add 
   */
  void add_interval_bound(int bound);

 protected:
  /** A variable that determines whether recalculations are necessary
   * to satisfy mean, variance and F() calls. Often, costly
   * calculations are necessary to return results for these functions.
   * Much computation can be done once and reused for later calls. _valid
   * is true, if no recalculations are necessary, false otherwise.
   */
  bool _valid;

 private:

  /** Actual check_item() function that is called when the argument 
   *  to check_item is a single NumericalItem.
   * @param item: the NumericalItem to check
   * @return the anomaly score as described for check_item()
   */ 
  virtual double _check_item(NumericalItem *item); 

  /** Actual check_item() function that is called when the argument 
   *  to check_item is a list of NumericalItem.
   * @param list: list of NumericalItem to check via statistical test
   * @return the anomaly score as described for check_item()
   */ 
  double _check_item_list(ListCollection *list); 

  /** Default intervals that are used when calling
   * check_item with a list. When no intervals are given, the
   * statistical test divides the domain into several equal sized
   * intervals. Bounds are added through add_interval_bound().
   * Note that, unlike the other data members in this header, this one
   * carries no leading underscore.
   */  
  list<int> intervals;
};




/* Iterator shorthands for the containers used by Histogram. */

/** Forward iterator over a map<int, unsigned long>, the type of
 * Histogram::_elements (value -> count). */
typedef  map<int, unsigned long>::iterator HistogramIter;
/** Reverse iterator over the same map<int, unsigned long> type. */
typedef  map<int, unsigned long>::reverse_iterator HistogramRIter;
/** Forward iterator over a map<int, double>, the type of
 * Histogram::_element_cdf. */
typedef map<int, double>::iterator CdfIter;

/** \brief The Histogram model is a discrete probability function
 * (pdf) that assigns a probability to each numerical value (of
 * NumericalItem).
 *
 * It implements all functions that are pure virtual in PdfFunction and
 * Model, and is the base class of the curve-fitting models declared
 * further down in this header.
 */
class Histogram : public PdfFunction  {
 public:
  /** Default constructor. 
   */
  Histogram(void);

  /** Virtual destructor.
   */
  virtual ~Histogram(void);

  /** Insert an item into the model during the training phase.
   * @param item: a pointer to the object that is added to the
   * model. The item pointer must point to a NumericalItem or a list
   * of NumericalItem or a list of ItemCount. 
   * @throws ModelInputException: item is not a NumericalItem or a
   * ListCollection of NumericalItem or a ListCollection of ItemCount.
   */
  virtual void insert_item(Item *item) throw (ModelInputException);
  
  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   * @throws ModelException: an illegal state transition has been
   * requested. Currently, Histogram can only be switched to Detection (Model::ModelMode)
   */
  virtual void switch_mode(ModelMode mode) throw (ModelException);

  /** Get pdf function value (i.e., probability) at position x. 
   * @param x: point where the pdf should be evaluated (note that a
   * NumericalItem can be mapped to a size_t which can be converted
   * into a double)
   * @return the pdf value at point x
   */
  virtual double f(double x);

  /** Get probability that a value is smaller than x - this is based
   * on the cdf (which can be derived from the pdf). This is identical
   * to the sum of the pdf values in the interval [-inf, x]. 
   * @param x: specifies the right border of the interval [-inf, x] 
   * @return the sum of the pdf values in the interval [-inf, x] 
   */ 
  virtual double F(double x);

  /** Get the mean (expected value) E[x] of the pdf. 
   * @return the mean (expected value) E[x] of the pdf 
   * @throws ModelException: mean requested when model is not in state
   * Detection (Model::ModelMode) 
   */
  virtual double get_mean(void) throw (ModelException);

  /** Get the variance E[(x-E(x))^2] of the pdf. 
   * @return the variance of the pdf
   * @throws ModelException: variance requested when model is not in state
   * Detection (Model::ModelMode) 
   */
  virtual double get_variance(void) throw (ModelException);

  /** Retrieve the total number of inserted (stored) elements in this
   *  Histogram.
   * @return the number of stored elements
   */ 
  virtual unsigned long get_element_count(void);

  /** Return the confidence that a histogram model has in its anomaly scores
   * (as returned by check_item).
   * @return a double value in the interval [0,1]. 0 expresses that the
   * value returned by check_item is useless and 1 expresses that the model is
   * absolutely sure about the results of check_item. The confidence
   * is calculated by a statistical test (chi-2 test) that determines
   * how well the derived curve fits the underlying data. E.g. when 10 data
   * points (with 5 having a value of 0 and 5 having a value of 2) are
   * approximated by a Gaussian, the Gaussian will have a mean of 1
   * and a low variance. However, the data is not approximated well. This
   * should be reflected in the confidence score. 
   */
  virtual double get_confidence(void);

  /** Module test function. 
   * @param verbose: when set to true, output for each test is written to stderr, 
   * otherwise no output is generated
   * @return true, if all tests succeeded, false otherwise
   */
  static bool test(bool verbose);

  // FIX remove this -- dhm
  // (marked for removal by the original author; presumably a debugging
  // aid that prints the stored elements -- confirm before relying on it)
  void display_elements(void);
 
 protected:
  /* Pdf function needs direct access to the elements for the
   * statistical test. The friend declaration gives PdfFunction access
   * to the protected and private members of Histogram.
   */
  friend class PdfFunction;

  /** Stores elements together with their count. Items must be of type
   * NumericalItem and their value is used as index in the map.
   */
  map<int, unsigned long> _elements;

  /** Stores the total number of inserted items (i.e. total of all
   * counts). 
   */
  unsigned long _element_count;

  /** Stores the cdf of the histogram for fast retrieval. The cdf of
   * the histogram allows F() calls to be answered efficiently.
   */
  map<int, double> _element_cdf;

  /** The mean of the model when PdfFunction::_valid is true.
   */
  double _mu; 

  /** The variance of the model when PdfFunction::_valid is true.
   * NOTE(review): the name suggests a standard deviation; confirm
   * against the implementation which of the two is actually stored.
   */
  double _sigma;

  /** The confidence value as returned by get_confidence. It is
   *  calculated in switch_mode(). 
   */
  double _confidence;

  /** The number of times that insert is being called. This is
   * important for some models to estimate their confidence. It has
   * to be increased by each call to insert_item() of derived classes
   * that override this function.
   */
  unsigned long _inserts;

  // FIX remove these -- dhm
  // (marked for removal by the original author; judging by the names, a
  // class-wide instance counter and a per-object instance number)
  static unsigned int _instanceCounter;
  unsigned int _instanceNumber;

 private:
  /** Actual insert_item() function that is called when the argument 
   *  to insert_item is a single NumericalItem.
   * @param item: NumericalItem that is inserted
   */ 
  void _insert_item(NumericalItem *item); 

  /** Actual insert_item() function that is called when the argument 
   *  to insert_item is a list of NumericalItem.
   * @param list: list of NumericalItem to insert
   * @throws ModelInputException: item is not a ListCollection of NumericalItem
   * or a ListCollection of ItemCount. 
   */ 
  void _insert_item_list(ListCollection *list) throw (ModelInputException); 
};




/** \brief Factory that produces Histogram models set up as character
 * distributions.
 *
 * A character distribution describes how the 256 possible character
 * (byte) values are distributed, which makes it suitable for analyzing
 * strings.
 */
class CharacterDistributionFactory : public ModelFactory {
 public:
  /** Create the wrapped Histogram and populate its intervals with the
   *  values required for the character distribution. Implements the
   *  pure virtual ModelFactory::instance().
   * @return a pointer to the instantiated Histogram
   */
  virtual Model *instance();
};




/** \brief A probability density function that is Gaussian (normal)
 * distributed. 
 *
 * The class derives from Histogram and overrides switch_mode(),
 * get_mean(), get_variance(), f() and F(); insert_item() is inherited
 * unchanged.
 */
class NormalPdfFunction : public Histogram {
 public:
  /** Default constructor.
   */
  NormalPdfFunction(void);

  /** Virtual destructor.
   */
  virtual ~NormalPdfFunction(void);

  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   * @throws ModelException: an illegal state transition has been
   * requested. Currently, NormalPdfFunction can only be switched
   * to Detection (Model::ModelMode)
   */
  virtual void switch_mode(ModelMode mode) throw (ModelException);

  /** Get the mean (expected value) E[x] of the normal pdf. 
   * @return the mean (expected value) E[x] of the pdf 
   * @throws ModelException: mean requested when model is not in state
   * Detection (Model::ModelMode) 
   */
  virtual double get_mean(void) throw (ModelException);

  /** Get the variance E[(x-E(x))^2] of the pdf. 
   * @return the variance of the pdf
   * @throws ModelException: variance requested when model is not in state
   * Detection (Model::ModelMode) 
   */
  virtual double get_variance(void) throw (ModelException);

  /** Get pdf function value (i.e., probability) at position x. This
   * is calculated using the pdf formula for the Gaussian distribution.
   * @param x: point where the pdf should be evaluated (note that a
   * NumericalItem can be mapped to a size_t which can be converted
   * into a double)
   * @return the pdf value at point x
   */
  virtual double f(double x);

  /** Get probability that a value is smaller than x - this is based
   * on the cdf table for the Gaussian distribution.
   * @param x: specifies the right border of the interval [-inf, x] 
   * @return the sum of the pdf values in the interval [-inf, x] 
   */ 
  virtual double F(double x);

  /** Module test function. 
   * @param verbose: when set to true, output for each test is written to stderr, 
   * otherwise no output is generated
   * @return true, if all tests succeeded, false otherwise
   */
   static bool test(bool verbose);

 private:
   /** Double variable that holds 1/(sigma * sqrt(2*PI)) and that is used for
    * speeding up the computation of f() when PdfFunction::_valid is true. 
    * Despite its name, the stored value is the complete reciprocal factor,
    * not only the denominator.
    */
  double _denominator;
};




/** \brief A probability density function that is logarithmic-normal
 * distributed. 
 *
 * A variable x is logarithmic-normal (lognormal) distributed if y =
 * ln(x) is normal distributed, where ln denotes the natural logarithm.
 *
 * The class derives from Histogram and overrides switch_mode(),
 * get_mean(), get_variance(), f() and F(); insert_item() is inherited
 * unchanged.
 */
class LogNormalPdfFunction : public Histogram {
 public:
  /** Default constructor.
   */
  LogNormalPdfFunction(void);

  /** Virtual destructor.
   */
  virtual ~LogNormalPdfFunction(void);

  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   * @throws ModelException: an illegal state transition has been
   * requested. Currently, LogNormalPdfFunction can only be switched
   * to Detection (Model::ModelMode) 
   */
  virtual void switch_mode(ModelMode mode) throw (ModelException);

  /** Get the mean (expected value) E[x] of the lognormal pdf. 
   * @return the mean (expected value) E[x] of the pdf 
   * @throws ModelException: mean requested when model is not in state
   * Detection (Model::ModelMode)
   */
  virtual double get_mean(void) throw (ModelException);


  /** Get the variance E[(x-E(x))^2] of the pdf. 
   * @return the variance of the pdf
   * @throws ModelException: variance requested when model is not in state
   * Detection (Model::ModelMode)
   */
  virtual double get_variance(void) throw (ModelException);

  /** Get pdf function value (i.e., probability) at position x.
   * NOTE(review): this comment used to say that the value is calculated
   * with the pdf formula for the Gaussian distribution, which looks like a
   * copy from NormalPdfFunction; presumably the lognormal pdf formula is
   * used -- confirm against the implementation.
   * @param x: point where the pdf should be evaluated (note that a
   * NumericalItem can be mapped to a size_t which can be converted
   * into a double)
   * @return the pdf value at point x
   */
  virtual double f(double x);

  /** Get probability that a value is smaller than x - this is based
   * on the cdf table for the lognormal distribution.
   * @param x: specifies the right border of the interval [-inf, x] 
   * @return the sum of the pdf values in the interval [-inf, x] 
   */ 
  virtual double F(double x);

  /** Module test function. 
   * @param verbose: when set to true, output for each test is written to stderr, 
   * otherwise no output is generated
   * @return true, if all tests succeeded, false otherwise
   */
  static bool test(bool verbose);

 private:

  /** The location parameter of the lognormal distribution when
   * PdfFunction::_valid is true. Shows the location of the lognormal
   * distribution relative to the y-axis.
   */
  double _loc;

  /** The scale parameter of the lognormal distribution when
   * PdfFunction::_valid is true (similar to mean).
   */
  double _scale;

  /** The shape parameter of the lognormal distribution when
   * PdfFunction::_valid is true (similar to variance).
   */
  double _shape;
};




/** \brief Factory that produces LogNormalPdfFunction models.
 *
 * Intended for the analysis of features that are lognormal distributed
 * (the length of strings, for example).
 */
class LogNormalPdfFunctionFactory : public ModelFactory {
 public:
  /** Create the model wrapped by this factory. Implements the pure
   * virtual ModelFactory::instance().
   * @return a pointer to the instantiated LogNormalPdfFunction
   */
  virtual Model *instance();
};



/** \brief A probability density function that is a straight line derived
 * from the underlying histogram via linear regression.
 *
 * The class derives from Histogram and overrides switch_mode(),
 * get_mean(), get_variance(), f() and F(); insert_item() is inherited
 * unchanged.
 */
class LinearRegression : public Histogram {
 public:
  /** Default constructor.
   */
  LinearRegression(void);

  /** Virtual destructor.
   */
  virtual ~LinearRegression(void);

  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   * @throws ModelException: an illegal state transition has been
   * requested. Currently, LinearRegression can only be switched
   * to Detection (Model::ModelMode) 
   */
  virtual void switch_mode(ModelMode mode) throw (ModelException);

  /** Get the mean (expected value) E[x] of the straight line which is derived
   * by linear regression. This is calculated as ((_right_bound -
   * _left_bound) / 2). 
   * NOTE(review): for a constant pdf on [_left_bound, _right_bound] the
   * expected value would be ((_left_bound + _right_bound) / 2); the
   * documented formula matches it only when _left_bound is 0 -- confirm
   * against the implementation.
   * @return the mean (expected value) E[x] of the pdf 
   * @throws ModelException: mean requested when model is not in state
   * Detection (Model::ModelMode)
   */
  virtual double get_mean(void) throw (ModelException);


  /** Get the variance E[(x-E(x))^2] of the pdf. This is calculated as
   * ((_right_bound - _left_bound)^2 / 12).
   * @return the variance of the pdf
   * @throws ModelException: variance requested when model is not in state
   * Detection (Model::ModelMode)
   */
  virtual double get_variance(void) throw (ModelException);

  /** Get pdf function value (i.e., probability) at position x. In the
   * case of the linear regression model, f(x) is constant and
   * calculated as 1/(_right_bound - _left_bound) 
   * @param x: point where the pdf should be evaluated (note that a
   * NumericalItem can be mapped to a size_t which can be converted
   * into a double)
   * @return the pdf value at point x
   */
  virtual double f(double x);

  /** Get probability that a value is smaller than x - this is based
   * on the ratio (x - _left_bound) / (_right_bound - _left_bound)
   * @param x: specifies the right border of the interval [-inf, x] 
   * @return the sum of the pdf values in the interval [-inf, x] 
   */ 
  virtual double F(double x);

  /** Module test function. 
   * @param verbose: when set to true, output for each test is written to stderr, 
   * otherwise no output is generated
   * @return true, if all tests succeeded, false otherwise
   */
  static bool test(bool verbose);

 private:

  /** The left bound of the interval where the linear regression line is
   * defined. For values smaller than this bound, the pdf is zero.
   */
  double _left_bound;

  /** The right bound of the interval where the linear regression line is
   * defined. For values greater than this bound, the pdf is zero.
   */
  double _right_bound;

  /** The slope indicates whether the straight line is increasing or
   *  decreasing. Only the direction is kept (as a bool), not the
   *  numerical slope.
   */
  bool _increasing;
};






/** \brief A hidden Markov model used to represent the grammar of
 * sequences of input elements (of type Item). 
 *
 * This model can be used to infer the structure (grammar) of
 * sequences of input elements. It uses Bayesian techniques to provide
 * the optimal level of abstraction. That means that some amount of
 * abstraction takes place (i.e., the grammar often produces a
 * reasonable superset of the input sequences) without losing too
 * many details. A Markov model is usually represented by a
 * non-deterministic automaton.
 */
class HiddenMarkovModel : public Model {
 public:
  /** Virtual Destructor.
   */
  virtual ~HiddenMarkovModel(void);

  /** Return an unused identifier for a model node (state)
   * @return an unused id
   */
  virtual int get_next_state_id(void) = 0;

  /** The grammar inference process is costly and not invoked after
   *  every insert. It is only invoked when the model is switched to
   *  detection mode. Alternatively, this function can be called to
   *  start the grammar inference (probability optimization) process
   *  immediately.
   */
  virtual void optimize(void) = 0;

  /** Insert an item into the model. The item must be a sequence
   * (ListCollection) of items. 
   * @param item: a sequence (list) of items whose structure should be
   * inferred
   * @throws ModelInputException: when item is not of type ListCollection
   */
  virtual void insert_item(Item *item) throw (ModelInputException) = 0;
  
  /** Switch between different modes. Used to make the transition between
   * learning and detection phase and to invoke the optimization
   * process (refer to optimize()).
   * @param  mode: a value from the ModelMode enumeration that specifies the
   * new mode
   */
  virtual void switch_mode(ModelMode mode) = 0;

  /** Check if an input sequence is in accordance with the model. That
   * is, check if the input sequence can be produced (or derived from)
   * the grammar stored by this Markov model
   * @param item: a ListCollection of Item that represents the input
   * sequence to check
   * @return 1 if the grammar produces the input sequence, 0 otherwise
   * @throws ModelInputException: when item is not of type ListCollection
   */
  virtual double check_item(Item *item) throw (ModelInputException) = 0;

  /** Get the number of states of the Markov model.
   * @result the number of nodes (states) in the Markov model (automaton) */
  virtual unsigned int get_model_size(void) = 0;

  /** Get the number of letters in input alphabet.
   * @return the number of different input items that have been added
   * to the model. Note that this does not refer to the number of
   * sequences added by insert_item() but to the number of distinct items
   * that all these sequences are composed of.
   */
  virtual unsigned int get_alphabet_size(void) = 0;

  /** Print a string representation of this Markov model to an output stream.
   * Use the standard output stream as default.
   * @param o: output stream where the Markov model should write itself to
   */
  virtual void to_string(ostream &o = cout) = 0;

  /** Module test function. 
   * @param verbose: when set to true, output for each test is written to stderr, 
   * otherwise no output is generated
   * @return true, if all tests succeeded, false otherwise
   */
  static bool test(bool verbose);

  /** Return the confidence that a markov model has in its anomaly scores
   * (as returned by check_item).
   * @return a double value in the interval [0,1]. 0 expresses that the
   * value returned by check_item is useless and 1 expresses that the model is
   * absolutely sure about the results of check_item. The confidence
   * is calculated from the number of inserted elements and the degree
   * of generalization. The confidence is higher when more elements have
   * been inserted and/or when the generalization has been more aggressive.
   */
  virtual double get_confidence(void) = 0;

  static Model *instance(void);
};




/** \brief Factory class that encapsulates the HiddenMarkovModel model.
 *
 * It uses a derivation of the Stolcke-Omohundro approach to grammar
 * inference to deduce the structure of input sequences.
 */
class HiddenMarkovModelFactory : public ModelFactory {
 public:
  /** Instantiate the encapsulated model.
   * @return a pointer to the instantiated Markov model implementation
   * (ownership of the returned object is not visible in this header --
   * confirm against the implementation)
   */
  Model *instance(void);
};



/** \brief A model for the length of strings.
 *
 * NOTE(review): judging by its name and its members (_mean, _sigma),
 * this model presumably learns the distribution of string lengths
 * during training and scores new lengths against it -- confirm against
 * the implementation.
 */
class StringLengthModel : public Model {

 public:

  /** Default constructor.
   */
  StringLengthModel(void);

  /** Virtual destructor.
   */
  virtual ~StringLengthModel(void);

  /** Insert an item into the model during the training phase.
   * @param item: a pointer to the object that is added to the model
   * @throws ModelInputException: listed in the exception specification;
   * the exact condition is defined by the implementation
   */
  virtual void insert_item(Item *item) throw (ModelInputException);
  
  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param mode: a value from the ModelMode enumeration that specifies the
   * new mode
   * @throws ModelException: listed in the exception specification;
   * the exact condition is defined by the implementation
   */
  virtual void switch_mode(ModelMode mode) throw (ModelException);

  /** Check a new item for accordance with the model profile and return
   * its anomaly score.
   * @param item: a pointer to the item that should be checked
   * @return a double value in the interval [0,1] where 0 is totally abnormal
   * and 1 is perfectly normal
   * @throws ModelInputException: listed in the exception specification;
   * the exact condition is defined by the implementation
   */
  virtual double check_item(Item *item) throw (ModelInputException);

 private:
  
  /** Stores elements together with their count. Items must be of type
   * NumericalItem and their value is used as index in the map.
   */
  map<int, unsigned long> _elements;

  /** Element counter (presumably the total number of inserted
   * elements -- confirm against the implementation).
   */
  unsigned long _element_count;

  /** Presumably the mean of the observed values -- confirm against the
   * implementation.
   */
  double _mean;

  /** Presumably the standard deviation of the observed values --
   * confirm against the implementation.
   */
  double _sigma;
};


/** \brief Factory class for a string length model (presumably
 * StringLengthModel -- confirm against the implementation).
 */
class StringLengthModelFactory : public ModelFactory {
 public:
  /** Instantiate the encapsulated model.
   * @return a pointer to the instantiated model (presumably a
   * StringLengthModel -- confirm against the implementation)
   */
  Model *instance(void);
};


// Forward declaration: BayesianNetwork is used in the signature of
// BayesianNetworkModel::instance() but is only defined further down in
// this header. Declaring it here keeps the header independent of
// declaration order; a repeated declaration is harmless.
class BayesianNetwork;

/** \brief An abstract libAnomaly Model for Bayesian Networks.
 *
 * The concrete implementation of this class (BayesianNetworkModelImpl)
 * is a 'view' of an instance of BayesianNetwork, corresponding to a
 * single state of a single query variable in the instance of
 * BayesianNetwork.
 */
class BayesianNetworkModel : public Model {
 public:
  /** Insert an item into the model.
   * NOTE(review): the contract below mirrors
   * HiddenMarkovModel::insert_item. Whether a Bayesian network model
   * really expects a ListCollection (rather than, e.g., BayesianEvidence
   * items) should be confirmed against the implementation.
   * @param item: the item to insert (documented as a sequence
   * (ListCollection) of items)
   * @throws ModelInputException: when item is not of the expected type
   * (documented as ListCollection)
   */
  virtual void insert_item(Item *item) throw (ModelInputException) = 0;

  /** Switch between different modes. Used to make the transition between
   * the learning and the detection phase.
   * @param mode: a value from the ModelMode enumeration that specifies the
   * new mode
   */
  virtual void switch_mode(ModelMode mode) = 0;

  /** Check if an input item is in accordance with the model.
   * NOTE(review): the contract below mirrors
   * HiddenMarkovModel::check_item; the expected item type and the exact
   * meaning of the score for a Bayesian network should be confirmed
   * against the implementation.
   * @param item: the input to check (documented as a ListCollection of
   * Item)
   * @return documented as 1 if the input is accepted by the model and
   * 0 otherwise
   * @throws ModelInputException: when item is not of the expected type
   * (documented as ListCollection)
   */
  virtual double check_item(Item *item) throw (ModelInputException) = 0;

  /** Return the confidence that the model has in its anomaly scores
   * (as returned by check_item).
   * @return a double value in the interval [0,1]. 0 expresses that the
   * value returned by check_item is useless and 1 expresses that the model is
   * absolutely sure about the results of check_item.
   */
  virtual double get_confidence(void) = 0;

  /** Module test function.
   * @param verbose: when set to true, output for each test is written to
   * stderr, otherwise no output is generated
   * @return true if all tests succeeded, false otherwise
   */
  static bool test(bool verbose);

  /** Obtain a model for one state of one query variable of a Bayesian
   * network.
   * @param bn: the Bayesian network the model refers to
   * @param queryVariable: name of the query variable
   * @param stateName: name of the state of the query variable
   * @return a pointer to a BayesianNetworkModel (presumably a
   * BayesianNetworkModelImpl; ownership of the returned object and of bn
   * is not visible in this header -- confirm against the implementation)
   */
  static BayesianNetworkModel *instance(BayesianNetwork *bn,
                                        string queryVariable,
                                        string stateName);
};

/** \brief Handle type for a Bayesian network. Only a static creation
 * function is declared here.
 */
class BayesianNetwork {
 public:
  /** Obtain a Bayesian network for the given file name.
   * NOTE(review): presumably the network is loaded from the named file;
   * the file format, error behaviour and ownership of the returned
   * pointer are not visible in this header -- confirm against the
   * implementation.
   * @param filename: name of the file describing the network
   * @return a pointer to a BayesianNetwork
   */
  static BayesianNetwork *instance(string filename);
};

/** \brief An Item carrying one piece of evidence: the statement that a
 * variable is in a given state.
 */
class BayesianEvidence : public Item {
 private:
  /** The variable of this evidence (presumably set from the constructor
   * argument 'variable' -- the constructor is defined elsewhere).
   */
  string _variable;

  /** The state of this evidence (presumably set from the constructor
   * argument 'state' -- the constructor is defined elsewhere).
   */
  unsigned int _state;

 public:
  /** Create evidence that says variable 'variable' is in state 'state'.
   * @param variable: the variable the evidence refers to
   * @param state: the state the variable is in
   */
  BayesianEvidence(string variable, int state);

  /** @return the stored variable as a C string; the pointer refers to
   * the internal buffer of the string member of this object
   */
  const char *getVariable(void) {
    return _variable.c_str();
  }

  /** @return the stored state */
  unsigned int getState(void) {
    return _state;
  }

  /** @return a hash value for this item (the computation is defined in
   * the implementation)
   */
  virtual size_t hash_value(void) const;
};

#endif


