// Source Code of weka.filters.unsupervised.attribute.NumericCleaner

/*
 *    This program is free software; you can redistribute it and/or modify
 *    it under the terms of the GNU General Public License as published by
 *    the Free Software Foundation; either version 2 of the License, or
 *    (at your option) any later version.
 *
 *    This program is distributed in the hope that it will be useful,
 *    but WITHOUT ANY WARRANTY; without even the implied warranty of
 *    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *    GNU General Public License for more details.
 *
 *    You should have received a copy of the GNU General Public License
 *    along with this program; if not, write to the Free Software
 *    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 */


/*
 * NumericCleaner.java
 * Copyright (C) 2006 University of Waikato, Hamilton, New Zealand
 */


package weka.filters.unsupervised.attribute;


import weka.core.Capabilities;
import weka.core.Instance;
import weka.core.Instances;
import weka.core.Option;
import weka.core.Range;
import weka.core.RevisionUtils;
import weka.core.Utils;
import weka.core.Capabilities.Capability;
import weka.filters.SimpleStreamFilter;


import java.util.Enumeration;
import java.util.Vector;




/**
 <!-- globalinfo-start -->
 * A filter that 'cleanses' the numeric data from values that are too small, too big or very close to a certain value (e.g., 0) and sets these values to a pre-defined default.
 * <p/>
 <!-- globalinfo-end -->
 * 
 <!-- options-start -->
 * Valid options are: <p/>
 * 
 * <pre> -D
 *  Turns on output of debugging information.</pre>
 * 
 * <pre> -min &lt;double&gt;
 *  The minimum threshold. (default -Double.MAX_VALUE)</pre>
 * 
 * <pre> -min-default &lt;double&gt;
 *  The replacement for values smaller than the minimum threshold.
 *  (default -Double.MAX_VALUE)</pre>
 * 
 * <pre> -max &lt;double&gt;
 *  The maximum threshold. (default Double.MAX_VALUE)</pre>
 * 
 * <pre> -max-default &lt;double&gt;
 *  The replacement for values larger than the maximum threshold.
 *  (default Double.MAX_VALUE)</pre>
 * 
 * <pre> -closeto &lt;double&gt;
 *  The number values are checked for closeness. (default 0)</pre>
 * 
 * <pre> -closeto-default &lt;double&gt;
 *  The replacement for values that are close to '-closeto'.
 *  (default 0)</pre>
 * 
 * <pre> -closeto-tolerance &lt;double&gt;
 *  The tolerance below which numbers are considered being close to 
 *  to each other. (default 1E-6)</pre>
 * 
 * <pre> -decimals &lt;int&gt;
 *  The number of decimals to round to, -1 means no rounding at all.
 *  (default -1)</pre>
 * 
 * <pre> -R &lt;col1,col2,...&gt;
 *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
 *  (default first-last)</pre>
 * 
 * <pre> -V
 *  Inverts the matching sense.</pre>
 * 
 * <pre> -include-class
 *  Whether to include the class in the cleansing.
 *  The class column will always be skipped, if this flag is not
 *  present. (default no)</pre>
 * 
 <!-- options-end -->
 *
 * @author  fracpete (fracpete at waikato dot ac dot nz)
 * @version $Revision: 1.2 $
 */
public class NumericCleaner
  extends SimpleStreamFilter {


  /** version identifier used for serialization */
  private static final long serialVersionUID = -352890679895066592L;


  /**
   * the minimum threshold; in process(), values strictly smaller than this
   * are replaced by m_MinDefault (default: -Double.MAX_VALUE)
   */
  protected double m_MinThreshold = -Double.MAX_VALUE;


  /** the replacement for values below the minimum threshold (default: -Double.MAX_VALUE) */
  protected double m_MinDefault = -Double.MAX_VALUE;


  /**
   * the maximum threshold; in process(), values strictly larger than this
   * are replaced by m_MaxDefault (default: Double.MAX_VALUE)
   */
  protected double m_MaxThreshold = Double.MAX_VALUE;


  /** the replacement for values above the maximum threshold (default: Double.MAX_VALUE) */
  protected double m_MaxDefault = Double.MAX_VALUE;


  /** the number the values are checked for closeness to (default: 0) */
  protected double m_CloseTo = 0;


  /** the replacement for values that are "close-to" m_CloseTo without being equal to it (default: 0) */
  protected double m_CloseToDefault = 0;


  /** the tolerance distance, below which numbers are considered being "close-to" (default: 1E-6) */
  protected double m_CloseToTolerance = 1E-6;


  /** stores which columns to cleanse (default: "first-last") */
  protected Range m_Cols = new Range("first-last");


  /** whether to include the class attribute in the cleansing (default: false) */
  protected boolean m_IncludeClass = false;

  /** the number of decimals to round to; -1 means no rounding (default: -1) */
  protected int m_Decimals = -1;
  
  /**
   * Returns a string describing this filter.
   *
   * @return      a description of the filter suitable for
   *              displaying in the explorer/experimenter gui
   */
  public String globalInfo() {
    return 
        "A filter that 'cleanses' the numeric data from values that are too "
      + "small, too big or very close to a certain value (e.g., 0) and sets "
      + "these values to a pre-defined default.";
  }


  /**
   * Returns an enumeration describing the available options.
   *
   * @return an enumeration of all the available options.
   */
  public Enumeration listOptions() {
    Vector        result;
    Enumeration   enm;


    result = new Vector();


    enm = super.listOptions();
    while (enm.hasMoreElements())
      result.addElement(enm.nextElement());


    result.addElement(new Option(
  "\tThe minimum threshold. (default -Double.MAX_VALUE)",
  "min", 1, "-min <double>"));
    
    result.addElement(new Option(
  "\tThe replacement for values smaller than the minimum threshold.\n"
  + "\t(default -Double.MAX_VALUE)",
  "min-default", 1, "-min-default <double>"));


    result.addElement(new Option(
  "\tThe maximum threshold. (default Double.MAX_VALUE)",
  "max", 1, "-max <double>"));
    
    result.addElement(new Option(
  "\tThe replacement for values larger than the maximum threshold.\n"
  + "\t(default Double.MAX_VALUE)",
  "max-default", 1, "-max-default <double>"));


    result.addElement(new Option(
  "\tThe number values are checked for closeness. (default 0)",
  "closeto", 1, "-closeto <double>"));
    
    result.addElement(new Option(
  "\tThe replacement for values that are close to '-closeto'.\n"
  + "\t(default 0)",
  "closeto-default", 1, "-closeto-default <double>"));
    
    result.addElement(new Option(
  "\tThe tolerance below which numbers are considered being close to \n"
  + "\tto each other. (default 1E-6)",
  "closeto-tolerance", 1, "-closeto-tolerance <double>"));


    result.addElement(new Option(
  "\tThe number of decimals to round to, -1 means no rounding at all.\n"
  + "\t(default -1)",
  "decimals", 1, "-decimals <int>"));
    
    result.addElement(new Option(
  "\tThe list of columns to cleanse, e.g., first-last or first-3,5-last.\n"
  + "\t(default first-last)",
  "R", 1, "-R <col1,col2,...>"));


    result.addElement(new Option(
  "\tInverts the matching sense.",
  "V", 0, "-V"));


    result.addElement(new Option(
  "\tWhether to include the class in the cleansing.\n"
  + "\tThe class column will always be skipped, if this flag is not\n"
  + "\tpresent. (default no)",
  "include-class", 0, "-include-class"));


    return result.elements();
  }    


  /**
   * Gets the current settings of the filter.
   *
   * @return an array of strings suitable for passing to setOptions
   */
  public String[] getOptions() {
    int       i;
    Vector    result;
    String[]  options;


    result = new Vector();
    options = super.getOptions();
    for (i = 0; i < options.length; i++)
      result.add(options[i]);


    result.add("-min"); 
    result.add("" + m_MinThreshold);


    result.add("-min-default"); 
    result.add("" + m_MinDefault);


    result.add("-max"); 
    result.add("" + m_MaxThreshold);


    result.add("-max-default"); 
    result.add("" + m_MaxDefault);


    result.add("-closeto"); 
    result.add("" + m_CloseTo);


    result.add("-closeto-default"); 
    result.add("" + m_CloseToDefault);
    
    result.add("-closeto-tolerance"); 
    result.add("" + m_CloseToTolerance);


    result.add("-R"); 
    result.add("" + m_Cols.getRanges());


    if (m_Cols.getInvert())
      result.add("-V");
    
    if (m_IncludeClass)
      result.add("-include-class"); 


    result.add("-decimals"); 
    result.add("" + getDecimals());


    return (String[]) result.toArray(new String[result.size()]);    
  }    


  /**
   * Parses a given list of options. <p/>
   *
   <!-- options-start -->
   * Valid options are: <p/>
   * 
   * <pre> -D
   *  Turns on output of debugging information.</pre>
   * 
   * <pre> -min &lt;double&gt;
   *  The minimum threshold. (default -Double.MAX_VALUE)</pre>
   * 
   * <pre> -min-default &lt;double&gt;
   *  The replacement for values smaller than the minimum threshold.
   *  (default -Double.MAX_VALUE)</pre>
   * 
   * <pre> -max &lt;double&gt;
   *  The maximum threshold. (default Double.MAX_VALUE)</pre>
   * 
   * <pre> -max-default &lt;double&gt;
   *  The replacement for values larger than the maximum threshold.
   *  (default Double.MAX_VALUE)</pre>
   * 
   * <pre> -closeto &lt;double&gt;
   *  The number values are checked for closeness. (default 0)</pre>
   * 
   * <pre> -closeto-default &lt;double&gt;
   *  The replacement for values that are close to '-closeto'.
   *  (default 0)</pre>
   * 
   * <pre> -closeto-tolerance &lt;double&gt;
   *  The tolerance below which numbers are considered being close to 
   *  to each other. (default 1E-6)</pre>
   * 
   * <pre> -decimals &lt;int&gt;
   *  The number of decimals to round to, -1 means no rounding at all.
   *  (default -1)</pre>
   * 
   * <pre> -R &lt;col1,col2,...&gt;
   *  The list of columns to cleanse, e.g., first-last or first-3,5-last.
   *  (default first-last)</pre>
   * 
   * <pre> -V
   *  Inverts the matching sense.</pre>
   * 
   * <pre> -include-class
   *  Whether to include the class in the cleansing.
   *  The class column will always be skipped, if this flag is not
   *  present. (default no)</pre>
   * 
   <!-- options-end -->
   * 
   * @param options the list of options as an array of strings
   * @throws Exception if an option is not supported 
   */
  public void setOptions(String[] options) throws Exception {
    String  tmpStr;


    tmpStr = Utils.getOption("min", options);
    if (tmpStr.length() != 0)
      setMinThreshold(Double.parseDouble(tmpStr));
    else
      setMinThreshold(-Double.MAX_VALUE);
    
    tmpStr = Utils.getOption("min-default", options);
    if (tmpStr.length() != 0)
      setMinDefault(Double.parseDouble(tmpStr));
    else
      setMinDefault(-Double.MAX_VALUE);
    
    tmpStr = Utils.getOption("max", options);
    if (tmpStr.length() != 0)
      setMaxThreshold(Double.parseDouble(tmpStr));
    else
      setMaxThreshold(Double.MAX_VALUE);
    
    tmpStr = Utils.getOption("max-default", options);
    if (tmpStr.length() != 0)
      setMaxDefault(Double.parseDouble(tmpStr));
    else
      setMaxDefault(Double.MAX_VALUE);
    
    tmpStr = Utils.getOption("closeto", options);
    if (tmpStr.length() != 0)
      setCloseTo(Double.parseDouble(tmpStr));
    else
      setCloseTo(0);
    
    tmpStr = Utils.getOption("closeto-default", options);
    if (tmpStr.length() != 0)
      setCloseToDefault(Double.parseDouble(tmpStr));
    else
      setCloseToDefault(0);
    
    tmpStr = Utils.getOption("closeto-tolerance", options);
    if (tmpStr.length() != 0)
      setCloseToTolerance(Double.parseDouble(tmpStr));
    else
      setCloseToTolerance(1E-6);
    
    tmpStr = Utils.getOption("R", options);
    if (tmpStr.length() != 0)
      setAttributeIndices(tmpStr);
    else
      setAttributeIndices("first-last");
    
    setInvertSelection(Utils.getFlag("V", options));
    
    setIncludeClass(Utils.getFlag("include-class", options));


    tmpStr = Utils.getOption("decimals", options);
    if (tmpStr.length() != 0)
      setDecimals(Integer.parseInt(tmpStr));
    else
      setDecimals(-1);
    
    super.setOptions(options);
  }    


  /** 
   * Returns the Capabilities of this filter.
   *
   * @return            the capabilities of this object
   * @see               Capabilities
   */
  public Capabilities getCapabilities() {
    Capabilities result = super.getCapabilities();


    // attributes
    result.enableAllAttributes();
    result.enable(Capability.MISSING_VALUES);
    
    // class
    result.enableAllClasses();
    result.enable(Capability.MISSING_CLASS_VALUES);
    result.enable(Capability.NO_CLASS);
    
    return result;
  }
  
  /**
   * Determines the output format based on the input format and returns 
   * this. In case the output format cannot be returned immediately, i.e.,
   * immediateOutputFormat() returns false, then this method will be called
   * from batchFinished().
   *
   * @param inputFormat     the input format to base the output format on
   * @return                the output format
   * @throws Exception      in case the determination goes wrong
   * @see   #hasImmediateOutputFormat()
   * @see   #batchFinished()
   */
  protected Instances determineOutputFormat(Instances inputFormat)
      throws Exception {


    m_Cols.setUpper(inputFormat.numAttributes() - 1);
    
    return new Instances(inputFormat);
  }


  /**
   * processes the given instance (may change the provided instance) and
   * returns the modified version.
   *
   * @param instance    the instance to process
   * @return            the modified data
   * @throws Exception  in case the processing goes wrong
   */
  protected Instance process(Instance instance) throws Exception {
    Instance    result;
    int      i;
    double    val;
    double    factor;
    
    result = (Instance) instance.copy();
    
    if (m_Decimals > -1)
      factor = StrictMath.pow(10, m_Decimals);
    else
      factor = 1;
    
    for (i = 0; i < result.numAttributes(); i++) {
      // only numeric attributes
      if (!result.attribute(i).isNumeric())
  continue;


      // out of range?
      if (!m_Cols.isInRange(i))
  continue;
      
      // skip class?
      if ( (result.classIndex() == i) && (!m_IncludeClass) )
  continue;
      
      // too small?
      if (result.value(i) < m_MinThreshold) {
  if (getDebug())
    System.out.println("Too small: " + result.value(i) + " -> " + m_MinDefault);
  result.setValue(i, m_MinDefault);
      }
      // too big?
      else if (result.value(i) > m_MaxThreshold) {
  if (getDebug())
    System.out.println("Too big: " + result.value(i) + " -> " + m_MaxDefault);
  result.setValue(i, m_MaxDefault);
      }
      // too close?
      else if (    (result.value(i) - m_CloseTo < m_CloseToTolerance) 
          && (m_CloseTo - result.value(i) < m_CloseToTolerance) 
          && (result.value(i) != m_CloseTo) ) {
  if (getDebug())
    System.out.println("Too close: " + result.value(i) + " -> " + m_CloseToDefault);
  result.setValue(i, m_CloseToDefault);
      }
      
      // decimals?
      if (m_Decimals > -1) {
  val = result.value(i);
  val = StrictMath.round(val * factor) / factor;
  result.setValue(i, val);
      }
    }


    return result;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String minThresholdTipText() {
    return "The minimum threshold below values are replaced by a default.";
  }


  /**
   * Get the minimum threshold. 
   *
   * @return     the minimum threshold.
   */
  public double getMinThreshold() {
    return m_MinThreshold;
  }


  /**
   * Set the minimum threshold. 
   *
   * @param value  the minimum threshold to use.
   */
  public void setMinThreshold(double value) {
    m_MinThreshold = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String minDefaultTipText() {
    return "The default value to replace values that are below the minimum threshold.";
  }


  /**
   * Get the minimum default. 
   *
   * @return     the minimum default.
   */
  public double getMinDefault() {
    return m_MinDefault;
  }


  /**
   * Set the minimum default. 
   *
   * @param value  the minimum default to use.
   */
  public void setMinDefault(double value) {
    m_MinDefault = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String maxThresholdTipText() {
    return "The maximum threshold above values are replaced by a default.";
  }


  /**
   * Get the maximum threshold. 
   *
   * @return     the maximum threshold.
   */
  public double getMaxThreshold() {
    return m_MaxThreshold;
  }


  /**
   * Set the maximum threshold. 
   *
   * @param value  the maximum threshold to use.
   */
  public void setMaxThreshold(double value) {
    m_MaxThreshold = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String maxDefaultTipText() {
    return "The default value to replace values that are above the maximum threshold.";
  }


  /**
   * Get the maximum default. 
   *
   * @return     the maximum default.
   */
  public double getMaxDefault() {
    return m_MaxDefault;
  }


  /**
   * Set the naximum default. 
   *
   * @param value  the maximum default to use.
   */
  public void setMaxDefault(double value) {
    m_MaxDefault = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String closeToTipText() {
    return 
        "The number values are checked for whether they are too close to "
      + "and get replaced by a default.";
  }


  /**
   * Get the "close to" number.
   *
   * @return     the "close to" number.
   */
  public double getCloseTo() {
    return m_CloseTo;
  }


  /**
   * Set the "close to" number.
   *
   * @param value  the number to use for checking closeness.
   */
  public void setCloseTo(double value) {
    m_CloseTo = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String closeToDefaultTipText() {
    return "The default value to replace values with that are too close.";
  }


  /**
   * Get the "close to" default.
   *
   * @return     the "close to" default.
   */
  public double getCloseToDefault() {
    return m_CloseToDefault;
  }


  /**
   * Set the "close to" default. 
   *
   * @param value  the "close to" default to use.
   */
  public void setCloseToDefault(double value) {
    m_CloseToDefault = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String closeToToleranceTipText() {
    return "The value below which values are considered close to.";
  }


  /**
   * Get the "close to" Tolerance.
   *
   * @return     the "close to" Tolerance.
   */
  public double getCloseToTolerance() {
    return m_CloseToTolerance;
  }


  /**
   * Set the "close to" Tolerance. 
   *
   * @param value  the "close to" Tolerance to use.
   */
  public void setCloseToTolerance(double value) {
    m_CloseToTolerance = value;
  }


  /**
   * Returns the tip text for this property
   *
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String attributeIndicesTipText() {
    return "The selection of columns to use in the cleansing processs, first and last are valid indices.";
  }


  /**
   * Gets the selection of the columns, e.g., first-last or first-3,5-last
   *
   * @return     the selected indices
   */
  public String getAttributeIndices() {
    return m_Cols.getRanges();
  }


  /**
   * Sets the columns to use, e.g., first-last or first-3,5-last
   *
   * @param value   the columns to use
   */
  public void setAttributeIndices(String value) {
    m_Cols.setRanges(value);
  }


  /**
   * Returns the tip text for this property
   *
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String invertSelectionTipText() {
    return "If enabled the selection of the columns is inverted.";
  }


  /**
   * Gets whether the selection of the columns is inverted
   *
   * @return     true if the selection is inverted
   */
  public boolean getInvertSelection() {
    return m_Cols.getInvert();
  }


  /**
   * Sets whether the selection of the indices is inverted or not
   *
   * @param value   the new invert setting
   */
  public void setInvertSelection(boolean value) {
    m_Cols.setInvert(value);
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String includeClassTipText() {
    return "If disabled, the class attribute will be always left out of the cleaning process.";
  }


  /**
   * Gets whether the class is included in the cleaning process or always 
   * skipped.
   *
   * @return     true if the class can be considered for cleaning.
   */
  public boolean getIncludeClass() {
    return m_IncludeClass;
  }


  /**
   * Sets whether the class can be cleaned, too.
   *
   * @param value  true if the class can be cleansed, too
   */
  public void setIncludeClass(boolean value) {
    m_IncludeClass = value;
  }


  /**
   * Returns the tip text for this property
   * 
   * @return     tip text for this property suitable for
   *       displaying in the explorer/experimenter gui
   */
  public String decimalsTipText() {
    return "The number of decimals to round to, -1 means no rounding at all.";
  }


  /**
   * Get the number of decimals to round to. 
   *
   * @return     the number of decimals.
   */
  public int getDecimals() {
    return m_Decimals;
  }


  /**
   * Set the number of decimals to round to.
   *
   * @param value  the number of decimals.
   */
  public void setDecimals(int value) {
    m_Decimals = value;
  }
  
  /**
   * Returns the revision string.
   * 
   * @return    the revision
   */
  public String getRevision() {
    return RevisionUtils.extract("$Revision: 1.2 $");
  }


  /**
   * Runs the filter from commandline, use "-h" to see all options.
   * 
   * @param args the commandline options for the filter
   */
  public static void main(String[] args) {
    runFilter(new NumericCleaner(), args);
  }
}
// Source Code of weka.filters.unsupervised.attribute.NumericCleaner

// Related Classes of weka.filters.unsupervised.attribute.NumericCleaner