Source Code of edu.isi.karma.cleaning.QuestionableRecord.OutlierDetector

/*******************************************************************************
 * Copyright 2012 University of Southern California
 * 
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 * 
 *   http://www.apache.org/licenses/LICENSE-2.0
 * 
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * 
 * This code was developed by the Information Integration Group as part 
 * of the Karma project at the Information Sciences Institute of the 
 * University of Southern California.  For more information, publications, 
 * and related projects, please see: http://www.isi.edu/integration
 ******************************************************************************/


package edu.isi.karma.cleaning.QuestionableRecord;


import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map.Entry;
import java.util.Vector;


import edu.isi.karma.cleaning.Ruler;
import edu.isi.karma.cleaning.TNode;


public class OutlierDetector {
  public HashMap<String, double[]> rVectors = new HashMap<String, double[]>();
  public double currentMax = -1;
  public HashSet<String> dict = new HashSet<String>();


  public OutlierDetector() {


  }


  public double getDistance(double[] x, double[] y) {
    if (x == null || y==null ||x.length != y.length )
      return Double.MAX_VALUE;
    double value = 0.0;
    for (int i = 0; i < x.length; i++) {
      value += Math.pow(x[i] - y[i], 2);
    }
    return Math.sqrt(value);
  }


  // find all the word appearing in org and tar records
  public void buildDict(Collection<String[]> data) {
    HashMap<String, Integer> mapHashSet = new HashMap<String, Integer>();
    for (String[] pair : data) {
      String s1 = pair[0];
      String s2 = pair[1];
      if (s1.contains("<_START>")) {
        s1 = s1.replace("<_START>", "");
      }
      if (s1.contains("<_END>")) {
        s1 = s1.replace("<_END>", "");
      }
      if (s2.contains("<_START>")) {
        s2 = s2.replace("<_START>", "");
      }
      if (s2.contains("<_END>")) {
        s2 = s2.replace("<_END>", "");
      }
      Ruler r = new Ruler();
      r.setNewInput(s1);
      Vector<TNode> v = r.vec;
      r.setNewInput(s2);
      v.addAll(r.vec);
      HashSet<String> curRow = new HashSet<String>();
      for (TNode t : v) {
        String k = t.text;
        k = k.replaceAll("[0-9]+", "DIGITs");
        if (k.trim().length() == 0)
          continue;
        // only consider K once in one row
        if (curRow.contains(k)) {
          continue;
        } else {
          curRow.add(k);
        }
        if (mapHashSet.containsKey(k)) {
          mapHashSet.put(k, mapHashSet.get(k) + 1);
        } else {
          mapHashSet.put(k, 1);
        }
      }
    }
    // prune infrequent terms
    int thresdhold = (int) (data.size() * 0.5);
    Iterator<Entry<String, Integer>> iter = mapHashSet.entrySet()
        .iterator();
    while (iter.hasNext()) {
      Entry<String, Integer> e = iter.next();
      if (e.getValue() < thresdhold) {
        iter.remove();
      }
    }
    this.dict = new HashSet<String>(mapHashSet.keySet());
  }


  // find outliers for one partition
  // simple 2d distance
  // testdata rowid:{tar, tarcolor}
  public String getOutliers(HashMap<String, String[]> testdata,
      double[] meanVector, double Max, HashSet<String> dic) {
    String Id = "";
    for (String key : testdata.keySet()) {
      String[] vpair = testdata.get(key);
      FeatureVector fvFeatureVector = new FeatureVector(dic);
      Vector<RecFeature> vRecFeatures = fvFeatureVector.createVector(
          vpair[0], vpair[1]);
      double[] x = new double[fvFeatureVector.size()];
      for (int i = 0; i < vRecFeatures.size(); i++) {
        x[i] = vRecFeatures.get(i).computerScore();
      }
      double value = this.getDistance(x, meanVector);
      if (value > Max) {
        Max = value;
        this.currentMax = Max;
        Id = key;
      }
    }
    return Id;
  }


  // pid: [{rawstring, code}]
  public void buildMeanVector(HashMap<String, Vector<String[]>> data,
      HashSet<String> dict) {
    rVectors.clear();
    this.currentMax = -1;
    if (data == null)
      return;
    for (String key : data.keySet()) {
      Vector<String[]> vs = data.get(key);
      FeatureVector fVector = new FeatureVector(dict);
      double[] dvec = new double[fVector.size()];
      for (int i = 0; i < dvec.length; i++) {
        dvec[i] = 0;
      }
      for (String[] elem : vs) {
        Vector<RecFeature> sFeatures = fVector.createVector(elem[0],
            elem[1]);
        for (int j = 0; j < sFeatures.size(); j++) {
          dvec[j] += sFeatures.get(j).computerScore();
        }
      }
      // get average size
      for (int i = 0; i < dvec.length; i++) {
        dvec[i] = dvec[i] * 1.0 / vs.size();
      }
      rVectors.put(key, dvec);
    }
  }


  public String test(double[] row) {
    String string = "";
    for (double d : row) {
      string += d + ",";
    }
    return string.substring(1, string.length() - 1);
  }


}
Source Code of edu.isi.karma.cleaning.QuestionableRecord.OutlierDetector

Related Classes of edu.isi.karma.cleaning.QuestionableRecord.OutlierDetector