Package edu.isi.karma.controller.command.cleaning

Source Code of edu.isi.karma.controller.command.cleaning.GenerateCleaningRulesCommand

/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California.  For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/
package edu.isi.karma.controller.command.cleaning;

import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Vector;

import org.json.JSONArray;
import org.json.JSONObject;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import edu.isi.karma.cleaning.DataPreProcessor;
import edu.isi.karma.cleaning.ExampleSelection;
import edu.isi.karma.cleaning.Messager;
import edu.isi.karma.cleaning.Ruler;
import edu.isi.karma.cleaning.TNode;
import edu.isi.karma.cleaning.UtilTools;
import edu.isi.karma.cleaning.Research.ConfigParameters;
import edu.isi.karma.cleaning.Research.DataCollection;
import edu.isi.karma.controller.command.CommandException;
import edu.isi.karma.controller.command.CommandType;
import edu.isi.karma.controller.command.WorksheetSelectionCommand;
import edu.isi.karma.controller.command.selection.SuperSelection;
import edu.isi.karma.controller.update.CleaningResultUpdate;
import edu.isi.karma.controller.update.UpdateContainer;
import edu.isi.karma.rep.HNodePath;
import edu.isi.karma.rep.Node;
import edu.isi.karma.rep.Worksheet;
import edu.isi.karma.rep.Workspace;
import edu.isi.karma.rep.cleaning.RamblerTransformationExample;
import edu.isi.karma.rep.cleaning.RamblerTransformationInputs;
import edu.isi.karma.rep.cleaning.RamblerTransformationOutput;
import edu.isi.karma.rep.cleaning.RamblerValueCollection;
import edu.isi.karma.rep.cleaning.TransformationExample;
import edu.isi.karma.rep.cleaning.ValueCollection;
public class GenerateCleaningRulesCommand extends WorksheetSelectionCommand {
  // Id compared against the leaf id of every column path in doIt to pick the column to clean.
  final String hNodeId;
  // Transformation examples parsed from the JSON string handed to the constructor.
  private Vector<TransformationExample> examples;
  // Ids of the nodes doIt works on; collected nodes whose id is not in this set are skipped.
  private HashSet<String> nodeIds = new HashSet<String>();
  // Built by doIt from the examples, the collected values, the preprocessor and the messager.
  RamblerTransformationInputs inputs;
  // doIt appends every collected original value to this string, each followed by "\n".
  public String compResultString = "";
  // Shared SLF4J logger for this command class.
  private static Logger logger = LoggerFactory.getLogger(GenerateCleaningRulesCommand.class);

  public GenerateCleaningRulesCommand(String id, String worksheetId,
      String hNodeId, String examples, String cellIDs, String selectionId) {
    super(id, worksheetId, selectionId);
    this.hNodeId = hNodeId;
    this.nodeIds = parseNodeIds(cellIDs);
    ConfigParameters cfg = new ConfigParameters();
    cfg.initeParameters();
    DataCollection.config = cfg.getString();
    this.examples = parseExample(examples);
  }

  private HashSet<String> parseNodeIds(String Ids) {
    HashSet<String> tSet = new HashSet<String>();
    try {
      JSONArray jsa = new JSONArray(Ids);
      for (int i = 0; i < jsa.length(); i++) {
        tSet.add(jsa.getString(i));
      }

    } catch (Exception e) {
      logger.error("" + e.toString());
    }
    return tSet;
  }

  public static Vector<TransformationExample> parseExample(String example) {
    Vector<TransformationExample> x = new Vector<TransformationExample>();
    try {
      JSONArray jsa = new JSONArray(example);
      for (int i = 0; i < jsa.length(); i++) {
        String[] ary = new String[3];
        JSONObject jo = (JSONObject) jsa.get(i);
        String nodeid = (String) jo.get("nodeId");
        String before = (String) jo.getString("before");
        String after = (String) jo.getString("after");
        ary[0] = nodeid;
        ary[1] = "<_START>" + before + "<_END>";
        ary[2] = after;
        TransformationExample re = new RamblerTransformationExample(
            ary[1], ary[2], ary[0]);
        x.add(re);
      }
    } catch (Exception ex) {
      logger.error("" + ex.toString());
    }
    return x;
  }

  private String getBestExample(HashMap<String, String[]> xHashMap,
      HashMap<String, Vector<String[]>> expFeData) {
    ExampleSelection es = new ExampleSelection();
    es.inite(xHashMap, expFeData);
    return es.Choose();
  }
  @Override
  public String getCommandName() {
    return GenerateCleaningRulesCommand.class.getSimpleName();
  }

  @Override
  public String getTitle() {
    return "Generate Cleaning Rules";
  }

  @Override
  public String getDescription() {
    // TODO Auto-generated method stub
    return null;
  }

  @Override
  public CommandType getCommandType() {
    return CommandType.notInHistory;
  }

  @Override
  public UpdateContainer doIt(Workspace workspace) throws CommandException {
    Worksheet wk = workspace.getWorksheet(worksheetId);
    SuperSelection selection = getSuperSelection(wk);
    String msg = String.format("Gen rule start,Time,%d, Worksheet,%s",System.currentTimeMillis(),worksheetId);
    logger.info(msg);
    // Get the HNode
    HashMap<String, String> rows = new HashMap<String, String>();
    HashMap<String, Integer> amb = new HashMap<String, Integer>();
    HNodePath selectedPath = null;
    List<HNodePath> columnPaths = wk.getHeaders().getAllPaths();
    for (HNodePath path : columnPaths) {
      if (path.getLeaf().getId().equals(hNodeId)) {
        selectedPath = path;
      }
    }
   
    Collection<Node> nodes = new ArrayList<Node>();
    wk.getDataTable().collectNodes(selectedPath, nodes, selection);
    for (Node node : nodes) {
      String id = node.getId();
      if (!this.nodeIds.contains(id))
        continue;
      String originalVal = node.getValue().asString();
      rows.put(id, originalVal);
      this.compResultString += originalVal + "\n";
      calAmbScore(id, originalVal, amb);
    }
    RamblerValueCollection vc = new RamblerValueCollection(rows);
    HashMap<String, Vector<String[]>> expFeData = new HashMap<String, Vector<String[]>>();
    Messager mg = null;
    if(wk.getMsg()!= null)
    {
      mg = (Messager) wk.getMsg();
    }
    else
    {
      mg = new Messager();
      wk.setMsg(mg);
    }
    DataPreProcessor dp = null;
    if(wk.getDpp()!= null)
    {
      dp = (DataPreProcessor) wk.getDpp();
    }
    else
    {
      dp = new DataPreProcessor(rows.values());
      dp.run();
      wk.setDpp(dp);
    }
    inputs = new RamblerTransformationInputs(examples, vc,dp,mg);
    // generate the program
    boolean results = false;
    int iterNum = 0;
    RamblerTransformationOutput rtf = null;
    // initialize the vocabulary
    Iterator<String> iterx = inputs.getInputValues().getValues().iterator();
    Vector<String> v = new Vector<String>();
    int vb_cnt = 0;
    while(iterx.hasNext() && vb_cnt < 30)
    {
      String eString = iterx.next();
      v.add(eString);
      vb_cnt ++;
    }
    while (iterNum < 1 && !results) // try to find an program within iterNum
    {
      rtf = new RamblerTransformationOutput(inputs);
      if (rtf.getTransformations().keySet().size() > 0) {
        results = true;
      }
      iterNum++;
    }
    Iterator<String> iter = rtf.getTransformations().keySet().iterator();
    // id:{org: tar: orgdis: tardis: }
    HashMap<String, HashMap<String, String>> resdata = new HashMap<String, HashMap<String, String>>();
    HashSet<String> keys = new HashSet<String>();
    while (iter.hasNext()) {
      String tpid = iter.next();
      ValueCollection rvco = rtf.getTransformedValues_debug(tpid);
      if (rvco == null)
        continue;
      // constructing displaying data
      HashMap<String, String[]> xyzHashMap = new HashMap<String, String[]>();
      for (String key : rvco.getNodeIDs()) {
        HashMap<String, String> dict = new HashMap<String, String>();
        // add to the example selection
        boolean isExp = false;
        String org = vc.getValue(key);
        String classLabel = rvco.getClass(key);
        String pretar = rvco.getValue(key);
        String dummyValue = pretar;
        if(pretar.indexOf("_FATAL_ERROR_")!= -1)
        {
          dummyValue = org;
          //dummyValue = "#ERROR";
        }
        try
        {
          UtilTools.StringColorCode(org, dummyValue, dict);
        }
        catch(Exception ex)
        {
          logger.info(String.format("ColorCoding Exception%s, %s", org,dummyValue));
          //set dict
          dict.put("Org", org);
          dict.put("Tar", "ERROR");
          dict.put("Orgdis", org);
          dict.put("Tardis", "ERROR");
        }
        for (TransformationExample exp : examples) {
          if (exp.getNodeId().compareTo(key) == 0) {
            if (!expFeData.containsKey(classLabel)) {
              Vector<String[]> vstr = new Vector<String[]>();
              String[] texp = {dict.get("Org"), pretar};
              vstr.add(texp);
              expFeData.put(classLabel, vstr);
            } else {
              String[] texp = { dict.get("Org"), pretar };
              expFeData.get(classLabel).add(texp);
            }
            isExp = true;
          }
        }

        if (!isExp) {
          String[] pair = { dict.get("Org"), dict.get("Tar"), pretar,
              classLabel };
          xyzHashMap.put(key, pair);
        }
        resdata.put(key, dict);
      }
      if(!rtf.nullRule)
        keys.add(getBestExample(xyzHashMap, expFeData));
    }
    // find the best row
    String vars = "";
    String expstr = "";
    String recmd = "";
    for(TransformationExample x:examples)
    {
      expstr += String.format("%s|%s", x.getBefore(),x.getAfter());
    }
    expstr += "|";
    if(rtf.nullRule)
    {
      keys.clear();
      //keys.add("-2"); // "-2 indicates null rule"
    }
    if(!resdata.isEmpty() && !rtf.nullRule)
    {
      recmd = resdata.get(keys.iterator().next()).get("Org");
    }
    else
    {
      recmd = "";
    }
    msg = String.format("Gen rule end, Time,%d, Worksheet,%s,Examples:%s,Recmd:%s",System.currentTimeMillis(),worksheetId,expstr,recmd);
    logger.info(msg);
    return new UpdateContainer(new CleaningResultUpdate(hNodeId, resdata,
        vars, keys));
  }

  public String getVarJSON(HashMap<String, HashSet<String>> values) {
    JSONObject jsobj = new JSONObject();
    try {
      for (String key : values.keySet()) {
        JSONArray jsonArray = new JSONArray();
        HashSet<String> vs = values.get(key);
        for (String v : vs)
          jsonArray.put(v);
        jsobj.put(key, jsonArray);
      }
    } catch (Exception e) {
      logger.error("value generation error");
    }
    return jsobj.toString();
  }

  public void calAmbScore(String id, String org, HashMap<String, Integer> amb) {
    Ruler ruler = new Ruler();
    ruler.setNewInput(org);
    Vector<TNode> tNodes = ruler.vec;
    int tcnt = 1;
    for (int i = 0; i < tNodes.size(); i++) {
      if (tNodes.get(i).text.compareTo(" ") == 0)
        continue;
      for (int j = 0; j > i && j < tNodes.size(); j++) {
        if (tNodes.get(j).sameNode(tNodes.get(i))) {
          tcnt++;
        }
      }
    }
    amb.put(id, tcnt);
  }

  public void updateCandiScore(ValueCollection rvco,
      HashMap<String, HashMap<String, Integer>> values) {
    Iterator<String> ids = rvco.getNodeIDs().iterator();
    while (ids.hasNext()) {
      String id = ids.next();
      String value = rvco.getValue(id);
      HashMap<String, Integer> dict;
      if (values.containsKey(id)) {
        dict = values.get(id);
      } else {
        dict = new HashMap<String, Integer>();
        values.put(id, dict);
      }
      if (dict.containsKey(value)) {
        dict.put(value, dict.get(value) + 1);
      } else {
        dict.put(value, 1);
      }
    }
    return;
  }

  public HashMap<String, Double> getScore(HashMap<String, Integer> dicts,
      HashMap<String, HashMap<String, Integer>> values, boolean sw) {

    int topKsize = 1;
    if (sw)
      topKsize = Integer.MAX_VALUE;
    HashMap<String, Double> topK = new HashMap<String, Double>();
    Iterator<String> iditer = dicts.keySet().iterator();
    while (iditer.hasNext()) {
      String id = iditer.next();
      HashMap<String, Integer> hm = values.get(id);
      int div = 0;
      div = hm.keySet().size();
      double score = div;
      if (topK.keySet().size() < topKsize && div > 1) {
        topK.put(id, score);
      } else {
        String[] keys = topK.keySet().toArray(
            new String[topK.keySet().size()]);
        for (String key : keys) {
          if (topK.get(key) < score) {
            topK.remove(key);
            topK.put(id, score);
          }
        }
      }
    }
    return topK;
  }
  @Override
  public UpdateContainer undoIt(Workspace workspace) {
    // TODO Auto-generated method stub
    return null;
  }

  /* save for future use
   * public static void main(String[] args) {
   *
    String dirpath = "/Users/bowu/Research/testdata/TestSingleFile";
    File nf = new File(dirpath);
    File[] allfiles = nf.listFiles();
    for (File f : allfiles) {
      try {
        if (f.getName().indexOf(".csv") == (f.getName().length() - 4)) {

          CSVReader cr = new CSVReader(new FileReader(f), '\t');
          String[] pair;
          int isadded = 0;
          HashMap<String, String> tx = new HashMap<String, String>();
          int i = 0;
          Vector<TransformationExample> vrt = new Vector<TransformationExample>();
          while ((pair = cr.readNext()) != null) {

            pair[0] = "<_START>" + pair[0] + "<_END>";
            tx.put(i + "", pair[0]);
            if (isadded < 2) {
              RamblerTransformationExample tmp = new RamblerTransformationExample(
                  pair[0], pair[1], i + "");
              vrt.add(tmp);
              isadded++;
            }
            i++;
          }
          RamblerValueCollection vc = new RamblerValueCollection(tx);
          RamblerTransformationInputs inputs = new RamblerTransformationInputs(
              vrt, vc);
          // generate the program
          RamblerTransformationOutput rtf = new RamblerTransformationOutput(
              inputs);
          HashMap<String, Vector<String>> js2tps = new HashMap<String, Vector<String>>();
          Iterator<String> iter = rtf.getTransformations().keySet()
              .iterator();
          Vector<ValueCollection> vvc = new Vector<ValueCollection>();
          while (iter.hasNext()) {
            String tpid = iter.next();
            ValueCollection rvco = rtf.getTransformedValues(tpid);
            vvc.add(rvco);
            String reps = rvco.getJson().toString();
            if (js2tps.containsKey(reps)) {
              js2tps.get(reps).add(tpid); // update the variance
                            // dic
            } else {
              Vector<String> tps = new Vector<String>();
              tps.add(tpid);
              js2tps.put(reps, tps);
            }
          }
          // //////
          if (js2tps.keySet().size() == 0) {
            logger.debug("No Rules have been found");
            return;
          }
          for (String s : js2tps.keySet()) {
            logger.debug("" + s);
          }
        }
      } catch (Exception ex) {
        logger.error("" + ex.toString());
      }
    }
  }*/
TOP

Related Classes of edu.isi.karma.controller.command.cleaning.GenerateCleaningRulesCommand

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.