Package edu.isi.karma.cleaning.Research

Source Code of edu.isi.karma.cleaning.Research.Anchor

/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*   http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California.  For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/

package edu.isi.karma.cleaning.Research;

import java.io.File;
import java.io.FileReader;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;

import edu.isi.karma.cleaning.Ruler;
import edu.isi.karma.cleaning.TNode;
import au.com.bytecode.opencsv.CSVReader;

public class RecordDistiller {
  // {anchor:{"Id": , "Count": , "LefContext":[], "RigContext":[]}
  public static int cxt_limit = 3;
  public int totalnumber = 0;
  public HashMap<String, Anchor> anchors = new HashMap<String, Anchor>();

  public void readRecord(String ID, Vector<TNode> record) {
    HashMap<String, Integer> curIndices = new HashMap<String, Integer>();
    for (int i = 1; i < record.size() - 1; i++) // skip the start and end
                          // token
    {
      TNode t = record.get(i);
      String type = t.getType();
      String anchor = type;
      if (curIndices.containsKey(type)) {
        int cnt = curIndices.get(type);
        curIndices.put(type, cnt + 1);
        anchor += cnt;
      } else {
        curIndices.put(type, 0);
        anchor += "0";
      }
      // get left and right context
      String lcxt = "";
      String rcxt = "";
      for (int j = i; j < i + RecordDistiller.cxt_limit
          && j < record.size(); j++) {
        rcxt += record.get(j).getType();
      }
      for (int j = i; j >= 0 && j > i - cxt_limit; j--) {
        lcxt += record.get(j).getType();
      }
      // update the anchor repository
      if (this.anchors.containsKey(anchor)) {
        Anchor an = this.anchors.get(anchor);
        an.IDs.add(ID);
        an.count += 1;
        an.lefCxt.put(ID, lcxt);
        an.rigCxt.put(ID, rcxt);
      } else {
        Vector<String> Ids = new Vector<String>();
        Ids.add(ID);
        HashMap<String, String> vlcxt = new HashMap<String, String>();
        vlcxt.put(ID, lcxt);
        HashMap<String, String> vrcxt = new HashMap<String, String>();
        vrcxt.put(ID, rcxt);
        Anchor nan = new Anchor(anchor, Ids, 1, vlcxt, vrcxt);
        anchors.put(anchor, nan);
      }
    }
  }

  // identify the anchor tokens
  public void idenAnchor(int total) {
    Vector<String> dels = new Vector<String>();
    for (String a : anchors.keySet()) {
      int count = anchors.get(a).count;
      // if an anchor appears in more 10% records, it's a valid anchor
      if (count * 1.0 / total < 0.1) {
        dels.add(a);
      }
    }
    for (String s : dels) {
      anchors.remove(s);
    }
  }

  // identify the representative records of one anchor.
  // minimal set
  public HashSet<String> idenAnchorRecords(String anchor) {
    HashMap<String, Vector<String>> lcxt2ids = new HashMap<String, Vector<String>>();
    HashMap<String, Vector<String>> rcxt2ids = new HashMap<String, Vector<String>>();
    for (String Id : this.anchors.get(anchor).lefCxt.keySet()) {
      String s = this.anchors.get(anchor).lefCxt.get(Id);
      boolean isnew = true;
      for (String elem : lcxt2ids.keySet()) {
        if (elem.indexOf(s) == 0) {
          s = elem;
          isnew = false;
        }
      }
      if (isnew) {
        Vector<String> vs = new Vector<String>();
        vs.add(Id);
        lcxt2ids.put(s, vs);
      } else {
        lcxt2ids.get(s).add(Id);
      }
    }
    for (String Id : this.anchors.get(anchor).rigCxt.keySet()) {
      String s = this.anchors.get(anchor).rigCxt.get(Id);
      boolean isnew = true;
      for (String elem : rcxt2ids.keySet()) {
        if (elem.indexOf(s) == 0) {
          s = elem;
          isnew = false;
        }
      }
      if (isnew) {
        Vector<String> vs = new Vector<String>();
        vs.add(Id);
        rcxt2ids.put(s, vs);
      } else {
        rcxt2ids.get(s).add(Id);
      }
    }
    // generate candiate set
    HashSet<String> result = new HashSet<String>();
    for (String cxt : lcxt2ids.keySet()) {
      if (lcxt2ids.get(cxt).size() != 0) {
        String idString = lcxt2ids.get(cxt).get(0);
        if (!result.contains(idString)) {
          result.add(idString);
        }
      }
    }
    for (String cxt : rcxt2ids.keySet()) {
      if (rcxt2ids.get(cxt).size() != 0) {
        String idString = rcxt2ids.get(cxt).get(0);
        if (!result.contains(idString)) {
          result.add(idString);
        }
      }
    }
    return result;
  }

  // merge the record sets generated by each anchor
  // return the final Record ID list
  public HashSet<String> refineRecords() {
    HashSet<String> ids = new HashSet<String>();
    // find the union of the ids of all anchors
    for (String anchor : this.anchors.keySet()) {
      HashSet<String> set = this.idenAnchorRecords(anchor);
      ids.addAll(set);
    }
    return ids;
  }

  public static void main(String[] args) {
    String dirpath = "/Users/bowu/Research/testdata/TestSingleFile";
    RecordDistiller distiller = new RecordDistiller();
    File nf = new File(dirpath);
    File[] allfiles = nf.listFiles();
    for (File f : allfiles) {
      try {
        if (f.getName().indexOf(".csv") == (f.getName().length() - 4)) {
          @SuppressWarnings("resource")
          CSVReader cr = new CSVReader(new FileReader(f), ',', '"',
              '\0');
          String[] pair;
          int id = 0;
          HashMap<String, String> id2String = new HashMap<String, String>();
          while ((pair = cr.readNext()) != null) {
            if (pair == null || pair.length <= 1)
              break;
            Ruler ruler = new Ruler();
            ruler.setNewInput(pair[0]);
            distiller.readRecord("" + id, ruler.vec);
            id2String.put("" + id, pair[0]);
            id++;
          }
          distiller.idenAnchor(id2String.keySet().size());
          HashSet<String> allids = distiller.refineRecords();
          for (String xid : allids) {
            System.out.println(id2String.get(xid));
          }
          double compressRate = (allids.size() * 1.0)
              / id2String.keySet().size();
          for (String name : distiller.anchors.keySet()) {
            System.out.println("Anchor: " + name);
            for (String dString : distiller.anchors.get(name).IDs) {
              System.out.print(" " + dString);
            }
            System.out.println("\n");
          }
          System.out.println("" + compressRate);
        }
      } catch (Exception e) {
        System.out.println("" + e.toString());
      }
    }
  }
}

class Anchor {
  public String name;
  public Vector<String> IDs;
  public int count;
  // id 2 the left context
  public HashMap<String, String> lefCxt = new HashMap<String, String>();
  // id 2 the right context
  public HashMap<String, String> rigCxt = new HashMap<String, String>();

  public Anchor(String anchor, Vector<String> Ids, int count,
      HashMap<String, String> lcxt, HashMap<String, String> rcxt) {
    this.name = anchor;
    this.IDs = Ids;
    this.count = count;
    this.lefCxt = lcxt;
    this.rigCxt = rcxt;
  }
}
TOP

Related Classes of edu.isi.karma.cleaning.Research.Anchor

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.