/*******************************************************************************
* Copyright 2012 University of Southern California
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* This code was developed by the Information Integration Group as part
* of the Karma project at the Information Sciences Institute of the
* University of Southern California. For more information, publications,
* and related projects, please see: http://www.isi.edu/integration
******************************************************************************/
package edu.isi.karma.cleaning.features;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Vector;
import org.antlr.runtime.ANTLRStringStream;
import org.antlr.runtime.CharStream;
import org.antlr.runtime.Token;
import edu.isi.karma.cleaning.TNode;
import edu.isi.karma.cleaning.Tokenizer;
/**
 * Feature set that builds one counting feature and one moving feature per
 * punctuation symbol, plus counting features for blank, symbol, lower-case
 * word and number tokens, over two collections of tokenized strings.
 *
 * <p>
 * The instance keeps the token sequences and feature names it has produced;
 * every call to {@link #computeFeatures(Collection, Collection)} appends to
 * that state and nothing in this class clears it.
 */
public class RegularityFeatureSet implements FeatureSet {
    /** Token sequences of the strings passed as {@code examples}. */
    public ArrayList<Vector<TNode>> tokenseqs;
    /** Token sequences of the strings passed as {@code oexamples}. */
    public ArrayList<Vector<TNode>> otokenseqs;
    /** Names of all features created so far, in creation order. */
    public Vector<String> fnames;
    /**
     * Symbol strings followed by the labels "bnk", "syb", "wrd" and "num".
     * Not referenced inside this class.
     */
    public static String[] targets = { "#", ";", ",", "!", "~", "@", "$", "%",
            "^", "&", "*", "(", ")", "_", "-", "{", "}", "[", "]", "\"", "\'",
            ":", "?", "<", ">", ".", "bnk", "syb", "wrd", "num" };

    /** Creates a feature set with empty token-sequence and name lists. */
    public RegularityFeatureSet() {
        this.tokenseqs = new ArrayList<Vector<TNode>>();
        this.otokenseqs = new ArrayList<Vector<TNode>>();
        this.fnames = new Vector<String>();
    }

    /**
     * Splits a string into {@link TNode}s using {@link Tokenizer}.
     *
     * @param Org the text to tokenize
     * @return the nodes in the order the tokens were produced; a token whose
     *         type code is not one of the five recognised codes is still
     *         emitted, with node type -1
     */
    public Vector<TNode> tokenizer(String Org) {
        Tokenizer lexer = new Tokenizer(new ANTLRStringStream(Org));
        Vector<TNode> nodes = new Vector<TNode>();
        // Type -1 ends the stream (this matches ANTLR's end-of-input token
        // type).
        for (Token tok = lexer.nextToken(); tok.getType() != -1; tok = lexer
                .nextToken()) {
            nodes.add(new TNode(toNodeType(tok.getType()), tok.getText()));
        }
        return nodes;
    }

    /**
     * Translates a token type code from {@link Tokenizer} into a TNode type.
     * NOTE(review): the numeric codes are presumably constants of the
     * generated lexer - confirm against the grammar before changing them.
     */
    private static int toNodeType(int lexerType) {
        switch (lexerType) {
        case 15:
            return TNode.UWRDTYP;
        case 4:
            return TNode.BNKTYP;
        case 10:
            return TNode.NUMTYP;
        case 12:
            return TNode.SYBSTYP;
        case 9:
            return TNode.LWRDTYP;
        default:
            return -1;
        }
    }

    /**
     * Tokenizes both collections, stores the sequences on this instance and
     * builds the features over them.
     *
     * @param examples  strings whose token sequences go to {@link #tokenseqs}
     * @param oexamples strings whose token sequences go to {@link #otokenseqs}
     * @return all counting features (per symbol, then blank, symbol,
     *         lower-case word, number) followed by all moving features (per
     *         symbol); their names are also appended to {@link #fnames}
     */
    public Collection<Feature> computeFeatures(Collection<String> examples,
            Collection<String> oexamples) {
        for (String example : examples) {
            this.tokenseqs.add(this.tokenizer(example));
        }
        for (String oexample : oexamples) {
            this.otokenseqs.add(this.tokenizer(oexample));
        }

        String[] symbols = { "#", ";", ",", "!", "~", "@", "$", "%", "^", "&",
                "*", "(", ")", "_", "-", "{", "}", "[", "]", "\"", "'", ":",
                "?", "<", ">", "." };
        Vector<CntFeature> counters = new Vector<CntFeature>(symbols.length);
        Vector<MovFeature> movers = new Vector<MovFeature>(symbols.length);
        for (String symbol : symbols) {
            // The counting and the moving feature of a symbol share one
            // single-node pattern.
            Vector<TNode> pattern = new Vector<TNode>();
            pattern.add(new TNode(TNode.SYBSTYP, symbol));

            CntFeature counter = new CntFeature(this.otokenseqs,
                    this.tokenseqs, pattern);
            counter.setName("entr_cnt_" + symbol);
            counters.add(counter);

            MovFeature mover = new MovFeature(this.otokenseqs,
                    this.tokenseqs, pattern);
            mover.setName("entr_mov" + symbol);
            movers.add(mover);
        }

        // Counters that match any token of a given type. A counter for
        // TNode.UWRDTYP used to be sketched here in commented-out code; it
        // is not part of the feature set.
        counters.add(newAnyTokenCounter(TNode.BNKTYP, "entr_cnt_bnk"));
        counters.add(newAnyTokenCounter(TNode.SYBSTYP, "entr_cnt_syb"));
        counters.add(newAnyTokenCounter(TNode.LWRDTYP, "entr_cnt_lwrd"));
        counters.add(newAnyTokenCounter(TNode.NUMTYP, "entr_cnt_num"));

        Vector<Feature> features = new Vector<Feature>();
        features.addAll(counters);
        features.addAll(movers);
        for (Feature feature : features) {
            fnames.add(feature.getName());
        }
        return features;
    }

    /**
     * Builds a named counting feature whose pattern is a single node of the
     * given type with text {@code TNode.ANYTOK}.
     */
    private CntFeature newAnyTokenCounter(int nodeType, String name) {
        Vector<TNode> pattern = new Vector<TNode>();
        pattern.add(new TNode(nodeType, TNode.ANYTOK));
        CntFeature counter = new CntFeature(this.otokenseqs, this.tokenseqs,
                pattern);
        counter.setName(name);
        return counter;
    }

    /**
     * Increments the bucket of {@code buk} that corresponds to {@code a}:
     * bucket {@code i} covers {@code [i/10, (i+1)/10)} for i = 0..8 and
     * bucket 9 covers {@code [0.9, 1.0]}. Values outside {@code [0, 1]}
     * (including NaN) leave the array untouched.
     *
     * @param a   the value to classify
     * @param buk the bucket counters, modified in place
     */
    public static void buildEntropy(double a, int[] buk) {
        if (!(a >= 0.0 && a <= 1.0)) {
            return;
        }
        // Exclusive upper bounds of buckets 0..8; the rest goes to bucket 9.
        double[] upperBounds = { 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9 };
        for (int bucket = 0; bucket < upperBounds.length; bucket++) {
            if (a < upperBounds[bucket]) {
                buk[bucket] += 1;
                return;
            }
        }
        buk[9] += 1;
    }

    /**
     * Computes the Shannon entropy (natural logarithm) of the distribution
     * obtained by normalising the counts in {@code a}.
     *
     * @param a the counts
     * @return the entropy, or {@code Math.log(10)} when the counts sum to
     *         zero (presumably the maximum entropy for ten buckets - confirm
     *         with callers)
     */
    public static double calShannonEntropy(int[] a) {
        int total = 0;
        for (int i = 0; i < a.length; i++) {
            total += a[i];
        }
        if (total == 0) {
            return Math.log(10);
        }
        double entropy = 0.0;
        for (int count : a) {
            double p = (double) count / total;
            // Zero-probability entries contribute nothing to the sum.
            if (p != 0) {
                entropy -= p * Math.log(p);
            }
        }
        return entropy;
    }

    /**
     * Returns the live list of feature names collected by
     * {@link #computeFeatures(Collection, Collection)}.
     */
    public Collection<String> getFeatureNames() {
        return this.fnames;
    }
}