Package org.apache.mahout.vectorizer.encoders

Examples of org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder


    return vectorize(counts, w, normalize, dimension);
  }

  static Vector vectorize(Multiset<String> doc, CorpusWeighting w, boolean normalize, int dimension) {
    Vector v = new RandomAccessSparseVector(dimension);
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
    for (String word : doc.elementSet()) {
      encoder.addToVector(word, w.weight(word, doc.count(word)), v);
    }
    if (normalize) {
      return v.assign(Functions.div(v.norm(2)));
    } else {
      return v;
View Full Code Here


import org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder;

public class TokenizingAndVectorizingText {

  public static void main(String[] args) throws IOException {
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("text");
    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_31);    

    StringReader in = new StringReader("text to magically vectorize");
    TokenStream ts = analyzer.tokenStream("body", in);
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);

    Vector v1 = new RandomAccessSparseVector(100);                  
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.termBuffer();
      int termLen = termAtt.termLength();
      String w = new String(termBuffer, 0, termLen);                
      encoder.addToVector(w, 1, v1);                                
    }
    System.out.printf("%s\n", new SequentialAccessSparseVector(v1));
  }
View Full Code Here

  public static void main(String[] args) {
    File base = new File(args[0]);
    overallCounts = HashMultiset.create();

    Map<String, Set<Integer>> traceDictionary = new TreeMap<String, Set<Integer>>();
    FeatureVectorEncoder encoder = new StaticWordValueEncoder("body");
    encoder.setProbes(2);
    encoder.setTraceDictionary(traceDictionary);
    FeatureVectorEncoder bias = new ConstantValueEncoder("Intercept");
    bias.setTraceDictionary(traceDictionary);
    FeatureVectorEncoder lines = new ConstantValueEncoder("Lines");
    lines.setTraceDictionary(traceDictionary);
    Dictionary newsGroups = new Dictionary();
View Full Code Here

TOP

Related Classes of org.apache.mahout.vectorizer.encoders.StaticWordValueEncoder

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.