Package com.enigmastation.extractors.impl

Source Code of com.enigmastation.extractors.impl.LuceneStemmingWordLister

package com.enigmastation.extractors.impl;

import com.enigmastation.extractors.WordLister;

import javolution.util.FastSet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.snowball.SnowballFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;

import java.io.IOException;
import java.io.StringReader;
import java.util.Collection;
import java.util.Locale;
import java.util.Set;

/**
* This is an implementation of a wordlister that uses Lucene's Snowball
* stemming implementation. It's... okay. In the training corpus, it was just
* as accurate as the SimpleWordLister, but almost twice as slow:
* <p/>
* 2146 items, 36 misses: 98.32% accuracy and 6ms/item.
* <p/>
* Yuck. Use this puppy ONLY if you really need Snowball.
*/
public class LuceneStemmingWordLister extends SimpleWordLister {
    static final Set<String> emptySet = new FastSet<String>();

    public void addWords(Object obj, Collection<String> features) {
        String document = obj.toString().toLowerCase();
        StandardTokenizer tokenizer = new StandardTokenizer(new StringReader(document));
        tokenizer.setMaxTokenLength(20);
        TokenFilter psf = new SnowballFilter(tokenizer, "English");
        Token t;
        StringBuilder sb = new StringBuilder();
        try {
            while ((t = psf.next()) != null) {
                sb.setLength(0);
                sb.append(t.termBuffer(), 0, t.termLength());
                //System.out.println(sb.toString());
                features.add(sb.toString());
            }
        } catch (IOException e) {
            // should never happen! We're reading a flippin' STRING!
            e.printStackTrace();
        }
    }
}
TOP

Related Classes of com.enigmastation.extractors.impl.LuceneStemmingWordLister

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.