package ivory.core.tokenize;
import ivory.core.Constants;
import java.io.StringReader;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.log4j.Level;
import org.apache.log4j.Logger;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.standard.StandardFilter;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.util.Version;
import edu.umd.hooka.VocabularyWritable;
import edu.umd.hooka.alignment.HadoopAlign;
public class LuceneAnalyzer extends ivory.core.tokenize.Tokenizer {
private static final Logger LOG = Logger.getLogger(LuceneAnalyzer.class);
static{
LOG.setLevel(Level.WARN);
}
private Tokenizer tokenizer;
private Stemmer stemmer;
private int lang;
private static final int SPANISH = 0, TURKISH = 1, CZECH = 2;
private static final String[] classes = {
"org.tartarus.snowball.ext.spanishStemmer",
"org.tartarus.snowball.ext.turkishStemmer",
"ivory.core.tokenize.CzechStemmer"};
@Override
public void configure(Configuration conf) {
configure(conf, null);
}
@Override
public void configure(Configuration conf, FileSystem fs) {
if (conf.getBoolean(Constants.Stemming, true)) {
setLanguageAndStemmer(conf.get(Constants.Language));
isStemming = true;
}else {
setLanguage(conf.get(Constants.Language));
}
// read stopwords from file (stopwords will be empty set if file does not exist or is empty)
String stopwordsFile = conf.get(Constants.StopwordList);
stopwords = readInput(fs, stopwordsFile);
String stemmedStopwordsFile = conf.get(Constants.StemmedStopwordList);
stemmedStopwords = readInput(fs, stemmedStopwordsFile);
isStopwordRemoval = !stopwords.isEmpty();
VocabularyWritable vocab;
try {
vocab = (VocabularyWritable) HadoopAlign.loadVocab(new Path(conf.get(Constants.CollectionVocab)), fs);
setVocab(vocab);
} catch (Exception e) {
LOG.warn("No vocabulary provided to tokenizer.");
vocab = null;
}
LOG.warn("Stemming is " + isStemming + "; Stopword removal is " + isStopwordRemoval +"; number of stopwords: " + stopwords.size() +"; stemmed: " + stemmedStopwords.size());
}
public void setLanguage(String l){
if(l.equalsIgnoreCase("spanish") || l.equalsIgnoreCase("es")){
lang = SPANISH;
}else if (l.equalsIgnoreCase("turkish") || l.equalsIgnoreCase("tr")){
lang = TURKISH;
}else if (l.equalsIgnoreCase("czech") || l.equalsIgnoreCase("cs") || l.equalsIgnoreCase("cz")){
lang = CZECH;
}else{
LOG.warn("Language not recognized, setting to English!");
}
}
@SuppressWarnings("unchecked")
public void setLanguageAndStemmer(String l){
setLanguage(l);
Class<? extends Stemmer> stemClass;
try {
stemClass = (Class<? extends Stemmer>) Class.forName(classes[lang]);
stemmer = (Stemmer) stemClass.newInstance();
} catch (ClassNotFoundException e) {
LOG.warn("Stemmer class not recognized!\n" + classes[lang]);
stemmer = null;
return;
} catch (Exception e) {
e.printStackTrace();
throw new RuntimeException(e);
}
}
@Override
public String[] processContent(String text) {
tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(text));
TokenStream tokenStream = new StandardFilter(Version.LUCENE_35, tokenizer);
tokenStream = new LowerCaseFilter(Version.LUCENE_35, tokenStream);
String tokenized = postNormalize(streamToString(tokenStream));
StringBuilder finalTokenized = new StringBuilder();
for (String token : tokenized.split(" ")) {
if ( isStopwordRemoval() && isDiscard(false, token) ) {
continue;
}
String stemmedToken = stem(token);
if ( vocab != null && vocab.get(stemmedToken) <= 0) {
continue;
}
finalTokenized.append(stemmedToken + " ");
}
return finalTokenized.toString().trim().split(" ");
}
@Override
public String stem(String token) {
if ( stemmer != null ) {
return stemmer.toStem(token);
}else {
return token;
}
}
@Override
public float getOOVRate(String text, VocabularyWritable vocab) {
int countOOV = 0, countAll = 0;
tokenizer = new StandardTokenizer(Version.LUCENE_35, new StringReader(text));
TokenStream tokenStream = new StandardFilter(Version.LUCENE_35, tokenizer);
tokenStream = new LowerCaseFilter(Version.LUCENE_35, tokenStream);
String tokenized = postNormalize(streamToString(tokenStream));
for (String token : tokenized.split(" ")) {
if ( isStopwordRemoval() && isDiscard(false, token) ) {
continue;
}
String stemmedToken = stem(token);
if ( vocab != null && vocab.get(stemmedToken) <= 0) {
countOOV++;
}
countAll++;
}
return (countOOV / (float) countAll);
}
}