Package org.apache.lucene.analysis.cn.smart

Examples of org.apache.lucene.analysis.cn.smart.WordTokenFilter


    this.stopWords = stopWords;
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenFilter(result);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
    if (stopWords != null) {
View Full Code Here


    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      setPreviousTokenStream(streams);
      streams.tokenStream = new SentenceTokenizer(reader);
      streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
      streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
      if (stopWords != null) {
        streams.filteredTokenStream = new StopFilter(streams.filteredTokenStream, stopWords, false);
      }
    } else {
View Full Code Here

    }
  }
 
  @Override
  public TokenFilter create(TokenStream input) {
      return new WordTokenFilter(input);
  }
View Full Code Here

  }

  @Override
  public TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new SentenceTokenizer(reader);
    TokenStream result = new WordTokenFilter(tokenizer);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
    if (!stopWords.isEmpty()) {
View Full Code Here

  }

  @Override
  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenFilter(result);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
    if (!stopWords.isEmpty()) {
View Full Code Here

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
      streams = new SavedStreams();
      setPreviousTokenStream(streams);
      streams.tokenStream = new SentenceTokenizer(reader);
      streams.filteredTokenStream = new WordTokenFilter(streams.tokenStream);
      streams.filteredTokenStream = new PorterStemFilter(streams.filteredTokenStream);
      if (!stopWords.isEmpty()) {
        streams.filteredTokenStream = new StopFilter(StopFilter.getEnablePositionIncrementsVersionDefault(matchVersion),
                                                     streams.filteredTokenStream, stopWords, false);
      }
View Full Code Here

* <code>words="org/apache/lucene/analysis/cn/smart/stopwords.txt"</code>
* @lucene.experimental
*/
public class SmartChineseWordTokenFilterFactory extends BaseTokenFilterFactory {
  public TokenFilter create(TokenStream input) {
      return new WordTokenFilter(input);
  }
View Full Code Here

* @lucene.experimental
*/
public class SmartChineseWordTokenFilterFactory extends TokenFilterFactory {
  @Override
  public TokenFilter create(TokenStream input) {
      return new WordTokenFilter(input);
  }
View Full Code Here

                log.error(message,e);
                throw new EngineException(this, ci, message, e);
            }
        }
        //now the tokens
        TokenStream tokens = new WordTokenFilter(new AnalyzedTextSentenceTokenizer(at));
        try {
          tokens.reset();
            while(tokens.incrementToken()){
                OffsetAttribute offset = tokens.addAttribute(OffsetAttribute.class);
                Token t = at.addToken(offset.startOffset(), offset.endOffset());
                log.trace("detected {}",t);
            }
        } catch (IOException e) {
            String message = String.format("IOException while reading from "
View Full Code Here

    this.matchVersion = matchVersion;
  }

  public TokenStream tokenStream(String fieldName, Reader reader) {
    TokenStream result = new SentenceTokenizer(reader);
    result = new WordTokenFilter(result);
    // result = new LowerCaseFilter(result);
    // LowerCaseFilter is not needed, as SegTokenFilter lowercases Basic Latin text.
    // The porter stemming is too strict, this is not a bug, this is a feature:)
    result = new PorterStemFilter(result);
    if (stopWords != null) {
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.cn.smart.WordTokenFilter

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.