Package org.apache.lucene.analysis.core

Examples of org.apache.lucene.analysis.core.WhitespaceTokenizer


    assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  }

  public void testRetainMockAttribute() throws Exception {
    CharArraySet dict = makeDictionary("abc", "d", "efg");
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        new StringReader("abcdefg"));
    TokenStream stream = new MockRetainAttributeFilter(tokenizer);
    stream = new DictionaryCompoundWordTokenFilter(
        TEST_VERSION_CURRENT, stream, dict,
        CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
View Full Code Here


    final TokenizerFactory factory = tokenizerFactory == null ? null : loadTokenizerFactory(loader, tokenizerFactory);
   
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = factory == null ? new WhitespaceTokenizer(Version.LUCENE_CURRENT, reader) : factory.create(reader);
        TokenStream stream = ignoreCase ? new LowerCaseFilter(Version.LUCENE_CURRENT, tokenizer) : tokenizer;
        return new TokenStreamComponents(tokenizer, stream);
      }
    };
View Full Code Here

  }

 
  static void assertAlgorithm(Encoder encoder, boolean inject, String input,
      String[] expected) throws Exception {
    Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
        new StringReader(input));
    PhoneticFilter filter = new PhoneticFilter(tokenizer, encoder, inject);
    assertTokenStreamContents(filter, expected);
  }
View Full Code Here

import org.apache.lucene.util.TestUtil;

public class DoubleMetaphoneFilterTest extends BaseTokenStreamTestCase {

  public void testSize4FalseInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "ANTR" });
  }
View Full Code Here

    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "ANTR" });
  }

  public void testSize4TrueInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
    assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
  }
View Full Code Here

    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, true);
    assertTokenStreamContents(filter, new String[] { "international", "ANTR" });
  }

  public void testAlternateInjectFalse() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("Kuczewski"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
  }
View Full Code Here

    TokenStream filter = new DoubleMetaphoneFilter(stream, 4, false);
    assertTokenStreamContents(filter, new String[] { "KSSK", "KXFS" });
  }

  public void testSize8FalseInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("international"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
  }
View Full Code Here

    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "ANTRNXNL" });
  }

  public void testNonConvertableStringsWithInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
  }
View Full Code Here

    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, true);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
  }

  public void testNonConvertableStringsWithoutInject() throws Exception {
    TokenStream stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%&"));
    TokenStream filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&" });
   
    // should have something after the stream
    stream = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("12345 #$%@#^%& hello"));
    filter = new DoubleMetaphoneFilter(stream, 8, false);
    assertTokenStreamContents(filter, new String[] { "12345", "#$%@#^%&", "HL" });
  }
View Full Code Here

                           TRI_GRAM_POSITION_INCREMENTS_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS,
                           TRI_GRAM_TYPES_POS_INCR_GREATER_THAN_N_WITHOUT_UNIGRAMS, false);
  }
 
  public void testReset() throws Exception {
    Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader("please divide this sentence"));
    TokenStream filter = new ShingleFilter(wsTokenizer, 2);
    assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
    );
    wsTokenizer.setReader(new StringReader("please divide this sentence"));
    assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19}, new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
View Full Code Here

TOP

Related Classes of org.apache.lucene.analysis.core.WhitespaceTokenizer

Copyright © 2018 www.massapicom. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.