Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.Tokenizer
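
The snippets below all follow the same TokenStream contract: a Tokenizer is
wrapped in zero or more TokenFilters, then consumed with reset(),
incrementToken(), end(), and close(). As a reference point, here is a minimal,
self-contained sketch of that loop against the Lucene 4.3-era API used
throughout this page (the class name and sample input are illustrative):

  import java.io.IOException;
  import java.io.StringReader;
  import org.apache.lucene.analysis.Tokenizer;
  import org.apache.lucene.analysis.core.WhitespaceTokenizer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  import org.apache.lucene.util.Version;

  public class TokenizerLoop {
    public static void main(String[] args) throws IOException {
      Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_43,
          new StringReader("hello token stream"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsets = tokenizer.addAttribute(OffsetAttribute.class);
      tokenizer.reset();                 // required before the first incrementToken()
      while (tokenizer.incrementToken()) {
        System.out.println(term + " [" + offsets.startOffset() + "," + offsets.endOffset() + ")");
      }
      tokenizer.end();                   // records the final offset
      tokenizer.close();
    }
  }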


  // LUCENE-3642: normalize SMP->BMP and check that offsets are correct
  public void testCrossPlaneNormalization() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
          @Override
          protected int normalize(int c) {
            if (c > 0xffff) {
              return 'δ';
            } else {
              return c;
            }
          }
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    // the full test feeds random Unicode strings through the analyzer and
    // asserts that every [start,end) offset still covers only letters
    checkRandomData(random(), analyzer, 1000 * RANDOM_MULTIPLIER);
  }


  // LUCENE-3642: normalize BMP->SMP and check that offsets are correct
  public void testCrossPlaneNormalization2() throws IOException {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new LetterTokenizer(TEST_VERSION_CURRENT, reader) {
          @Override
          protected int normalize(int c) {
            if (c <= 0xffff) {
              return 0x1043C;
            } else {
              return c;
            }
          }
        };
        return new TokenStreamComponents(tokenizer, tokenizer);
      }
    };
    // as above: random strings go through the analyzer and the offsets are
    // re-checked after the BMP->SMP mapping
    checkRandomData(random(), analyzer, 1000 * RANDOM_MULTIPLIER);
  }
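
Both tests override CharTokenizer.normalize(int), which receives whole Unicode
code points rather than Java chars, so a supplementary-plane (SMP) character
arrives as a single int above 0xffff. A quick illustration of that distinction
using only java.lang APIs (U+10438 is a Deseret letter, like the 0x1043C
mapped to above):

  String deseret = "\uD801\uDC38";                // U+10438 as a surrogate pair
  assert deseret.length() == 2;                   // two UTF-16 code units...
  assert deseret.codePointCount(0, 2) == 1;       // ...but one code point
  int cp = deseret.codePointAt(0);                // cp == 0x10438 > 0xffff,
  assert Character.isSupplementaryCodePoint(cp);  // so the SMP branch fires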

  // so in this case we behave like WDF, and preserve any modified offsets
  public void testInvalidOffsets() throws Exception {
    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        TokenFilter filters = new ASCIIFoldingFilter(tokenizer);
        filters = new EdgeNGramTokenFilter(Version.LUCENE_43, filters, EdgeNGramTokenFilter.Side.FRONT, 2, 15);
        return new TokenStreamComponents(tokenizer, filters);
      }
    };
    // "mosfellsbær" ASCII-folds to "mosfellsbaer" (12 chars) while the input
    // is 11 chars long; every gram must keep the original 0..11 offsets
    assertAnalyzesTo(analyzer, "mosfellsbær",
        new String[] { "mo", "mos", "mosf", "mosfe", "mosfel", "mosfell", "mosfells", "mosfellsb", "mosfellsba", "mosfellsbae", "mosfellsbaer" },
        new int[]    {    0,     0,      0,       0,        0,         0,          0,           0,            0,             0,              0 },
        new int[]    {   11,    11,     11,      11,       11,        11,         11,          11,           11,            11,             11 });
  }

  // helper: split a sentence on single spaces into a List of words
  static List<String> strings(String str) {
    String[] arr = str.split(" ");
    return Arrays.asList(arr);
  }

  static void assertTokenizesTo(SlowSynonymMap dict, String input,
      String expected[]) throws IOException {
    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected);
  }
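
For context, a hedged sketch of how this helper is driven elsewhere in
TestSlowSynonymFilter; the SlowSynonymMap.add(matchWords, replacementTokens,
includeOrig, merge) call and the tokens(...) helper are recalled from that
test class and are not part of this excerpt:

  SlowSynonymMap map = new SlowSynonymMap(true);       // ignoreCase
  boolean orig = false;                                // drop the matched words
  boolean merge = true;                                // merge into existing entries
  map.add(strings("a b"), tokens("ab"), orig, merge);  // "a b" => "ab"
  assertTokenizesTo(map, "a b c", new String[] { "ab", "c" });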

  /** blast some random strings through the analyzer */
  public void testRandomStrings() throws Exception {
    for (int i = 0; i < 10; i++) {
      final int min = _TestUtil.nextInt(random(), 2, 10);
      final int max = _TestUtil.nextInt(random(), min, 20);
   
      Analyzer a = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
          return new TokenStreamComponents(tokenizer,
            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, min, max));
        }   
      };
      checkRandomData(random(), a, 100*RANDOM_MULTIPLIER);
    }
   
    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new MockTokenizer(reader, MockTokenizer.WHITESPACE, false);
        return new TokenStreamComponents(tokenizer,
            new EdgeNGramTokenFilter(Version.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 4));
      }   
    };
    checkRandomData(random(), b, 1000*RANDOM_MULTIPLIER, 20, false, false);
  }

  public void testEmptyTerm() throws Exception {
    Random random = random();
    Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer,
            new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tokenizer, EdgeNGramTokenFilter.Side.FRONT, 2, 15));
      }   
    };
    checkAnalysisConsistency(random, a, random.nextBoolean(), "");
   
    Analyzer b = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
        Tokenizer tokenizer = new KeywordTokenizer(reader);
        return new TokenStreamComponents(tokenizer,
            new EdgeNGramTokenFilter(Version.LUCENE_43, tokenizer, EdgeNGramTokenFilter.Side.BACK, 2, 15));
      }   
    };
    checkAnalysisConsistency(random, b, random.nextBoolean(), "");
  }

  static void assertTokenizesTo(SlowSynonymMap dict, String input,
      String expected[], int posIncs[]) throws IOException {
    Tokenizer tokenizer = new MockTokenizer(new StringReader(input), MockTokenizer.WHITESPACE, false);
    SlowSynonymFilter stream = new SlowSynonymFilter(tokenizer, dict);
    assertTokenStreamContents(stream, expected, posIncs);
  }

  public void testEndingHole() throws Exception {
    // Just deletes "of"
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
          return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
        }
      };
    // (the test's assertions on input ending in the stopword are not shown in this excerpt)
  }

  public void testTwoEndingHoles() throws Exception {
    // Just deletes "of"
    Analyzer a = new Analyzer() {
        @Override
        public TokenStreamComponents createComponents(String field, Reader reader) {
          Tokenizer tokenizer = new MockTokenizer(reader);
          CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
          return new TokenStreamComponents(tokenizer, new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet));
        }
      };
    // (the test's assertions on input with two trailing stopwords are not shown in this excerpt)
  }
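
Both "hole" tests rely on StopFilter bumping the position increment of the
token that follows a removed stopword. A minimal sketch of observing that gap
directly, written as the body of a BaseTokenStreamTestCase test so that
MockTokenizer and TEST_VERSION_CURRENT are in scope:

  Tokenizer tokenizer = new MockTokenizer(new StringReader("game of thrones"));
  CharArraySet stopSet = StopFilter.makeStopSet(TEST_VERSION_CURRENT, "of");
  TokenStream stream = new StopFilter(TEST_VERSION_CURRENT, tokenizer, stopSet);
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    // prints "game(+1)" then "thrones(+2)"; the +2 marks the hole left by "of"
    System.out.println(term + "(+" + posInc.getPositionIncrement() + ")");
  }
  stream.end();
  stream.close();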

      // From a Pig EvalFunc<DataBag> UDF; the enclosing signature below is an
      // assumption, not part of the original excerpt:
      public DataBag exec(Tuple input) throws IOException {
          DataBag output = mBagFactory.newDefaultBag();
          Object o = input.get(0);
          if (!(o instanceof String)) {
              throw new IOException("Expected input to be chararray, but got " + o.getClass().getName());
          }
          Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String)o));
          TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source);
          tokenstream.reset();
          while (tokenstream.incrementToken()){
            String token = tokenstream.getAttribute(CharTermAttribute.class).toString();
            output.add(mTupleFactory.newTuple(token));
          }
          tokenstream.end();
          tokenstream.close();
          return output;
      }
