Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.TokenStream.incrementToken()

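Every example below follows the same contract: add the attributes you want to read, call reset() once, call incrementToken() until it returns false (each successful call advances the shared attributes to the next token), then call end() and close(). A minimal sketch of that loop, assuming Lucene 4.x; the StandardAnalyzer, field name "body", and sample text here are placeholders:

    import java.io.StringReader;
    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_43);
    try (TokenStream ts = analyzer.tokenStream("body", new StringReader("quick brown fox"))) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); // register before reset()
      ts.reset();                   // mandatory: no tokens can be pulled before reset()
      while (ts.incrementToken()) { // false means the stream is exhausted
        System.out.println(termAtt.toString());
      }
      ts.end();                     // records the end-of-stream offset state
    }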

      // Tokenize a random unicode string and verify, via OffsetAttribute, that every
      // code point inside each token's offset range is a letter. ('analyzer' is defined
      // in the enclosing test.)
      String s = _TestUtil.randomUnicodeString(random());
      TokenStream ts = analyzer.tokenStream("foo", s);
      try {
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          // walk the token code point by code point (supplementary characters span two chars)
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }
        ts.end();
      } finally {
        ts.close();
      }


    // Wrap the stream in an edge n-gram filter and assert one gram per length in
    // [minGram, min(codePointCount, maxGram)]. (s, minGram, maxGram, codePointCount
    // and the wrapped tokenizer 'tk' are set up earlier in the excerpted test.)
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());              // each gram reports the offsets...
      assertEquals(s.length(), offsetAtt.endOffset());       // ...of the whole input
      final int end = Character.offsetByCodePoints(s, 0, i); // gram length i counted in code points
      assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken()); // no grams remain past maxGram
  }

    // Old (pre-4.0) attribute API: TermAttribute exposes each term as a char[] buffer.
    // 'result' (the TokenStream) and 'filter' (the encoder) are defined earlier in
    // the excerpted source.
    TermAttribute termAtt = result.addAttribute(TermAttribute.class);
    StringBuilder buf = new StringBuilder();
    try {
      result.reset(); // position the stream before the first token
      while (result.incrementToken()) {
        String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        buf.append(filter.encode(word)).append(" ");
      }
      // ... (remainder of the excerpt elided)

    // Tokenize a string and hash each term into a Mahout feature vector.
    StringReader in = new StringReader("text to magically vectorize");
    TokenStream ts = analyzer.tokenStream("body", in);
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    ts.reset(); // position the stream before the first token

    Vector v1 = new RandomAccessSparseVector(100); // sparse vector with 100 dimensions
    while (ts.incrementToken()) {
      String w = new String(termAtt.termBuffer(), 0, termAtt.termLength());
      encoder.addToVector(w, 1, v1); // 'encoder' (defined earlier) hashes the term into v1
    }

    // Remove English stop words, then keep only terms of length >= 3 that match the
    // 'alphabets' pattern ('result' and 'alphabets' are defined earlier in the
    // excerpted source).
    result = new StopFilter(true, result, StandardAnalyzer.STOP_WORDS_SET);

    TermAttribute termAtt = result.addAttribute(TermAttribute.class);
    StringBuilder buf = new StringBuilder();
    try {
      result.reset(); // position the stream before the first token
      while (result.incrementToken()) {
        if (termAtt.termLength() < 3) continue; // skip very short terms
        String word = new String(termAtt.termBuffer(), 0, termAtt.termLength());
        Matcher m = alphabets.matcher(word);
        if (m.matches()) {
          // ... (remainder of the excerpt elided)

  }

  private static void countWords(Analyzer analyzer, Collection<String> words, Reader in) throws IOException {
    TokenStream ts = analyzer.tokenStream("text", in);
    // keep one reference to the term attribute rather than looking it up on every token
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      words.add(termAtt.toString());
    }
    /*overallCounts.addAll(words);*/
  }

              throw new IOException("Expected input to be chararray, but got " + o.getClass().getName());
          }
          // Whitespace-tokenize the input, lower-casing while preserving entities,
          // and emit one Pig tuple per token.
          Tokenizer source = new WhitespaceTokenizer(Version.LUCENE_43, new StringReader((String) o));
          TokenStream tokenstream = new LowerCaseEntityPreservingFilter(source);
          CharTermAttribute termAtt = tokenstream.addAttribute(CharTermAttribute.class);
          tokenstream.reset();
          while (tokenstream.incrementToken()) {
            output.add(mTupleFactory.newTuple(termAtt.toString()));
          }
          return output;
      } catch (Exception e) {
          // ... (error handling elided in the excerpt)

      // Collect every analyzed term of 'text' into 'result'.
      TokenStream stream = analyzer.tokenStream("text", new StringReader(text));
      CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        result.add(charTermAttribute.toString());
      }
    } catch (IOException e) {
      e.printStackTrace(); // the excerpt swallows the exception; real code should propagate or log it

  public void baseUIMAAnalyzerStreamTest() {
    try {
      TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      ts.reset(); // required before the first call to incrementToken()
      while (ts.incrementToken()) {
        assertNotNull(offsetAtt);
        assertNotNull(termAtt);
        System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + "," + offsetAtt.endOffset());
      }
    } catch (Exception e) {
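
A closing note on API drift: several excerpts above use the pre-4.0 TermAttribute, which was deprecated in Lucene 3.1 and removed in 4.0 in favor of CharTermAttribute. A sketch of the modern equivalent of the termBuffer()/termLength() pattern (ts stands for any TokenStream):

    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // CharTermAttribute implements CharSequence, so toString() replaces
      // new String(termAtt.termBuffer(), 0, termAtt.termLength())
      String w = termAtt.toString();
      // ... use w ...
    }
    ts.end();
    ts.close();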
