Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.TokenStream.incrementToken()
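
The snippets below all follow the same consumption contract: add the attributes you need, reset() the stream, loop while incrementToken() returns true, then end() and close(). Here is a minimal self-contained sketch of that contract, assuming the Lucene 3.x-era API these examples target (the class name, analyzer choice, field name, and sample text are placeholders):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class IncrementTokenDemo {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("some sample text"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                    // mandatory before the first incrementToken() call
        while (ts.incrementToken()) {  // returns false once the stream is exhausted
          System.out.println(termAtt.toString());
        }
        ts.end();                      // records end-of-stream state (e.g. final offset)
        ts.close();                    // releases the underlying Reader
      }
    }

Each incrementToken() call updates the attribute objects in place, which is why several snippets below copy the term's contents out of termAtt before advancing the stream.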


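Collecting one SpanTermQuery per analyzed token, for example to build a span query over an analyzed phrase: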
      ArrayList<SpanQuery> clausesList = new ArrayList<SpanQuery>();
      TokenStream ts = analyzer.reusableTokenStream(fieldName, new StringReader(value));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      ts.reset();
      while (ts.incrementToken()) {
        // one SpanTermQuery per analyzed token
        SpanTermQuery stq = new SpanTermQuery(new Term(fieldName, termAtt.toString()));
        clausesList.add(stq);
      }
      ts.end();
      ts.close();


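Capping the number of tokens consumed via a counter (the remainder of the loop body is elided in the source):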
      TokenStream ts = analyzer.reusableTokenStream(fieldName, r);
      int tokenCount = 0;
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      // for every token, up to the configured cap
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        // ... ('word' is processed further in the full source)
      }

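Accumulating analyzed terms into a BooleanQuery while skipping optional stop words and duplicates; the tail of the loop is elided in the source, so the dedup check shown below is the one implied by the 'already' set: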
    TokenStream ts = a.reusableTokenStream(field, new StringReader(body));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();

    BooleanQuery tmp = new BooleanQuery();
    Set<String> already = new HashSet<String>(); // ignore dups
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      // ignore optional stop words
      if (stop != null && stop.contains(word)) continue;
      // ignore dups
      if (!already.add(word)) continue;
      // ... ('word' is then added as a clause of 'tmp' in the full source)
    }

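Draining a stream into a term list by tracking incrementToken()'s boolean result in a local variable (stream, terms, and processTerms come from the surrounding class):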
          boolean hasMoreTokens = false;
         
          stream.reset();
          final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

          hasMoreTokens = stream.incrementToken();
          while (hasMoreTokens) {
            terms.add(termAtt.toString());
            hasMoreTokens = stream.incrementToken();
          }
          processTerms(terms.toArray(new String[terms.size()]));

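From a multi-threaded analyzer test: record the analyzer's output for random inputs before spawning worker threads (_TestUtil, map, and numThreads come from the test harness):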
    for (int i = 0; i < numTestPoints; i++) {
      String term = _TestUtil.randomSimpleString(random);
      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
      CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      assertTrue(ts.incrementToken());
      // ensure we make a copy of the actual bytes too
      map.put(term, encodedBytes.toString());
    }
   
    Thread threads[] = new Thread[numThreads];

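The per-thread verification loop of the same test: each recorded input must be re-analyzed to exactly the recorded output: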
              String term = mapping.getKey();
              String expected = mapping.getValue();
              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
              CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
              ts.reset();
              assertTrue(ts.incrementToken());
              assertEquals(expected, encodedBytes.toString());
            }
          } catch (IOException e) {
            throw new RuntimeException(e);
          }

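Writing label-tab-terms lines using the older TermAttribute API, whose termBuffer()/termLength() expose the term's char buffer directly (TermAttribute was later deprecated in favor of CharTermAttribute):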
      TokenStream ts = analyzer.tokenStream(label, reader);
      writer.write(label);
      writer.write('\t'); // tab separator, to match Hadoop's TextInputFormat
      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        writer.write(termBuffer, 0, termLen);
        writer.write(' ');
      }
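For comparison, a sketch of the same buffer-based loop on the replacement API: CharTermAttribute's buffer() and length() correspond to termBuffer() and termLength(). The class and method names here are illustrative, not from the original code.

    import java.io.IOException;
    import java.io.Writer;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class TermWriter {
      // Hypothetical helper: write each analyzed term, space-separated.
      static void writeTerms(TokenStream ts, Writer writer) throws IOException {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // buffer()/length() are the CharTermAttribute counterparts of
          // termBuffer()/termLength(); no per-token String is allocated
          writer.write(termAtt.buffer(), 0, termAtt.length());
          writer.write(' ');
        }
        ts.end();
        ts.close();
      }
    }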

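Turning a Reader into a String array of terms; the method's closing lines are elided in the source and are completed below from its signature: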
  public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);

    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.termBuffer();
      int termLen = termAtt.termLength();
      String val = new String(termBuffer, 0, termLen);
      coll.add(val);
    }
    ts.close();
    return coll.toArray(new String[coll.size()]);
  }

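Inside a Hadoop mapper: strip the Wikipedia <text> wrapper, analyze the document, and emit the category key with the space-joined terms as the value: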
      // strip the opening and closing <text> tags, then unescape HTML entities
      document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
        WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
          .replaceAll(""));
      TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
      TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
      }
      // key: category with non-alphanumeric runs collapsed to '_'; value: space-joined terms
      output.collect(new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
          .replaceAll("_")), new Text(contents.toString()));
    }
