Package org.apache.lucene.analysis

Examples of org.apache.lucene.analysis.TokenStream.incrementToken()
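
The snippets below all follow the same consumption contract: add the attributes you need, reset() the stream, loop while incrementToken() returns true, then end() and close(). Here is a minimal self-contained sketch of that contract, assuming the Lucene 3.x-era API these examples target (the class name, analyzer choice, field name, and sample text are placeholders):

    import java.io.IOException;
    import java.io.StringReader;

    import org.apache.lucene.analysis.Analyzer;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.standard.StandardAnalyzer;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
    import org.apache.lucene.util.Version;

    public class IncrementTokenDemo {
      public static void main(String[] args) throws IOException {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        TokenStream ts = analyzer.reusableTokenStream("body", new StringReader("some sample text"));
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                    // mandatory before the first incrementToken() call
        while (ts.incrementToken()) {  // returns false once the stream is exhausted
          System.out.println(termAtt.toString());
        }
        ts.end();                      // records end-of-stream state (e.g. final offset)
        ts.close();                    // releases the underlying Reader
      }
    }

Each incrementToken() call updates the attribute objects in place, which is why several snippets below copy the term's contents out of termAtt before advancing the stream.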


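Collecting one SpanTermQuery per analyzed token, for example to build a span query over an analyzed phrase: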
      ArrayList<SpanQuery> clausesList = new ArrayList<SpanQuery>();
      TokenStream ts = analyzer.reusableTokenStream(fieldName, new StringReader(value));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);

      ts.reset();
      while (ts.incrementToken()) {
        // one SpanTermQuery per analyzed token
        SpanTermQuery stq = new SpanTermQuery(new Term(fieldName, termAtt.toString()));
        clausesList.add(stq);
      }
      ts.end();
      ts.close();


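Capping the number of tokens consumed via a counter (the remainder of the loop body is elided in the source):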
      TokenStream ts = analyzer.reusableTokenStream(fieldName, r);
      int tokenCount = 0;
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      // for every token, up to the configured cap
      while (ts.incrementToken()) {
        String word = termAtt.toString();
        tokenCount++;
        if (tokenCount > maxNumTokensParsed) {
          break;
        }
        // ... ('word' is processed further in the full source)
      }

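Accumulating analyzed terms into a BooleanQuery while skipping optional stop words and duplicates; the tail of the loop is elided in the source, so the dedup check shown below is the one implied by the 'already' set: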
    TokenStream ts = a.reusableTokenStream(field, new StringReader(body));
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();

    BooleanQuery tmp = new BooleanQuery();
    Set<String> already = new HashSet<String>(); // ignore dups
    while (ts.incrementToken()) {
      String word = termAtt.toString();
      // ignore optional stop words
      if (stop != null && stop.contains(word)) continue;
      // ignore dups
      if (!already.add(word)) continue;
      // ... ('word' is then added as a clause of 'tmp' in the full source)
    }

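Draining a stream into a term list by tracking incrementToken()'s boolean result in a local variable (stream, terms, and processTerms come from the surrounding class):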
          boolean hasMoreTokens = false;
         
          stream.reset();
          final CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);

          hasMoreTokens = stream.incrementToken();
          while (hasMoreTokens) {
            terms.add(termAtt.toString());
            hasMoreTokens = stream.incrementToken();
          }
          processTerms(terms.toArray(new String[terms.size()]));

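From a multi-threaded analyzer test: record the analyzer's output for random inputs before spawning worker threads (_TestUtil, map, and numThreads come from the test harness):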
    for (int i = 0; i < numTestPoints; i++) {
      String term = _TestUtil.randomSimpleString(random);
      TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
      CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      assertTrue(ts.incrementToken());
      // ensure we make a copy of the actual bytes too
      map.put(term, encodedBytes.toString());
    }
   
    Thread threads[] = new Thread[numThreads];

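The per-thread verification loop of the same test: each recorded input must be re-analyzed to exactly the recorded output: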
              String term = mapping.getKey();
              String expected = mapping.getValue();
              TokenStream ts = analyzer.reusableTokenStream("fake", new StringReader(term));
              CharTermAttribute encodedBytes = ts.addAttribute(CharTermAttribute.class);
              ts.reset();
              assertTrue(ts.incrementToken());
              assertEquals(expected, encodedBytes.toString());
            }
          } catch (IOException e) {
            throw new RuntimeException(e);
          }

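Writing label-tab-terms lines using the older TermAttribute API, whose termBuffer()/termLength() expose the term's char buffer directly (TermAttribute was later deprecated in favor of CharTermAttribute):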
      TokenStream ts = analyzer.tokenStream(label, reader);
      writer.write(label);
      writer.write('\t'); // tab separator, to match Hadoop's TextInputFormat
      TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        char[] termBuffer = termAtt.termBuffer();
        int termLen = termAtt.termLength();
        writer.write(termBuffer, 0, termLen);
        writer.write(' ');
      }
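For comparison, a sketch of the same buffer-based loop on the replacement API: CharTermAttribute's buffer() and length() correspond to termBuffer() and termLength(). The class and method names here are illustrative, not from the original code.

    import java.io.IOException;
    import java.io.Writer;

    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    final class TermWriter {
      // Hypothetical helper: write each analyzed term, space-separated.
      static void writeTerms(TokenStream ts, Writer writer) throws IOException {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
          // buffer()/length() are the CharTermAttribute counterparts of
          // termBuffer()/termLength(); no per-token String is allocated
          writer.write(termAtt.buffer(), 0, termAtt.length());
          writer.write(' ');
        }
        ts.end();
        ts.close();
      }
    }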

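Turning a Reader into a String array of terms; the method's closing lines are elided in the source and are completed below from its signature: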
  public static String[] readerToDocument(Analyzer analyzer, Reader reader) throws IOException {
    TokenStream ts = analyzer.tokenStream("", reader);

    List<String> coll = new ArrayList<String>();
    TermAttribute termAtt = ts.addAttribute(TermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      char[] termBuffer = termAtt.termBuffer();
      int termLen = termAtt.termLength();
      String val = new String(termBuffer, 0, termLen);
      coll.add(val);
    }
    ts.close();
    return coll.toArray(new String[coll.size()]);
  }

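Inside a Hadoop mapper: strip the Wikipedia <text> wrapper, analyze the document, and emit the category key with the space-joined terms as the value: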
      // strip the opening and closing <text> tags, then unescape HTML entities
      document = StringEscapeUtils.unescapeHtml(WikipediaDatasetCreatorMapper.CLOSE_TEXT_TAG_PATTERN.matcher(
        WikipediaDatasetCreatorMapper.OPEN_TEXT_TAG_PATTERN.matcher(document).replaceFirst(""))
          .replaceAll(""));
      TokenStream stream = analyzer.tokenStream(catMatch, new StringReader(document));
      TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        contents.append(termAtt.termBuffer(), 0, termAtt.termLength()).append(' ');
      }
      // key: category with non-alphanumeric runs collapsed to '_'; value: space-joined terms
      output.collect(new Text(WikipediaDatasetCreatorMapper.SPACE_NON_ALPHA_PATTERN.matcher(catMatch)
          .replaceAll("_")), new Text(contents.toString()));
    }
