Examples of CharTermAttribute

org.apache.lucene.analysis.tokenattributes.CharTermAttribute
The term text of a Token.

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

  private void assertCollatesToSame(TokenStream stream1, TokenStream stream2) throws IOException {
    assertCollation(stream1, stream2, 0);
  }
  
  private void assertCollation(TokenStream stream1, TokenStream stream2, int comparison) throws IOException {
    CharTermAttribute term1 = stream1
        .addAttribute(CharTermAttribute.class);
    CharTermAttribute term2 = stream2
        .addAttribute(CharTermAttribute.class);
    stream1.reset();
    stream2.reset();
    assertTrue(stream1.incrementToken());
    assertTrue(stream2.incrementToken());
    assertEquals(Integer.signum(comparison), Integer.signum(term1.toString().compareTo(term2.toString())));
    assertFalse(stream1.incrementToken());
    assertFalse(stream2.incrementToken());
    stream1.end();
    stream2.end();
    stream1.close();

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

      @Override
      protected boolean isTokenChar(int chr) {
        return nonTokenChars.indexOf(chr) < 0;
      }
    };
    final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
    grams.reset();
    for (int start = 0; start < codePoints.length; ++start) {

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

    final int codePointCount = s.codePointCount(0, s.length());
    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int end = Character.offsetByCodePoints(s, 0, i);
      assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
  }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

    List<String> result = new LinkedList<String>();
    try {
      TokenStream stream = null;
      stream = analyzer.tokenStream("text", new StringReader(text));


      CharTermAttribute charTermAttribute = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while(stream.incrementToken()) {
        String term = charTermAttribute.toString();
        result.add(term);
      }
    } catch (IOException e) {
      e.printStackTrace();
    }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute


  @Test
  public void baseUIMAAnalyzerStreamTest() {
    try {
      TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        assertNotNull(offsetAtt);
        assertNotNull(termAtt);
        System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + "," + offsetAtt.endOffset());
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getLocalizedMessage());
    }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

  public void testSimpleUsage() {
    try {
      UIMATypeAwareAnalyzer analyzer = new UIMATypeAwareAnalyzer("/HmmTaggerAggregate.xml",
              "org.apache.uima.TokenAnnotation", "posTag");
      TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      TypeAttribute typeAttr = ts.addAttribute(TypeAttribute.class);
      PositionIncrementAttribute posAtt = ts.addAttribute(PositionIncrementAttribute.class);
      while (ts.incrementToken()) {
        assertNotNull(offsetAtt);
        assertNotNull(termAtt);
        assertNotNull(posAtt);
        assertNotNull(typeAttr);
        System.out.println("token '" + termAtt.toString() + "' has type " + typeAttr.type());
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getLocalizedMessage());
    }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute


  @Test
  public void baseUIMAPayloadsAnalyzerStreamTest() {
    try {
      TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PayloadAttribute payloadAttribute = ts.addAttribute(PayloadAttribute.class);
      while (ts.incrementToken()) {
        assertNotNull(termAtt);
        assertNotNull(payloadAttribute);
        System.out.println("token '" + termAtt.toString() + "' has payload " + new String(payloadAttribute.getPayload().getData()));
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getLocalizedMessage());
    }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

    public List<String> getTermList(String contentText) {
        List<String> result = new ArrayList<String>(contentText.length() / 10);
        
        try {
            TokenStream stream = _analyzer.tokenStream("content", new StringReader(contentText));
            CharTermAttribute termAtt = (CharTermAttribute) stream.addAttribute(CharTermAttribute.class);


            stream.reset();
            while (stream.incrementToken()) {
                if (termAtt.length() > 0) {
                    String term = termAtt.toString();
                        result.add(term);
                }
            }
            stream.end();
            stream.close();

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

  private static List<String> splitByTokenizer(String source, TokenizerFactory tokFactory){
    StringReader reader = new StringReader( source );
    TokenStream ts = loadTokenizer(tokFactory, reader);
    List<String> tokList = new ArrayList<String>();
    try {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      while (ts.incrementToken()){
        if( termAtt.length() > 0 )
          tokList.add( termAtt.toString() );
      }
    } catch (IOException e) {
      throw new RuntimeException(e);
    }
    finally{

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.CharTermAttribute

      source.reset();
    } catch (IOException e) {
      source = analyzer.tokenStream(field, new StringReader(queryText));
    }
    CachingTokenFilter buffer = new CachingTokenFilter(source);
    CharTermAttribute termAtt = null;
    PositionIncrementAttribute posIncrAtt = null;
    int numTokens = 0;


    boolean success = false;
    try {
      buffer.reset();
      success = true;
    } catch (IOException e) {
      // success==false if we hit an exception
    }
    if (success) {
      if (buffer.hasAttribute(CharTermAttribute.class)) {
        termAtt = buffer.getAttribute(CharTermAttribute.class);
      }
      if (buffer.hasAttribute(PositionIncrementAttribute.class)) {
        posIncrAtt = buffer.getAttribute(PositionIncrementAttribute.class);
      }
    }


    int positionCount = 0;
    boolean severalTokensAtSamePosition = false;


    boolean hasMoreTokens = false;
    if (termAtt != null) {
      try {
        hasMoreTokens = buffer.incrementToken();
        while (hasMoreTokens) {
          numTokens++;
          int positionIncrement = (posIncrAtt != null) ? posIncrAtt.getPositionIncrement() : 1;
          if (positionIncrement != 0) {
            positionCount += positionIncrement;
          } else {
            severalTokensAtSamePosition = true;
          }
          hasMoreTokens = buffer.incrementToken();
        }
      } catch (IOException e) {
        // ignore
      }
    }
    try {
      // rewind the buffer stream
      buffer.reset();


      // close original stream - all tokens buffered
      source.close();
    }
    catch (IOException e) {
      // ignore
    }


    if (numTokens == 0)
      return null;
    else if (numTokens == 1) {
      String term = null;
      try {
        boolean hasNext = buffer.incrementToken();
        assert hasNext == true;
        term = termAtt.toString();
      } catch (IOException e) {
        // safe to ignore, because we know the number of tokens
      }
      // return newTermQuery(new Term(field, term));
      return new TermQuery(new Term(field, term));
    } else {
      if (severalTokensAtSamePosition) {
        if (positionCount == 1) {
          // no phrase query:
          // BooleanQuery q = newBooleanQuery(true);
          BooleanQuery q = new BooleanQuery(true);
          for (int i = 0; i < numTokens; i++) {
            String term = null;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            // Query currentQuery = newTermQuery(new Term(field, term));
            Query currentQuery = new TermQuery(new Term(field, term));
            q.add(currentQuery, BooleanClause.Occur.SHOULD);
          }
          return q;
        }
        else {
          // phrase query:
          // MultiPhraseQuery mpq = newMultiPhraseQuery();
          MultiPhraseQuery mpq = new MultiPhraseQuery();
          mpq.setSlop(phraseSlop);
          List multiTerms = new ArrayList();
          int position = -1;
          for (int i = 0; i < numTokens; i++) {
            String term = null;
            int positionIncrement = 1;
            try {
              boolean hasNext = buffer.incrementToken();
              assert hasNext == true;
              term = termAtt.toString();
              if (posIncrAtt != null) {
                positionIncrement = posIncrAtt.getPositionIncrement();
              }
            } catch (IOException e) {
              // safe to ignore, because we know the number of tokens
            }


            if (positionIncrement > 0 && multiTerms.size() > 0) {
              if (enablePositionIncrements) {
                mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
              } else {
                mpq.add((Term[])multiTerms.toArray(new Term[0]));
              }
              multiTerms.clear();
            }
            position += positionIncrement;
            multiTerms.add(new Term(field, term));
          }
          if (enablePositionIncrements) {
            mpq.add((Term[])multiTerms.toArray(new Term[0]),position);
          } else {
            mpq.add((Term[])multiTerms.toArray(new Term[0]));
          }
          return mpq;
        }
      }
      else {
        // PhraseQuery pq = newPhraseQuery();
        PhraseQuery pq = new PhraseQuery();
        pq.setSlop(phraseSlop);
        int position = -1;




        for (int i = 0; i < numTokens; i++) {
          String term = null;
          int positionIncrement = 1;


          try {
            boolean hasNext = buffer.incrementToken();
            assert hasNext == true;
            term = termAtt.toString();
            if (posIncrAtt != null) {
              positionIncrement = posIncrAtt.getPositionIncrement();
            }
          } catch (IOException e) {
            // safe to ignore, because we know the number of tokens

View Full Code Here

0 1 2 3 4 5

TOP

All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.