Examples of OffsetAttribute

org.apache.lucene.analysis.tokenattributes.OffsetAttribute
The start and end character offsets of a Token.

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

      int numOverlapTokens = 0;
      int pos = -1;
      
      TermAttribute termAtt = stream.addAttribute(TermAttribute.class);
      PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
      
      stream.reset();
      while (stream.incrementToken()) {
        String term = termAtt.term();
        if (term.length() == 0) continue; // nothing to do
//        if (DEBUG) System.err.println("token='" + term + "'");
        numTokens++;
        final int posIncr = posIncrAttribute.getPositionIncrement();
        if (posIncr == 0)
          numOverlapTokens++;
        pos += posIncr;
        
        ArrayIntList positions = terms.get(term);
        if (positions == null) { // term not seen before
          positions = new ArrayIntList(stride);
          terms.put(term, positions);
        }
        if (stride == 1) {
          positions.add(pos);
        } else {
          positions.add(pos, offsetAtt.startOffset(), offsetAtt.endOffset());
        }
      }
      stream.end();


      // ensure infos.numTokens > 0 invariant; needed for correct operation of terms()

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

  }


  // LUCENE-1441
  public void testOffsets() throws Exception {
    TokenStream stream = new KeywordAnalyzer().tokenStream("field", new StringReader("abcd"));
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    assertTrue(stream.incrementToken());
    assertEquals(0, offsetAtt.startOffset());
    assertEquals(4, offsetAtt.endOffset());
  }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

  {
    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
    StringBuilder newText=new StringBuilder();
    
      TermAttribute termAtt = tokenStream.addAttribute(TermAttribute.class);
      OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
      tokenStream.addAttribute(PositionIncrementAttribute.class);
      tokenStream.reset();
      
    TextFragment currentFrag =  new TextFragment(newText,newText.length(), docFrags.size());
    TokenStream newStream = fragmentScorer.init(tokenStream);
    if(newStream != null) {
      tokenStream = newStream;
    }
    fragmentScorer.startFragment(currentFrag);
    docFrags.add(currentFrag);


    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);


    try
    {


      String tokenText;
      int startOffset;
      int endOffset;
      int lastEndOffset = 0;
      textFragmenter.start(text, tokenStream);


      TokenGroup tokenGroup=new TokenGroup(tokenStream);


      for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
            next = tokenStream.incrementToken())
      {
        if(  (offsetAtt.endOffset()>text.length())
          ||
          (offsetAtt.startOffset()>text.length())
          )            
        {
          throw new InvalidTokenOffsetsException("Token "+ termAtt.term()
              +" exceeds length of provided text sized "+text.length());
        }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new NGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int start = 0; start < codePointCount; ++start) {
      for (int end = start + minGram; end <= Math.min(codePointCount, start + maxGram); ++end) {
        assertTrue(tk.incrementToken());
        assertEquals(0, offsetAtt.startOffset());
        assertEquals(s.length(), offsetAtt.endOffset());
        final int startIndex = Character.offsetByCodePoints(s, 0, start);
        final int endIndex = Character.offsetByCodePoints(s, 0, end);
        assertEquals(s.substring(startIndex, endIndex), termAtt.toString());
      }
    }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

      }
    };
    final CharTermAttribute termAtt = grams.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAtt = grams.addAttribute(PositionIncrementAttribute.class);
    final PositionLengthAttribute posLenAtt = grams.addAttribute(PositionLengthAttribute.class);
    final OffsetAttribute offsetAtt = grams.addAttribute(OffsetAttribute.class);
    grams.reset();
    for (int start = 0; start < codePoints.length; ++start) {
      nextGram:
      for (int end = start + minGram; end <= start + maxGram && end <= codePoints.length; ++end) {
        if (edgesOnly && start > 0 && isTokenChar(nonTokenChars, codePoints[start - 1])) {
          // not on an edge
          continue nextGram;
        }
        for (int j = start; j < end; ++j) {
          if (!isTokenChar(nonTokenChars, codePoints[j])) {
            continue nextGram;
          }
        }
        assertTrue(grams.incrementToken());
        assertArrayEquals(Arrays.copyOfRange(codePoints, start, end), toCodePoints(termAtt));
        assertEquals(1, posIncAtt.getPositionIncrement());
        assertEquals(1, posLenAtt.getPositionLength());
        assertEquals(offsets[start], offsetAtt.startOffset());
        assertEquals(offsets[end], offsetAtt.endOffset());
      }
    }
    assertFalse(grams.incrementToken());
    grams.end();
    assertEquals(s.length(), offsetAtt.startOffset());
    assertEquals(s.length(), offsetAtt.endOffset());
  }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

    for (int i = 0; i < num; i++) {
      String s = _TestUtil.randomUnicodeString(random());
      TokenStream ts = analyzer.tokenStream("foo", s);
      try {
        ts.reset();
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

    for (int i = 0; i < num; i++) {
      String s = _TestUtil.randomUnicodeString(random());
      TokenStream ts = analyzer.tokenStream("foo", s);
      try {
        ts.reset();
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        while (ts.incrementToken()) {
          String highlightedText = s.substring(offsetAtt.startOffset(), offsetAtt.endOffset());
          for (int j = 0, cp = 0; j < highlightedText.length(); j += Character.charCount(cp)) {
            cp = highlightedText.codePointAt(j);
            assertTrue("non-letter:" + Integer.toHexString(cp), Character.isLetter(cp));
          }
        }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

      return start + 31 * end;
    }


    @Override
    public void copyTo(AttributeImpl target) {
      OffsetAttribute t = (OffsetAttribute) target;
      t.setOffset(start, end);
    }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

    final int minGram = _TestUtil.nextInt(random(), 1, 3);
    final int maxGram = _TestUtil.nextInt(random(), minGram, 10);
    TokenStream tk = new KeywordTokenizer(new StringReader(s));
    tk = new EdgeNGramTokenFilter(TEST_VERSION_CURRENT, tk, minGram, maxGram);
    final CharTermAttribute termAtt = tk.addAttribute(CharTermAttribute.class);
    final OffsetAttribute offsetAtt = tk.addAttribute(OffsetAttribute.class);
    tk.reset();
    for (int i = minGram; i <= Math.min(codePointCount, maxGram); ++i) {
      assertTrue(tk.incrementToken());
      assertEquals(0, offsetAtt.startOffset());
      assertEquals(s.length(), offsetAtt.endOffset());
      final int end = Character.offsetByCodePoints(s, 0, i);
      assertEquals(s.substring(0, end), termAtt.toString());
    }
    assertFalse(tk.incrementToken());
  }

View Full Code Here

Examples of org.apache.lucene.analysis.tokenattributes.OffsetAttribute

  @Test
  public void baseUIMAAnalyzerStreamTest() {
    try {
      TokenStream ts = analyzer.tokenStream("text", new StringReader("the big brown fox jumped on the wood"));
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
      while (ts.incrementToken()) {
        assertNotNull(offsetAtt);
        assertNotNull(termAtt);
        System.out.println("token '" + termAtt.toString() + "' has offset " + offsetAtt.startOffset() + "," + offsetAtt.endOffset());
      }
    } catch (Exception e) {
      e.printStackTrace();
      fail(e.getLocalizedMessage());
    }

View Full Code Here

0 1 2 3 4 5

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.