Package com.o19s

Source Code of com.o19s.RegexPathHierarchyTokenizerTest

package com.o19s;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotSame;

import java.io.IOException;
import java.io.OutputStream;
import java.io.Reader;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.path.PathHierarchyTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.junit.Test;

public class RegexPathHierarchyTokenizerTest {

  //replace $n
  //replace other
  //replace = ""
  //replace unspecified
  //pattern is complex
  //pattern = ""
  //ends with - or space
  //add
 

  @Test
  public void testBasic() throws IOException {
    Tokenizer t = new RegexPathHierarchyTokenizer(new StringReader("14-32.43-25") , "[-.]");
    /*
     * Should become
     *
     * 14
     * 14-32
     * 14-32.43
     * 14-32.43-25
     *  2  5  8  1
     */
    compareTokens(t,new String[]{"14","14-32","14-32.43","14-32.43-25"}, new int[]{1,0,0,0},new int[]{0,0,0,0},new int[]{2,5,8,11});
    t.end();
    t.close();
    t.setReader(new StringReader("14-32.43-25"));

    compareTokens(t,new String[]{"14","14-32","14-32.43","14-32.43-25"}, new int[]{1,0,0,0},new int[]{0,0,0,0},new int[]{2,5,8,11});

  }
 
 
  @Test
  public void testPrefixSize1() throws IOException {
    TokenStream t = new RegexPathHierarchyTokenizer(new StringReader("14-32.43-25"), "[-.]", 1);
    /*
     * Should become
     *
     * 014
     * 114-32
     * 214-32.43
     * 314-32.43-25
     *  2  5  8  1
     */
    compareTokens(t,new String[]{"014","114-32","214-32.43","314-32.43-25"}, new int[]{1,0,0,0},new int[]{0,0,0,0},new int[]{2,5,8,11});
    t.close();
  }
 

  @Test
  public void testHardRegex() throws IOException {
    TokenStream t = new RegexPathHierarchyTokenizer(new StringReader("abcbddbadccabd"), "(\\w)\\1"); //splits on double letters
    /*
     * Should become
     *
     * abcb
     * abcbddbad
     * abcbddbadccabd
     *    4    9    4
     */
    compareTokens(t,new String[]{"abcb","abcbddbad","abcbddbadccabd"}, new int[]{1,0,0},new int[]{0,0,0},new int[]{4,9,14});
    t.close();
  }
 
 
  public void compareTokens(TokenStream tokenStream, String[] desiredTokens) throws IOException {
    compareTokens(tokenStream, desiredTokens, null, null, null);
  }
 
  public void compareTokens(TokenStream tokenStream, String[] desiredTokens,
      int[] desiredPosIncr, int[] desiredStartOffset, int[] desiredEndOffset) throws IOException {
    tokenStream.reset();
    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
    PositionIncrementAttribute posAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
   
    int i = 0;
    while (tokenStream.incrementToken()) {
      assertNotSame("Emitted more tokens than expected.",desiredTokens.length,i);
      int startOffset = offsetAtt.startOffset();
      int endOffset = offsetAtt.endOffset();
      int posIncr = posAtt.getPositionIncrement();
      String term = termAtt.toString();
      if(desiredTokens != null) {
        assertEquals("Wrong token.",desiredTokens[i],term);
      }
      if(desiredStartOffset != null) {
        assertEquals("Wrong start offset.",desiredStartOffset[i],startOffset);
      }
      if(desiredEndOffset != null) {
        assertEquals("Wrong end offset.",desiredEndOffset[i],endOffset);
      }
      if(desiredPosIncr != null) {
        assertEquals("Wrong position increment.",desiredPosIncr[i],posIncr);
      }
      i++;
    }
    assertEquals("Didn't emit all desired tokens.",desiredTokens.length,i);
   
  }
 
 
}
TOP

Related Classes of com.o19s.RegexPathHierarchyTokenizerTest

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and owned by ORACLE Inc. Contact coftware#gmail.com.