Examples of opennlp.tools.util.Span

opennlp.tools.util.Span
Class for storing start and end integer offsets.


  @Test
  public void testRetrievingContent() {
    
    SentenceSample sample = new SentenceSample("1. 2.", 
        new Span(0, 2), new Span(3, 5));
    
    assertEquals("1. 2.", sample.getDocument());
    assertEquals(new Span(0, 2), sample.getSentences()[0]);
    assertEquals(new Span(3, 5), sample.getSentences()[1]);
  }

View Full Code Here

    assertFalse(createPredSample().equals(createGoldSample()));
    assertFalse(createPredSample().equals(new Object()));
  }
  
  public static SentenceSample createGoldSample() {
    return new SentenceSample("1. 2.", new Span(0, 2), new Span(3, 5));
  }

View Full Code Here

  public static SentenceSample createGoldSample() {
    return new SentenceSample("1. 2.", new Span(0, 2), new Span(3, 5));
  }


  public static SentenceSample createPredSample() {
    return new SentenceSample("1. 2.", new Span(0, 1), new Span(2, 5));
  }

View Full Code Here


    Span[] preds = chunker.chunkAsSpans(toks1, tags1);
    System.out.println(Arrays.toString(preds));


    assertEquals(10, preds.length);
    assertEquals(new Span(0, 1, "NP"), preds[0]);
    assertEquals(new Span(1, 2, "VP"), preds[1]);
    assertEquals(new Span(2, 4, "NP"), preds[2]);
    assertEquals(new Span(4, 5, "VP"), preds[3]);
    assertEquals(new Span(5, 6, "SBAR"), preds[4]);
    assertEquals(new Span(6, 7, "NP"), preds[5]);
    assertEquals(new Span(7, 9, "VP"), preds[6]);
    assertEquals(new Span(9, 13, "NP"), preds[7]);
    assertEquals(new Span(13, 14, "PP"), preds[8]);
    assertEquals(new Span(14, 16, "NP"), preds[9]);


  }

View Full Code Here

    //gather up potential tokens
    int end = d.length();
    for (int i = 0; i < end; i++) {
      if (StringUtil.isWhitespace(d.charAt(i))) {
        if (inTok) {
          tokens.add(new Span(tokStart, i));
          inTok = false;
          tokStart = -1;
        }
      }
      else {
        if (!inTok) {
          tokStart = i;
          inTok = true;
        }
      }
    }
    
    if (inTok) {
      tokens.add(new Span(tokStart, end));
    }
    
    return tokens.toArray(new Span[tokens.size()]);
  }

View Full Code Here

        sentence.append(' ');
      }
      
      int beginIndex = sentence.length();
      sentence.append(tokens[i]);
      mergedTokenSpans.add(new Span(beginIndex, sentence.length()));
    }
    
    text = sentence.toString();
    tokenSpans = Collections.unmodifiableList(mergedTokenSpans);
  }

View Full Code Here

    
    int tokenSpanStart = sample.length();
    sample.append(token);
    int tokenSpanEnd = sample.length();
    
    tokenSpans.add(new Span(tokenSpanStart, tokenSpanEnd));
    
    if (!isNextMerged)
        sample.append(" ");
  }

View Full Code Here

  public static TokenSample parse(String sampleString, String separatorChars) {
    
    if (sampleString == null || separatorChars == null)
        throw new IllegalArgumentException("arguments must not be null!");
    
    Span whitespaceTokenSpans[] = WhitespaceTokenizer.INSTANCE.tokenizePos(sampleString);
    
    // Pre-allocate 20% for newly created tokens
    List<Span> realTokenSpans = new ArrayList<Span>((int) (whitespaceTokenSpans.length * 1.2d));
    
    StringBuilder untaggedSampleString = new StringBuilder();

View Full Code Here

    this.tokenizer = tokenizer;
  }


  @Override
  protected TokenSample processSample(TokenSample reference) {
    Span predictions[] = tokenizer.tokenizePos(reference.getText());
    fmeasure.updateScores(reference.getTokenSpans(), predictions);
    
    return new TokenSample(reference.getText(), predictions);
  }

View Full Code Here

  @Override
  protected Iterator<Event> createEvents(TokenSample tokenSample) {


    List<Event> events = new ArrayList<Event>(50);


    Span tokens[] = tokenSample.getTokenSpans();
    String text = tokenSample.getText();


    if (tokens.length > 0) {


      int start = tokens[0].getStart();
      int end = tokens[tokens.length - 1].getEnd();


      String sent = text.substring(start, end);


      Span[] candTokens = WhitespaceTokenizer.INSTANCE.tokenizePos(sent);


      int firstTrainingToken = -1;
      int lastTrainingToken = -1;
      for (int ci = 0; ci < candTokens.length; ci++) {
        Span cSpan = candTokens[ci];
        String ctok = sent.substring(cSpan.getStart(), cSpan.getEnd());
        //adjust cSpan to text offsets
        cSpan = new Span(cSpan.getStart() + start, cSpan.getEnd() + start);
        //should we skip this token
        if (ctok.length() > 1
          && (!skipAlphaNumerics || !alphaNumeric.matcher(ctok).matches())) {


          //find offsets of annotated tokens inside of candidate tokens
          boolean foundTrainingTokens = false;
          for (int ti = lastTrainingToken + 1; ti < tokens.length; ti++) {
            if (cSpan.contains(tokens[ti])) {
              if (!foundTrainingTokens) {
                firstTrainingToken = ti;
                foundTrainingTokens = true;
              }
              lastTrainingToken = ti;
            }
            else if (cSpan.getEnd() < tokens[ti].getEnd()) {
              break;
            }
            else if (tokens[ti].getEnd() < cSpan.getStart()) {
              //keep looking
            }
            else {
              if (logger.isLoggable(Level.WARNING)) {
                logger.warning("Bad training token: " + tokens[ti] + " cand: " + cSpan +
                    " token="+text.substring(tokens[ti].getStart(), tokens[ti].getEnd()));
              }
            }
          }


          // create training data
          if (foundTrainingTokens) {


            for (int ti = firstTrainingToken; ti <= lastTrainingToken; ti++) {
              Span tSpan = tokens[ti];
              int cStart = cSpan.getStart();
              for (int i = tSpan.getStart() + 1; i < tSpan.getEnd(); i++) {
                String[] context = cg.getContext(ctok, i - cStart);
                events.add(new Event(TokenizerME.NO_SPLIT, context));
              }


              if (tSpan.getEnd() != cSpan.getEnd()) {
                String[] context = cg.getContext(ctok, tSpan.getEnd() - cStart);
                events.add(new Event(TokenizerME.SPLIT, context));
              }
            }
          }
        }

View Full Code Here

0 1 2 3 4 5 6 7 8 9

TOP

Related Classes of opennlp.tools.util.Span

com.bericotech.clavin.extractor.ApacheExtractor

com.tamingtext.opennlp.NameFinderTest

net.sf.nlpshell.CorefParse

net.sf.nlpshell.Main

opennlp.tools.chunker.ChunkerMETest

opennlp.tools.chunker.ChunkSample

opennlp.tools.chunker.ChunkSampleTest

opennlp.tools.cmdline.EvaluationErrorPrinter

opennlp.tools.cmdline.namefind.TokenNameFinderTool

opennlp.tools.cmdline.parser.ParserTool

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.