Examples of TokenizerME

opennlp.tools.tokenize.TokenizerME
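A Tokenizer for converting raw text into separated tokens. It uses Maximum Entropy to make its decisions. The features are loosely based off of Jeff Reynar's UPenn thesis "Topic Segmentation: Algorithms and Applications.", which is available from his homepage: <http://www.cis.upenn.edu/~jcreynar>.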
This tokenizer needs a statistical model to tokenize a text which reproduces the tokenization observed in the training data used to create the model. The {@link TokenizerModel} class encapsulates the model and provides methods to create it from the binary representation.
A tokenizer instance is not thread-safe. For each thread one tokenizer must be instantiated; these instances can share one TokenizerModel instance to save memory.
To train a new model, the {@link #train(String,ObjectStream,boolean,TrainingParameters)} method can be used.
Sample usage:
InputStream modelIn; ... TokenizerModel model = new TokenizerModel(modelIn); Tokenizer tokenizer = new TokenizerME(model); String tokens[] = tokenizer.tokenize("A sentence to be tokenized."); @see Tokenizer @see TokenizerModel @see TokenSample

Examples of opennlp.tools.tokenize.TokenizerME

        Tokenizer tokenizer = null;
        if(language != null){
            try {
                TokenizerModel model = getTokenizerModel(language);
                if(model != null){
                    tokenizer = new TokenizerME(getTokenizerModel(language));
                }
            } catch (InvalidFormatException e) {
                log.warn("Unable to load Tokenizer Model for "+language+": " +
                    "Will use Simple Tokenizer instead",e);
            } catch (IOException e) {

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

            if(model == null){
                throw new EngineException("The configured OpenNLP TokenizerModel '"
                        + modelName +" is not available' ("+getClass().getSimpleName()
                        + " | name=" + getName() + ")!");
            }
            return new TokenizerME(model);
        }
    }

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

        getDefaultPosTagger();
        getDefaultChunker();
    }


    public static Tokenizer getDefaultTokenizer() throws IOException {
        return new TokenizerME(new TokenizerModel(
                getResourceAsStream(tokenizerModelFile)));
    }

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

  protected List<SurfaceFormOccurrence> extractNPNGrams(Text text) {
        String intext = text.text();
    //System.out.println("\n\nRR- nextractNPNGrams(...) method called! with text: " + intext + "\n\n");
    List<SurfaceFormOccurrence> npNgramSFLst = new ArrayList<SurfaceFormOccurrence>();
    SentenceDetectorME  sentenceDetector = new SentenceDetectorME((SentenceModel)sentenceModel);
    TokenizerME tokenizer = new TokenizerME((TokenizerModel)tokenModel);
    POSTaggerME posTagger = new POSTaggerME((POSModel)posModel);
    ChunkerME chunker = new ChunkerME((ChunkerModel)chunkModel);


    Span[] sentSpans = sentenceDetector.sentPosDetect(intext);
    for (Span sentSpan : sentSpans) {
      String sentence = sentSpan.getCoveredText(intext).toString();
      int start = sentSpan.getStart();
      Span[] tokSpans = tokenizer.tokenizePos(sentence);
      String[] tokens = new String[tokSpans.length];
      // System.out.println("\n\nTokens:");
      for (int i = 0; i < tokens.length; i++) {
        tokens[i] = tokSpans[i].getCoveredText(sentence).toString();
        // System.out.println(tokens[i]);

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

     * 
     * @throws IOException 
     */
    public ApacheExtractor() throws IOException {
        nameFinder = new NameFinderME(new TokenNameFinderModel(ApacheExtractor.class.getResourceAsStream(pathToNERModel)));
        tokenizer = new TokenizerME(new TokenizerModel(ApacheExtractor.class.getResourceAsStream(pathToTokenizerModel)));
        sentenceDetector = new SentenceDetectorME(new SentenceModel(ApacheExtractor.class.getResourceAsStream(pathToSentenceDetectorModel)));
    }

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

    return getTokensWithTokenReadings(tokenReadings, tokens, chunkTags);
  }


  // non-private for test cases
  String[] tokenize(String sentence) {
    TokenizerME tokenizer = new TokenizerME(tokenModel);
    String cleanString = sentence.replace('’', '\'');  // this is the type of apostrophe that OpenNLP expects
    return tokenizer.tokenize(cleanString);
  }

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

            if(model == null){
                throw new EngineException("The configured OpenNLP TokenizerModel '"
                        + modelName +" is not available' ("+getClass().getSimpleName()
                        + " | name=" + getName() + ")!");
            }
            return new TokenizerME(model);
        }
    }

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

  public ObjectStream<NameSample> create(String[] args) {


    Parameters params = ArgumentParser.parse(args, Parameters.class);


    TokenizerModel tokenizerModel = new TokenizerModelLoader().load(params.getTokenizerModel());
    Tokenizer tokenizer = new TokenizerME(tokenizerModel);


    ObjectStream<String> mucDocStream = new FileToStringSampleStream(
        new DirectorySampleStream(params.getData(), new FileFilter() {


          public boolean accept(File file) {

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME


    Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;


    if (params.getTokenizerModel() != null) {
      try {
        tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
      } catch (IOException e) {
        throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
      }
    }
    else if (params.getRuleBasedTokenizer() != null) {

View Full Code Here

Examples of opennlp.tools.tokenize.TokenizerME

      ObjectStream<BratDocument> samples) {
    super(samples);


    // TODO: We can pass in custom validators here ...
    this.sentDetector = new SentenceDetectorME(sentModel);
    this.tokenizer = new TokenizerME(tokenModel);
  }

View Full Code Here

0 1 2

TOP

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.