Package com.aliasi.tag

Examples of com.aliasi.tag.Tagger


  public void initialize(String text) {

    taggedTokens = new ArrayList<TaggedToken>();

    //Load the POS model:
    Tagger posTagger = lingPipeFactory.getPoSTaggerInstance();

    //1.) Tokenization
    long start = System.currentTimeMillis();
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    LOG.trace("Tokenization took " + (System.currentTimeMillis() - start) + "ms.");


    //2.) Sentence detection
    start = System.currentTimeMillis();
    String[] tokens = new String[tokenList.size()];
    String[] whites = new String[whiteList.size()];
    tokenList.toArray(tokens);
    whiteList.toArray(whites);

    SentenceModel sentenceModel = lingPipeFactory.getSentenceModelInstance();
    sentenceBoundaries = sentenceModel.boundaryIndices(tokens, whites);
    LOG.trace("Sentence segmentation took " + (System.currentTimeMillis() - start) + "ms.");


    //3.) Part-of-Speech tagging
    start = System.currentTimeMillis();
    int sentStartToken = 0;
    int sentEndToken;
    int textOffset = whites[0].length();


    /**
     * Tag every sentence with final punctuation (i < sentenceBoundaries.length), if there is
     * text without final punctuation, treat the rest of the text as a single sentence.
     */

    for (int i = 0; (i < sentenceBoundaries.length || sentStartToken < tokens.length); ++i) {

      if (i < sentenceBoundaries.length) {
        //We are between two sentence-final punctuation tokens.

        sentEndToken = sentenceBoundaries[i];
      } else {
        //We are beyond the last sentence-final punctuation: Tag the rest of the text.

        sentEndToken = tokens.length - 1;
      }


      Tagging<String> tags = posTagger.tag(tokenList.subList(sentStartToken, sentEndToken + 1));
      for (int j = 0; j < tags.size(); j++) {
        TaggedToken taggedToken = new TaggedToken(tags.token(j), whiteList.get(sentStartToken + j + 1), tags.tag(j), textOffset, null);
        taggedTokens.add(taggedToken);
        textOffset += tokens[sentStartToken + j].length() + whites[sentStartToken + j + 1].length();
      }
View Full Code Here

TOP

Related Classes of com.aliasi.tag.Tagger

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.