Examples of com.aliasi.tokenizer.Tokenizer

com.aliasi.tokenizer.Tokenizer

   
   public String getPOS(String sentence, boolean allTags)
   {
    StringBuffer xmlOutput =  new StringBuffer();
    char[] cs = sentence.toCharArray();
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(cs, 0, cs.length);
    String[] tokens = tokenizer.tokenize();
    String[] tags = decoder.firstBest(tokens); int len = tokens.length;
    for (int i = 0; i < len; i++)
    { 
     //*-- set the adjective tags
     if (tags[i].startsWith("j") || tags[i].equals("cd") || tags[i].endsWith("od") )

View Full Code Here

   public void buildSentences(String in)
   {
    //*-- extract the sentence boundaries
    if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
    ArrayList<Token> tokenList = new ArrayList<Token>(); ArrayList<Token> whiteList = new ArrayList<Token>();
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(in.toCharArray(), 0, in.length() );
    tokenizer.tokenize(tokenList, whiteList);
    tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
    whites = new String[whiteList.size()]; whiteList.toArray(whites);


    sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites);   
    int numPossibleSentences = sentenceBoundaries.length;

View Full Code Here

    */
   public String[] tokenizer(String in)
   {   
    if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
    ArrayList<Token> tokenList = new ArrayList<Token>(); ArrayList<Token> whiteList = new ArrayList<Token>();
    Tokenizer tokenizer = new StandardBgramTokenizerFactory().tokenizer(in.toCharArray(), 0, in.length() );
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
    return(tokens);
   }

View Full Code Here

    StringBuffer normalizeQuery(CharSequence cSeq) {
  StringBuffer sb = new StringBuffer();
  sb.append(' ');
  if (mTokenizerFactory != null) {
      char[] cs = Strings.toCharArray(cSeq);
      Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
      String nextToken;
      while ((nextToken = tokenizer.nextToken()) != null) {
    mTokenCounter.increment(nextToken);
    sb.append(nextToken);
    sb.append(' '); 
      }
  } else {

View Full Code Here

  }


  private void tokenize() {
    tokenList.clear();
    whiteList.clear();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
//    System.out.println(tokenList.size() + " TOKENS");
//    System.out.println(whiteList.size() + " WHITESPACES");
  }

View Full Code Here

  }


  private void tokenize() {
    tokenList.clear();
    whiteList.clear();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
//    System.out.println(tokenList.size() + " TOKENS");
//    System.out.println(whiteList.size() + " WHITESPACES");
  }

View Full Code Here

  /** {@inheritDoc} */
  public List<String> getTokens(String text) {


    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);


    return tokenList;


  }

View Full Code Here

   public AnnotatedString getSentence(int offsetStart, int offsetEnd, String text) {


    //1.) Tokenization
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);




    //2.) Sentence detection
    String[] tokens = new String[tokenList.size()];
    String[] whites = new String[whiteList.size()];

View Full Code Here


    //1.) Tokenization
    long start = System.currentTimeMillis();
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    LOG.trace("Tokenization took " + (System.currentTimeMillis() - start) + "ms.");




    //2.) Sentence detection
    start = System.currentTimeMillis();

View Full Code Here

TOP

Related Classes of com.aliasi.tokenizer.Tokenizer

edu.pitt.dbmi.nlp.noble.uima.SentenceFinderLingPipe

edu.pitt.dbmi.nobletools.uima.SentenceFinderLingPipe

org.dbpedia.spotlight.tagging.lingpipe.LingPipeTaggedTokenProvider

org.dbpedia.spotlight.tagging.lingpipe.LingPipeTextUtil

org.sf.mustru.utils.LingpipeTools

org.sf.mustru.utils.TrainSpellChecker

All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.