Package com.aliasi.tokenizer

Examples of com.aliasi.tokenizer.Tokenizer


  
   public String getPOS(String sentence, boolean allTags)
   {
    StringBuffer xmlOutput =  new StringBuffer();
    char[] cs = sentence.toCharArray();
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(cs, 0, cs.length);
    String[] tokens = tokenizer.tokenize();
    String[] tags = decoder.firstBest(tokens); int len = tokens.length;
    for (int i = 0; i < len; i++)
    {
     //*-- set the adjective tags
     if (tags[i].startsWith("j") || tags[i].equals("cd") || tags[i].endsWith("od") )
View Full Code Here


   public void buildSentences(String in)
   {
    //*-- extract the sentence boundaries
    if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
    ArrayList<Token> tokenList = new ArrayList<Token>(); ArrayList<Token> whiteList = new ArrayList<Token>();
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(in.toCharArray(), 0, in.length() );
    tokenizer.tokenize(tokenList, whiteList);
    tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
    whites = new String[whiteList.size()]; whiteList.toArray(whites);

    sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites);  
    int numPossibleSentences = sentenceBoundaries.length;
View Full Code Here

    */
   public String[] tokenizer(String in)
   {  
    if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
    ArrayList<Token> tokenList = new ArrayList<Token>(); ArrayList<Token> whiteList = new ArrayList<Token>();
    Tokenizer tokenizer = new StandardBgramTokenizerFactory().tokenizer(in.toCharArray(), 0, in.length() );
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
    return(tokens);
   }
View Full Code Here

    StringBuffer normalizeQuery(CharSequence cSeq) {
  StringBuffer sb = new StringBuffer();
  sb.append(' ');
  if (mTokenizerFactory != null) {
      char[] cs = Strings.toCharArray(cSeq);
      Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
      String nextToken;
      while ((nextToken = tokenizer.nextToken()) != null) {
    mTokenCounter.increment(nextToken);
    sb.append(nextToken);
    sb.append(' ');
      }
  } else {
View Full Code Here

  }

  private void tokenize() {
    tokenList.clear();
    whiteList.clear();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
//    System.out.println(tokenList.size() + " TOKENS");
//    System.out.println(whiteList.size() + " WHITESPACES");
  }
View Full Code Here

  }

  private void tokenize() {
    tokenList.clear();
    whiteList.clear();
    Tokenizer tokenizer = tokenizerFactory.tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
//    System.out.println(tokenList.size() + " TOKENS");
//    System.out.println(whiteList.size() + " WHITESPACES");
  }
View Full Code Here

  /** {@inheritDoc} */
  public List<String> getTokens(String text) {

    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);

    return tokenList;

  }
View Full Code Here

   public AnnotatedString getSentence(int offsetStart, int offsetEnd, String text) {

    //1.) Tokenization
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);


    //2.) Sentence detection
    String[] tokens = new String[tokenList.size()];
    String[] whites = new String[whiteList.size()];
View Full Code Here

    //1.) Tokenization
    long start = System.currentTimeMillis();
    List<String> tokenList = new ArrayList<String>();
    List<String> whiteList = new ArrayList<String>();
    Tokenizer tokenizer = lingPipeFactory.getTokenizerFactoryInstance().tokenizer(text.toCharArray(),
        0, text.length());
    tokenizer.tokenize(tokenList, whiteList);
    LOG.trace("Tokenization took " + (System.currentTimeMillis() - start) + "ms.");


    //2.) Sentence detection
    start = System.currentTimeMillis();
View Full Code Here

TOP

Related Classes of com.aliasi.tokenizer.Tokenizer

Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.