Package org.sf.mustru.utils

Source Code of org.sf.mustru.utils.StringLenComparator

package org.sf.mustru.utils;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.regex.Matcher;

import org.apache.log4j.Logger;
import org.apache.lucene.analysis.Token;

import org.sf.mustru.search.SearchTools;
import com.aliasi.hmm.HiddenMarkovModel;
import com.aliasi.hmm.HmmDecoder;
import com.aliasi.sentences.IndoEuropeanSentenceModel;
import com.aliasi.sentences.SentenceModel;
import com.aliasi.tokenizer.IndoEuropeanTokenizerFactory;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

/**
* Tools from Lingpipe <br>
*
* 1. Entity extraction: Return a list of named entities from passed text <br>
* 2. POS Tagging: Return a list of parts of speech for a text string <br>
* 3. Sentence extraction: Return a list of sentences from a text chunk <br>
*/

public class LingpipeTools extends EntityTools
  {
   private static final TokenizerFactory TOKENIZER_FACTORY = new IndoEuropeanTokenizerFactory()//*-- tokenizer to extract tokens and whitespace
   private static final SentenceModel SENTENCE_MODEL  = new IndoEuropeanSentenceModel();    //*-- sentence model for text
   private HmmDecoder decoder = null;
   private boolean qEntities = true;                  //*-- pattern to collapse consecutive entity types
   private static HashMap<String, String> phash = null;
   static Logger logger = Logger.getLogger(LingpipeTools.class.getName() );
  
   /**
    * Class to run lingpipe  tagger and NE extractor
    */
   public LingpipeTools() {  }

   /**
    *  Setup lingpipe for POS extraction, read the tagger file specified Constants
    */
   public boolean setforPOS()
   { return (setforPOS(null)); }

   public boolean setforPOS(String[] additionalEntities)
   {
    logger.info("Reading POS tagger model from " + Constants.POS_TAGGER_MODEL);
    ObjectInputStream oi = null;
    try
    { oi = new ObjectInputStream( new FileInputStream(Constants.POS_TAGGER_MODEL) );
    HiddenMarkovModel hmm = (HiddenMarkovModel) oi.readObject();
    decoder = new HmmDecoder(hmm);
    setTagPosXref();
    }
    catch (IOException ie ) { logger.error("setforPOS IO Error : could not read " + ie.getMessage() ); }
    catch (ClassNotFoundException ce) { logger.error("setforPOS Class Error : " + ce.getMessage() ); }
    finally { if (oi != null) { try { oi.close(); } catch (IOException ie) { } } }
    if ( (additionalEntities != null) && (additionalEntities.length > 0) ) qEntities = true;
    return true;
   }

   /**
    * Generate an annotated sentence with a select group of parts of speech and return
    * @param sentence to be annotated with parts of speech
    * @return annotated sentence
    */
   public String getPOS(String sentence)
   { return getPOS(sentence, false); }
  
   public String getPOS(String sentence, boolean allTags)
   {
    StringBuffer xmlOutput =  new StringBuffer();
    char[] cs = sentence.toCharArray();
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(cs, 0, cs.length);
    String[] tokens = tokenizer.tokenize();
    String[] tags = decoder.firstBest(tokens); int len = tokens.length;
    for (int i = 0; i < len; i++)
    {
     //*-- set the adjective tags
     if (tags[i].startsWith("j") || tags[i].equals("cd") || tags[i].endsWith("od") )
     { xmlOutput.append(" <Adjective> "); xmlOutput.append(tokens[i]); xmlOutput.append(" </Adjective>"); }
     //*-- next, the noun tags
     else if ( tags[i].startsWith("n") )
     { xmlOutput.append(" <Noun> "); xmlOutput.append(tokens[i]); xmlOutput.append(" </Noun>");
     //*-- finally, the verb tags, skipping auxiliary verbs
     else if ( tags[i].startsWith("v") )
     { xmlOutput.append(" <Verb> "); xmlOutput.append(tokens[i]); xmlOutput.append(" </Verb>"); }  
     //*-- skip, all other tags
     else if (allTags)
     {  String tag = phash.get(tags[i]); if (tag == null) tag = tags[i];
        xmlOutput.append("<" + tag + "> "); xmlOutput.append(tokens[i]); xmlOutput.append("</" + tag + "> "); }
     else
     { xmlOutput.append(" "); xmlOutput.append(tokens[i]); }
    }

    String out = xmlOutput.toString();
    if (!qEntities) return out;
    Matcher matcher = SearchTools.qwordPattern.matcher(out);
    if (matcher.matches())
    { out = matcher.replaceFirst(matcher.group(1) + "<Qword>" + matcher.group(2) + "</Qword>" + matcher.group(3) ); }
    return(out);
   }

   /**
    * Build the list of tokens, white spaces, and sentence boundaries for the paragraph passed
    * @param in paragraph
    */
   public void buildSentences(String in)
   {
    //*-- extract the sentence boundaries
    if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
    ArrayList<Token> tokenList = new ArrayList<Token>(); ArrayList<Token> whiteList = new ArrayList<Token>();
    Tokenizer tokenizer = TOKENIZER_FACTORY.tokenizer(in.toCharArray(), 0, in.length() );
    tokenizer.tokenize(tokenList, whiteList);
    tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
    whites = new String[whiteList.size()]; whiteList.toArray(whites);

    sentenceBoundaries = SENTENCE_MODEL.boundaryIndices(tokens, whites);  
    int numPossibleSentences = sentenceBoundaries.length;

    //*-- set a default sentence boundary if no sentence boundaries were found
    if (numPossibleSentences < 1) { sentenceBoundaries = new int[1]; sentenceBoundaries[0] = tokens.length - 1; }
    currentSentenceBoundary = 0; firstTime = true;  
   }

   /**
    * Standard Bgram Tokenizer
    */
   public String[] tokenizer(String in)
   {  
    if (in.length() > Constants.DOC_LENGTH_MAXLIMIT) in = in.substring(0, Constants.DOC_LENGTH_MAXLIMIT - 1);
    ArrayList<Token> tokenList = new ArrayList<Token>(); ArrayList<Token> whiteList = new ArrayList<Token>();
    Tokenizer tokenizer = new StandardBgramTokenizerFactory().tokenizer(in.toCharArray(), 0, in.length() );
    tokenizer.tokenize(tokenList, whiteList);
    String[] tokens = new String[tokenList.size()]; tokenList.toArray(tokens);
    return(tokens);
   }
  

   /**
    * Return a hash map cross reference of Brown tags to part of speech
    */
   public static void setTagPosXref()
   {
    phash = new HashMap<String, String>()
    phash.put ( "abl", "determiner/pronoun" );
    phash.put ( "abn", "determiner/pronoun" );
    phash.put ( "abx", "determiner/pronoun" );
    phash.put ( "ap", "determiner/pronoun" );
    phash.put ( "ap$", "determiner/pronoun" );
    phash.put ( "be", "verb" );
    phash.put ( "bed", "verb" );
    phash.put ( "bedz", "verb" );
    phash.put ( "beg", "verb" );
    phash.put ( "bem", "verb" );
    phash.put ( "ben", "verb" );
    phash.put ( "ber", "verb" );
    phash.put ( "bez", "verb" );
    phash.put ( "cc", "conjunction" );
    phash.put ( "cd", "numeral" );
    phash.put ( "cd$", "numeral" );
    phash.put ( "cs", "conjunction" );
    phash.put ( "do", "verb" );
    phash.put ( "dod", "verb" );
    phash.put ( "doz", "verb" );
    phash.put ( "dt", "determiner/pronoun" );
    phash.put ( "dt$", "determiner/pronoun" );
    phash.put ( "dt+bez", "determiner/pronoun" );
    phash.put ( "dt+md", "determiner/pronoun" );
    phash.put ( "dti", "determiner/pronoun" );
    phash.put ( "dts", "determiner/pronoun" );
    phash.put ( "dtx", "determiner" );
    phash.put ( "ex", "existential there" );
    phash.put ( "hv", "verb" );
    phash.put ( "hv+to", "verb" );
    phash.put ( "hvd", "verb" );
    phash.put ( "hvg", "verb" );
    phash.put ( "hvn", "verb" );
    phash.put ( "hvz", "verb" );
    phash.put ( "in", "preposition" );
    phash.put ( "jj", "adjective" );
    phash.put ( "jj$", "adjective" );
    phash.put ( "jjr", "adjective" );
    phash.put ( "jjs", "adjective" );
    phash.put ( "jjt", "adjective" );
    phash.put ( "md", "modal auxiliary" );
    phash.put ( "nn", "noun" );
    phash.put ( "nn$", "noun" );
    phash.put ( "nn+bez", "noun" );
    phash.put ( "nn+hvz", "noun" );
    phash.put ( "nns", "noun" );
    phash.put ( "nns$", "noun" );
    phash.put ( "np", "noun" );
    phash.put ( "np$", "noun" );
    phash.put ( "np+bez", "noun" );
    phash.put ( "nps", "noun" );
    phash.put ( "nps$", "noun" );
    phash.put ( "nr", "noun" );
    phash.put ( "nr$", "noun" );
    phash.put ( "nrs", "noun" );
    phash.put ( "od", "numeral" );
    phash.put ( "pn", "pronoun" );
    phash.put ( "pn$", "pronoun" );
    phash.put ( "pp$", "determiner" );
    phash.put ( "pp$$", "pronoun" );
    phash.put ( "ppl", "pronoun" );
    phash.put ( "ppls", "pronoun" );
    phash.put ( "ppo", "pronoun" );
    phash.put ( "pps", "pronoun" );
    phash.put ( "pps+bez", "pronoun" );
    phash.put ( "pps+hvd", "pronoun" );
    phash.put ( "pps+hvz", "pronoun" );
    phash.put ( "pps+md", "pronoun" );
    phash.put ( "ppss", "pronoun" );
    phash.put ( "ppss+bem", "pronoun" );
    phash.put ( "ppss+ber", "pronoun" );
    phash.put ( "ppss+hv", "pronoun" );
    phash.put ( "ppss+hvd", "pronoun" );
    phash.put ( "ppss+md", "pronoun" );
    phash.put ( "ql", "qualifier" );
    phash.put ( "qlp", "qualifier" );
    phash.put ( "rb", "adverb" );
    phash.put ( "rb+bez", "adverb+verb" );
    phash.put ( "rbr", "adverb" );
    phash.put ( "rbt", "adverb" );
    phash.put ( "rn", "adverb" );
    phash.put ( "rp", "adverb" );
    phash.put ( "to", "infinitival to" );
    phash.put ( "uh", "interjection" );
    phash.put ( "vb", "verb" );
    phash.put ( "vbd", "verb" );
    phash.put ( "vbg", "verb" );
    phash.put ( "vbn", "verb" );
    phash.put ( "vbz", "verb" );
    phash.put ( "wdt", "determiner" );
    phash.put ( "wdt+bez", "determiner+verb" );
    phash.put ( "wp$", "pronoun" );
    phash.put ( "wpo", "pronoun" );
    phash.put ( "wps", "pronoun" );
    phash.put ( "wps+bez", "pronoun" );
    phash.put ( "wps+hvd", "pronoun" );
    phash.put ( "wps+hvz", "pronoun" );
    phash.put ( "wps+md", "pronoun" );
    phash.put ( "wql", "qualifier" );
    phash.put ( "wrb", "adverb" );
    phash.put ( "wrb+ber", "adverb + verb" );   
    }
  
   public void releaseResources()
   { decoder = null; }
  }

/**
 * Comparator that orders strings by their lengths, longest string first.
 * Strings of the same length compare as equal. A null string sorts after every
 * non-null string and two nulls compare as equal, which keeps the ordering
 * consistent when a collection contains nulls.
 */
class StringLenComparator implements Comparator<String>
{
  /**
   * Compare two strings by length in descending order.
   * @param o1 first string, may be null
   * @param o2 second string, may be null
   * @return a negative value if o1 sorts before o2 (o1 is longer, or only o2 is null),
   *         a positive value if o1 sorts after o2, and 0 if the lengths are the same
   *         or both strings are null
   */
  public int compare (String o1, String o2)
  {
    //*-- nulls go last; treating a null as equal to every string breaks transitivity
    if (o1 == null) return( (o2 == null) ? 0 : 1 );
    if (o2 == null) return(-1);

    int len1 = o1.length();
    int len2 = o2.length();
    if (len1 == len2) return(0);
    return( (len1 > len2) ? -1 : 1 );
  }
}
TOP

Related Classes of org.sf.mustru.utils.StringLenComparator

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.