// Package opennlp.ccg.parse.supertagger
//
// Source code of opennlp.ccg.parse.supertagger.WordAndPOSDictionaryLabellingStrategy

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2009 Dennis N. Mehay
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.ccg.parse.supertagger;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import opennlp.ccg.parse.supertagger.io.XMLWordDictionaryReader;
import opennlp.ccg.parse.supertagger.io.XMLPOSDictionaryReader;
import opennlp.ccg.parse.supertagger.ml.STFex;
import opennlp.ccg.parse.supertagger.ml.FeatureExtractor;
import opennlp.ccg.parse.tagger.ProbIndexPair;
import opennlp.ccg.parse.tagger.TaggedWord;
import opennlp.ccg.parse.tagger.Constants;
import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.*;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.ccg.lexicon.*;
import opennlp.ccg.parse.Supertagger;
import opennlp.ccg.parse.postagger.DummyPOSTagger;
import opennlp.ccg.parse.postagger.POSTagger;
import opennlp.ccg.parse.supertagger.ml.STPriorModel;
import opennlp.ccg.parse.tagger.ml.MaxentModel;
import opennlp.ccg.parse.supertagger.util.*;
import opennlp.ccg.parse.tagger.Constants.TaggingAlgorithm;
import opennlp.ccg.parse.tagger.ml.ZLMEM;
import opennlp.ccg.util.Pair;
import opennlp.ccg.parse.tagger.sequencescoring.SequenceScorer;
import opennlp.ccg.parse.tagger.util.ConfigFileProcessor;
import opennlp.ccg.parse.tagger.util.ResultSink;

/**
* A `labelling strategy' for a CCG supertagger that
* restricts the output of the model based on word and POS `tagging
* dictionaries' in the following way:
*
* if a word w occurs at least K times in training, the model's output
* is constrained to the outcomes seen with w during training.  If w
* did not occur at least K times during training, the model's output is
* constrained to the outcomes seen with w's POS tag during training.
* In the off chance that the POS tag was not seen in training, the model's
* best prediction is used.
*
* @author Dennis N. Mehay
* @version $Revision: 1.22 $, $Date: 2011/03/22 03:20:25 $
*/
public class WordAndPOSDictionaryLabellingStrategy implements LabellingStrategy, Supertagger {

    // print warnings?
    private boolean verbose = false;
    // use tagging dictionaries?
    private boolean useWordDict = false;
    private boolean usePOSDict = false;
    private SequenceScorer seqScorer = null;
    private STTaggerWordDictionary wd;
    private STTaggerPOSDictionary pd;
    private int K, usualK, finalK;   
    private MaxentModel mo;
    // extracts features from the context of a word.
    private FeatureExtractor fexer = new STFex();
   
    // postagger for non-gold-POS supertagging.
    private POSTagger posTagger;

    // the current tagging.
    private List<TaggedWord> tagging;
   
    // POS-specific multipliers to "tighten" or "loosen" up the tagging beam width
    // ("beta") as needed. E.g., the beta for period/full stop might not need to be
    // very permissive, while those for lexical verbs or some fancy punctuation marks
    // might need to be.
    public Map<String,Double> betaMultipliers = new HashMap<String,Double>();
    public double minMultiplier = 1.0;
           
    /** Constructor without n-gram model (for scoring tag sequences). */
    public WordAndPOSDictionaryLabellingStrategy(STTaggerWordDictionary wd, STTaggerPOSDictionary pd, int K, MaxentModel mo,
            FeatureExtractor fexer) {
        this(wd, pd, K, mo, fexer, null, null);
    }

   
    /** Constructor WITH n-gram model (for scoring tag sequences). */
    public WordAndPOSDictionaryLabellingStrategy(
            STTaggerWordDictionary wd,
            STTaggerPOSDictionary pd,
            int K,
            MaxentModel mo,
            FeatureExtractor fexer,
            String tagSequenceModel,
            Constants.TaggingAlgorithm alg) {
        this(wd, pd, K, mo, fexer, tagSequenceModel, alg, new DummyPOSTagger());
    }
    /** Constructor with n-gram model and POS tagger */
    public WordAndPOSDictionaryLabellingStrategy(
            STTaggerWordDictionary wd,
            STTaggerPOSDictionary pd,
            int K,
            MaxentModel mo,
            FeatureExtractor fexer,
            String tagSequenceModel,
            Constants.TaggingAlgorithm alg,
            POSTagger posTagger) {
        this.wd = wd;
        this.pd = pd;
        this.mo = mo;
        this.posTagger = posTagger;
        if (K > 0) {
            this.K = K;
        } else {
            this.K = 0;
        }
        usualK = K; finalK = K;
       
        this.fexer = fexer;    
       
        try {

            if (tagSequenceModel != null) {
                // find the n-gram order of the model.
                int ord = SequenceScorer.findOrder(tagSequenceModel);
                // load it into the SequenceScorer.
                seqScorer = new SequenceScorer(ord, tagSequenceModel);
                seqScorer.setSearchBeam(5);
                seqScorer.setAlgorithm(alg);
            }
        } catch (IOException ex) {
            Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
        }       
    }

    public void useWordDict(boolean useIt) { useWordDict = useIt; }
    public void usePOSDict(boolean useIt) { usePOSDict = useIt; }
    public WordAndPOSDictionaryLabellingStrategy(STTaggerWordDictionary wd, STTaggerPOSDictionary pd, int K, MaxentModel mo) {
        this(wd, pd, K, mo, new STFex());
    }

    /**
     * Set the maximum width of the beam in the forward-backward tagger.
     */
    public void setMaxSearchBeam(int maxSearchBeam) { if (seqScorer != null) seqScorer.setSearchBeam(maxSearchBeam); }
   
    /**
     * Reset the K parameter.
     */
    public void setK(int newK) {
        this.K = newK;
    }

     /**
     * Set the usual K parameter.
     */
    public void setUsualK(int newK) {
        this.usualK = newK;
    }

    /**
     * Set the final K parameter.
     */
    public void setFinalK(int newK) {
        this.finalK = newK;
    }

    /**
     * A method that returns all labels given by the model that both (1) are assigned probability `p' s.t.:
     * p>=(`beta'*<bestProb>), where `beta' is a factor passed in by the client of this method and
     * where <bestProb> is the probability of the most probably outcome of the model and (2) (if the word
     * (obtained from the <code>String[]</code> `context') has occured at least K times in training)
     * are in the <code>STTaggerWordDictionary</code> under the entry for said word.  If the word did
     * not occur at least K times in training, the output set is constrained by a <code>STTaggerPOSDictionary</code>.
     * In the off chance that a POS did not occur in the training data, the models predictions themselves are
     * submitted to the `beta constraint'.
     *
     * @param context A <code>String[]</code> of contextual predicates (in the maximum entropy modelling sense)
     * @param mo A <code>MaxentModel</code>.
     * @param beta A <code>double</code> specifying how close in probability all returned outcomes must be.
     * @return An <code>ArrayList<String></code> of labels that meet the above constraints.
     */
    public List<String> multitag(Word w, Collection<Pair<String, Double>> context, double beta) {
        List<Pair<Double, String>> temp = this.multitagWithScores(w, context, beta);
        ArrayList<String> res = new ArrayList<String>(temp.size());
        for (Pair<Double, String> t : temp) {
            res.add(t.b);
        }
        return res;
    }

    /**
     * A method to return the set of labels that are greater than or equal to
     * the best label multiplied by a factor `beta', given a model and a <code>String[]</code>
     * of contextual predicates.
     * @param thisWord a <code>opennlp.ccg.lexicon.Word</code> representing the current word being tagged.
     * @param context A <code>Collection<Pair<String,Double>></code> of contextual predicates
     * (in the maximum entropy modelling sense) with their corresponding activations (real-valued, hence the
     * <code>Double</code>).
     * @param model A model for generating the base predictions.
     * @param beta A positive <code>double</code> specifying how close to the best label
     *             each label returned must be.
     * @return An <code>ArrayList<Pair<Double,String>></code> of the outcomes
     *         {o: score(o)>=[beta * score(bestLabel)]}.
     */
    public List<Pair<Double, String>> multitagWithScores(Word thisWord, Collection<Pair<String, Double>> context, double beta) {
        // All the scores of the outcomes (the index of each double score
        // is the key which allows us to retrieve the outcome from the model).
        double[] ocs = mo.eval(context);
        // Sort in descending order of probability.
        ProbIndexPair[] sortedOutcomes = new ProbIndexPair[ocs.length];
        for (int i = 0; i < ocs.length; i++) {
            sortedOutcomes[i] = new ProbIndexPair(new Double(ocs[i]), new Integer(i));
        }
        Arrays.sort(sortedOutcomes);
        String tempOutcome = "";
        String word = thisWord.getForm();
        String pos = thisWord.getPOS();
        ArrayList<Pair<Double, String>> retVal = new ArrayList<Pair<Double, String>>(30);

        // Find the best outcomes seen with the word in training that
        // meet the `beta' constraint.
        // *******************************************************************************************
        double bestOutcomeProb, currentOutcomeProb;
        bestOutcomeProb = 0;
        // mww: changed to not always include front of list, as it may not meet dict constraints
        ProbIndexPair temp;
        // Now loop to see how many make the cut.
        // (But make sure to be sensitive to the dictionary, if necessary.)
        // See whether the word has a freq of this.K in the training corpus.
        Collection<String> wordPermittedOutcomes = (wd != null) ? this.wd.getEntry(word, this.K) : null;
        if (wordPermittedOutcomes != null && useWordDict) {
            // The word (lemma) was seen at least K times in training.
            // Get all beta-OK outcomes that are in the dictionary entry.
            for (int ocInd = 0; ocInd < sortedOutcomes.length; ocInd++) {
                temp = sortedOutcomes[ocInd];
                tempOutcome = mo.getOutcome(temp.b.intValue());
                currentOutcomeProb = temp.a.doubleValue();
                if (wordPermittedOutcomes.contains(tempOutcome)) {
                    if (bestOutcomeProb == 0) {
                        bestOutcomeProb = currentOutcomeProb;
                    }
                    if (currentOutcomeProb >= (bestOutcomeProb * beta)) { // Beta constraint.
                        // The cut-off was met, add the outcome.

                        retVal.add(new Pair<Double, String>(temp.a, tempOutcome));
                        // update max, for first selected outcome
                        if (currentOutcomeProb > bestOutcomeProb) {
                            bestOutcomeProb = currentOutcomeProb;
                        }
                    } else {
                        // Else, since our ProbIndexPair[] is sorted by probablity, there will be no more
                        // outcomes that make the (beta) cut.
                        break;
                    }
                } // If the word is not in the dictionary specified outcomes, move along.               

            }
        } else {
            // Revert to the POS dictionary.
            Collection<String> posPermittedOutcomes = null;
            if (pos != null) {
                posPermittedOutcomes = (pd != null) ? this.pd.getEntry(pos) : null;
            } else {
                if(verbose) { System.err.println("warning: null POS for: " + word);// mww: check for null pos

            }
            if (posPermittedOutcomes != null && usePOSDict) {
                // Get all beta-OK outcomes that are in the POS dictionary entry.
                for (int ocInd2 = 0; ocInd2 < sortedOutcomes.length; ocInd2++) {
                    temp = sortedOutcomes[ocInd2];
                    tempOutcome = mo.getOutcome(temp.b.intValue());
                    currentOutcomeProb = temp.a.doubleValue();

                    if (posPermittedOutcomes.contains(tempOutcome.trim())) {
                        if (bestOutcomeProb == 0) {
                            bestOutcomeProb = currentOutcomeProb;
                        }
                        if (currentOutcomeProb >= (bestOutcomeProb * beta)) { // Beta constraint.
                            // Made the cut-off, add the outcome.

                            retVal.add(new Pair<Double, String>(temp.a, tempOutcome));
                            // update max, for first selected outcome
                            if (currentOutcomeProb > bestOutcomeProb) {
                                bestOutcomeProb = currentOutcomeProb;
                            }
                        } else {
                            // Else, since our ProbIndexPair[] is sorted by probablity, there will be no more
                            // outcomes that make the (beta) cut.
                            break;
                        }
                    } // If the word is not in the dictionary specified outcomes, move along.                   

                }
            } else {
                // Otherwise, just get all model predictions that meet the beta constraint,
                // ignoring the word and POS dictionaries.
                for (int ocInd3 = 0; ocInd3 < sortedOutcomes.length; ocInd3++) {
                    temp = sortedOutcomes[ocInd3];
                    currentOutcomeProb = temp.a.doubleValue();

                    if (bestOutcomeProb == 0) {
                        bestOutcomeProb = currentOutcomeProb;
                    }
                    if (currentOutcomeProb >= (bestOutcomeProb * beta)) {
                        // Made the cut-off, add the outcome.
                        retVal.add(new Pair<Double, String>(temp.a, mo.getOutcome(temp.b.intValue())));
                        // update max, for first selected outcome
                        if (currentOutcomeProb > bestOutcomeProb) {
                            bestOutcomeProb = currentOutcomeProb;
                        }
                    } else {
                        // Else, since our ProbIndexPair[] is sorted by probability, there will be no more
                        // outcomes that make the cut.
                        break;
                    }
                }
            }
        }      

        // include the gold standard tag, if not in there.
        if(includeGold) {
            // assume input word has the gold tag in it.
            String gold = thisWord.getSupertag();
            // check whether gold is in the output.
            boolean containsGold = false;
            for(Pair<Double,String> tg : retVal) {
                if(tg.b.equals(gold)) {
                    containsGold = true;
                    break;
                }
            }
            if(!containsGold) {
                // insert it
                containsGold = false;
                for(ProbIndexPair oc : sortedOutcomes) {
                    if(mo.getOutcome(oc.b).equals(gold)) {
                        retVal.add(new Pair<Double,String>(oc.a, mo.getOutcome(oc.b)));
                        containsGold = true;
                        break;
                    }
                }         
            }
            if(!containsGold) {
                // if the gold-standard still isn't in there, it must not be part of the tag set, add it with epsilon probability.
                // we're assuming that gold tags are needed for a training routine that doesn't care about supertag probabilities
                // (as in Clark and Curran (2007)).
                // check to see whether we are in the log domain (by checking for negative scores -- kind of a hack).
                retVal.add(new Pair<Double,String>((sortedOutcomes[0].a < 0) ? -99 : 1.0112214926104486e-43, thisWord.getSupertag()));
            }
        }
        // *******************************************************************************************
        return retVal;
    }


    // get the current tagging (now only used to grab the POS tagging).   
    public List<TaggedWord> getCurrentTagging() { return tagging; }
    // set the current tagging (now only used to set the current POS tagging).
    public void setCurrentTagging(List<TaggedWord> tgging) { tagging = tgging; }
   
    public List<List<Pair<Double, String>>> multitag(List<Word> sentence, double beta) {
        List<List<Pair<Double, String>>> results = new ArrayList<List<Pair<Double, String>>>(sentence.size());
        Map<Integer, TaggedWord> sent = new TreeMap<Integer, TaggedWord>();
        int cnt = 0;
       
        List<TaggedWord> taggedSent = posTagger.tagSentence(sentence);
        setCurrentTagging(taggedSent);
       
        for (TaggedWord werd : taggedSent) {
            sent.put(new Integer(cnt++), werd);
        }

        List<Collection<Pair<String, Double>>> contexts = fexer.getSentenceFeatures(sent);

        // Iterate simultaneously through both the words and the contextual features.
        Iterator<Word> wds = sentence.iterator();
        Word w = null;
        Iterator<Collection<Pair<String, Double>>> ctxts = contexts.iterator();
        Collection<Pair<String, Double>> context = null;

        int cursor = 0;
        while (wds.hasNext() && ctxts.hasNext()) {
            // get the next word.
            w = wds.next();
            if(w.getPOS() == null) {
                w = Word.createFullWord(w, w.getForm(), tagging.get(cursor).getPOSTagging().get(0).b, w.getSupertag(), w.getSemClass());
            }
            context = ctxts.next();
            if (seqScorer != null) {
                // increase the tag ambiguity (for re-scoring using forward-backward).
                double newBeta = Math.min(beta * minMultiplier, beta / 8);
                if(beta < 0.00001) { newBeta = Math.min(beta * minMultiplier, beta / 2)}
                    results.add(multitagWithScores(w, context, newBeta));
            } else { results.add(multitagWithScores(w, context, beta)); }
            cursor++;
        }
       
        List<List<Pair<Double,String>>> finalResults = null;
        if (seqScorer != null) {
            // rescore and filter. pass in input sentence (in case, e.g., we have set the includeGold flag).
            finalResults = betaBestFilter(seqScorer.rescoreSequence(results), beta, sentence);
        } else {
            finalResults = results;
        }      
        return finalResults;
    }

    /**
     * Return a beta-best filtered subset of the tags in each multitagging list (each multitagging list is assumed to be non-empty).
     */
    private List<List<Pair<Double, String>>> betaBestFilter(List<List<Pair<Double, String>>> multitaggings, double beta, List<Word> inputSentence) {
        List<List<Pair<Double, String>>> res = new ArrayList<List<Pair<Double, String>>>(multitaggings.size());

        int wordIndex = 0;
        for (List<Pair<Double, String>> mtagging : multitaggings) {
            List<Pair<Double, String>> tempTagging = new ArrayList<Pair<Double, String>>(mtagging.size());           
            Word thisWord = inputSentence.get(wordIndex);
            // set to a (possibly different, possibly less restrictive?) beta if this POS has a beta multiplier set.
            Double bmult = betaMultipliers.get(thisWord.getPOS());
            double possiblyNewBeta = Math.min(1.0, (bmult != null) ? (bmult * beta) : beta);
           
            double best = mtagging.get(0).a;
            for (Pair<Double, String> tg : mtagging) {
                if (tg.a >= (possiblyNewBeta * best) || (includeGold && tg.b.equals(thisWord.getSupertag()))) {
                    tempTagging.add(tg);
                } else {
                    if(!includeGold) {  // if we're not still fishing for gold...
                        // ...stop, since they're in sorted order.
                        break;
                    }
                }
            }           
            res.add(tempTagging);
            wordIndex++;
        }
        return res;
    }
   
   
   
   
    //-------------------------------------------------------------------------
    // Supertagger interface methods (added by Michael White)
   
    /**
     * The sequence of beta values to use in tagging.
     */
    protected double[] betas = null;
   
    /**
     * The current betaIndex.
     */
    protected int betaIndex = 0;
   
    /**
     * The current tagging.
     */
    protected List<List<Pair<Double, String>>> currentTagging = null;
   
    /**
     * The current word.
     */
    protected int currentWord = 0;
   
    /**
     * Flag for whether to include gold tags.
     */
    protected boolean includeGold = false;

    /** Sets the beta values. */
    public void setBetas(double[] betas) {
        this.betas = betas;
    }

    /** Returns all the beta values. */
    public double[] getBetas() {
        return betas;
    }

    /** Returns the current beta value. */
    public double getCurrentBetaValue() {
        return betas[betaIndex];
    }

    /**
     * Advances beta to the next most restrictive setting.
     */
    public void nextBeta() {
        betaIndex++;
    }

    /**
     * Advances beta to the next less restrictive setting.
     */
    public void previousBeta() {
        betaIndex--;
    }

    /**
     * Returns whether there are any less restrictive beta settings
     * remaining in the sequence.
     */
    public boolean hasMoreBetas() {
        return betaIndex < betas.length - 1;
    }

    /**
     * Returns whether there are any more restrictive beta settings
     * remaining in the sequence.
     */
    public boolean hasLessBetas() {
        return betaIndex > 0;
    }

    /**
     * Resets beta to the most restrictive value.
     */
    public void resetBeta() {
        betaIndex = 0;
    }

    /**
     * Resets beta to the least restrictive value.
     */
    public void resetBetaToMax() {
        betaIndex = betas.length - 1;
    }

    /**
     * Sets the flag for whether to include gold tags.
     */
    public void setIncludeGold(boolean includeGold) { this.includeGold = includeGold; }

    /**
     * Maps the given words to their predicted categories,
     * so that the beta-best categories can be returned by calls to setWord
     * and getSupertags.
     */
    public void mapWords(List<Word> words) {
        if(hasMoreBetas()) {
            K = usualK;
        } else {
            K = finalK;           
        }       
        currentTagging = multitag(words, getCurrentBetaValue());
    }

    /**
     * Sets the current word to the one with the given index,
     * so that the beta-best categories for it can be returned by a call to
     * getSupertags.
     */
    public void setWord(int index) {
        currentWord = index;
    }

    /**
     * Returns the supertags of the desired categories for the current lexical lookup
     * as a map from supertags to contextual probabilities (or null to accept all).
     */
    public Map<String, Double> getSupertags() {
        Map<String, Double> retval = new HashMap<String, Double>();
        List<Pair<Double, String>> tags = currentTagging.get(currentWord);
        for (Pair<Double, String> tag : tags) {
            retval.put(tag.b, tag.a);
        }
        return retval;
    }
   
    /**
     * A factory method to make a supertagger from a config file (see the sample config file:
     * 
     * $OPENCCG_HOME/ccgbank/models/supertagger/st.config
     *
     * for more information).
     */
    @SuppressWarnings("unused")
    public static WordAndPOSDictionaryLabellingStrategy supertaggerFactory(String configFile) {       
        WordAndPOSDictionaryLabellingStrategy res = null;
        String[] pathKeys = { "priormodel", "priormodelvocab", "sequencemodel", "wdict", "posdict", "maxentmodel", "posconfig" };
        Map<String,String> opts = ConfigFileProcessor.readInConfig(configFile, pathKeys);
        boolean verbose = (opts.get("verbose").equals("true")) ? true : false;
        // 'S' is for string repr.
        String priorModS = opts.get("priormodel"),
               priorVocabS = opts.get("priormodelvocab"),
               seqModS = opts.get("sequencemodel"),
               wDictS = opts.get("wdict"),
               pDictS = opts.get("posdict"),
               firstKS = opts.get("firstk"),
               lastKS = opts.get("lastk"),
               maxentModS = opts.get("maxentmodel"),
               posConfigS = opts.get("posconfig"),
               betasS = opts.get("betas"),
               betaMults = opts.get("betamultipliers"), // POS-specific multipliers to "tighten" or "loosen" up the tagging beam width.
               includeGold = opts.get("includegold");
       
        assert (maxentModS != null) : "Empty maxent model.";
       
        // either use prior model (and have prior vocab specified) or not.
        assert (priorModS != null && priorVocabS != null) || (priorModS == null && priorVocabS == null) : "using prior model with no vocab file.";
       
        // ensure that there are word- and pos-keyed tagging dicts if there
        // is no st prior model.
        assert (wDictS == null || pDictS == null) && priorModS == null : "need tagging dicts if no supertagging prior model and prior vocab are specified.";
       
        // need the POS-keyed tagging dict, no matter what.
        assert (priorModS != null && pDictS == null) : "need POS-keyed tagging dict for prior model.";
       
        // need 'K' values if not using tagging dicts.
        assert (priorModS == null || (firstKS != null & lastKS != null)) : "need to specify first and last 'K' value when not using prior model.";
       
        // seqMod probably shouldn't be null. warn if in verbose mode.
        if(seqModS == null && verbose) { System.err.println("Warning: empty sequence model. Performance will suffer."); }
       
        STPriorModel priorM = null;
        if(priorModS != null && priorVocabS != null) {
            try { priorM = new STPriorModel(priorModS, priorVocabS); }
            catch (IOException ex) {
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
        STFex fex = new STFex(priorM);
        STTaggerWordDictionary wD = (wDictS != null) ? new XMLWordDictionaryReader(new File(wDictS)).read() : null;       
        STTaggerPOSDictionary pD = (pDictS != null) ? new XMLPOSDictionaryReader(new File(pDictS)).read() : null;
        int kay = (opts.get("firstk") == null) ? 20 : Integer.parseInt(opts.get("firstk")), firstK, lastK;
        firstK = (opts.get("firstk") == null) ? 20 : Integer.parseInt(opts.get("firstk"));
        lastK = (opts.get("lastk") == null) ? 100 : Integer.parseInt(opts.get("lastk"));
        double[] betaz = new double[betasS.split("\\s+").length];
        int cursor = 0;
        for(String beta : betasS.split("\\s+")) {
            betaz[cursor++] = Double.parseDouble(beta);
        }
       
        // should we use the tagging dictionaries (yes if there is no prior model).
        boolean useWordDictionary = (wDictS != null);
        boolean usePOSDictionary = (pDictS != null);
        POSTagger pTagger = (posConfigS == null) ? null : POSTagger.posTaggerFactory(posConfigS);
        TaggingAlgorithm alg = (opts.get("taggingalgorithm") == null || opts.get("taggingalgorithm").equals("forward-backward")) ?
            TaggingAlgorithm.FORWARDBACKWARD : TaggingAlgorithm.FORWARD;
        MaxentModel mem = new ZLMEM(new File(maxentModS));
        //STTaggerWordDictionary wd,STTaggerPOSDictionary pd,  int K, MaxentModel mo, FeatureExtractor fexer,
        //String tagSequenceModel, Constants.TaggingAlgorithm alg, POSTagger posTagger
        res = (pTagger != null) ?
            new WordAndPOSDictionaryLabellingStrategy(wD, pD, kay, mem, fex, seqModS, alg, pTagger) :
            new WordAndPOSDictionaryLabellingStrategy(wD, pD, kay, mem, fex, seqModS, alg);
        res.setK(kay);
        res.setUsualK(firstK);
        res.setFinalK(lastK);
        res.setBetas(betaz);
        res.useWordDict(useWordDictionary);
        res.usePOSDict(usePOSDictionary);
        res.setIncludeGold((opts.get("includegold") == null || opts.get("includegold").equals("false")) ? false : true);
       
        // get POS-specific beta multipliers (as a string of <POS,double> pairs -- all space delimited).
        if(betaMults != null) {
            String[] bmts = betaMults.split("\\s+");
            for(int a=0, b=1; b < bmts.length; a = a + 2, b = b + 2) {
                double mul = Double.parseDouble(bmts[b]);
                res.betaMultipliers.put(bmts[a], mul);
                if(mul < res.minMultiplier) {
                    res.minMultiplier = mul;
                }
            }
        }
        return res;
    }
   
    public static void main(String[] args) {
        String usage = "\nWordAndPOSDictLabellingStrategy (-h [gets this message]) -e <areWeTesting> [defaults to not testing] -c <configFile> > -beta [0.0,1.0]\n"+
                       "                                  (-i <input> [default=<stdin>]) (-o <output> [default=<stdout>])\n";
       
        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }

        SRILMFactoredBundleCorpusIterator in = null;
        BufferedWriter out = null;
        try {
           
            String inputCorp = "<stdin>", output = "<stdout>",
                   configFile = null;
           
            double beta = 1.0;
            boolean test = false;

            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("-i")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-e")) { test = true; continue; }
                if (args[i].equals("-c")) { configFile = args[++i]; continue; }
                if (args[i].equals("-beta")) { beta = Double.parseDouble(args[++i]); continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }

            ResultSink rs = new ResultSink(ResultSink.ResultSinkType.SUPERTAG);
            try {               
                in = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ?
                            new BufferedReader(new InputStreamReader(System.in)) :
                            new BufferedReader(new FileReader(new File(inputCorp))));               
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
            }

            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
            }

            WordAndPOSDictionaryLabellingStrategy stgger = WordAndPOSDictionaryLabellingStrategy.supertaggerFactory(configFile);
           
            // for each sentence, print out:
            // <s>
            // w1   <numPOSTags>    <posTag1>   ... <posTagK>   <numSupertags>  <supertag1> ... <supertagL>
            // ...
            // wN   <numPOSTags>    <posTag1>   ... <posTagM>   <numSupertags>  <supertag1> ... <supertagU>
            // </s>
            for (List<Word> inLine : in) {
               
                List<List<Pair<Double,String>>> taggedSent = stgger.multitag(inLine, beta);
                if(test) { rs.addSent(taggedSent, inLine); }
                // beginning of sentence...
                out.write("<s>" + System.getProperty("line.separator"));               
                List<TaggedWord> posTagging = stgger.getCurrentTagging();
                int cursor = -1;
                while(++cursor < taggedSent.size()) {
                    Word wdIn = inLine.get(cursor);
                    // word form...
                    out.write(wdIn.getForm());
                    TaggedWord posT = posTagging.get(cursor);
                    // print out number of POS tags, followed by tab-separated probabilized POS tagging.
                    out.write("\t" + posT.getPOSTagging().size());
                    for(Pair<Double,String> pt : posT.getPOSTagging()) {
                        out.write("\t" + pt.b + "\t" + pt.a);
                    }
                    // now print out number of and list of tab-separated, probabilized supertags.
                    out.write("\t" + taggedSent.get(cursor).size());
                    for(Pair<Double,String> stg : taggedSent.get(cursor)) {
                        out.write("\t" + stg.b + "\t" + stg.a);
                    }
                    out.write(System.getProperty("line.separator"));
                }
                out.write("</s>" + System.getProperty("line.separator"));
            }
            out.flush();

            if(test) { System.err.println(rs.report()); }
        } catch (IOException ex) {
            Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
        } finally {
            try {
                out.close();
                in.close();
            } catch (IOException ex) {
                Logger.getLogger(WordAndPOSDictionaryLabellingStrategy.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
} // End class WordPOSDictLabellingStrategy
// TOP
//
// Related Classes of opennlp.ccg.parse.supertagger.WordAndPOSDictionaryLabellingStrategy
//
// TOP
// Copyright (c) 2018 www.massapi.com. All rights reserved.
// All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.