Package opennlp.ccg.parse.postagger

Source Code of opennlp.ccg.parse.postagger.BasicPOSTagger

///////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2010 Dennis N. Mehay
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
//
// This library is distributed inp the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
//
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////

package opennlp.ccg.parse.postagger;
import opennlp.ccg.parse.postagger.ml.POSPriorModel;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.ccg.lexicon.Word;
import opennlp.ccg.parse.tagger.TaggedWord;
import opennlp.ccg.parse.tagger.ml.MaxentModel;
import opennlp.ccg.parse.supertagger.ml.FeatureExtractor;
import opennlp.ccg.util.Pair;
import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator;
import opennlp.ccg.parse.tagger.util.ResultSink;
import opennlp.ccg.parse.tagger.sequencescoring.SequenceScorer;
import opennlp.ccg.parse.tagger.Constants;

/**
* A non-dummy POS tagger.
*
* @author Dennis N. Mehay
*/
public class BasicPOSTagger extends POSTagger {   
   
    private FeatureExtractor posFex = null;
    private MaxentModel tagMod = null;   
   
    private static final Comparator<Pair<Double,Integer>> comp = new Comparator<Pair<Double,Integer>>() {
        public int compare(Pair<Double, Integer> pr0, Pair<Double, Integer> pr1) {
            // sorts descending by prob (the double member of the pair).
            if(pr0.a == pr1.a) { return 0; } else if (pr0.a < pr1.a) { return 1; } else { return -1;
        }
    };
   
    public BasicPOSTagger(MaxentModel tagMod, FeatureExtractor posFex, String tagSequenceModel)  {
        this.posFex = posFex;
        this.tagMod = tagMod;
        int ord = SequenceScorer.findOrder(tagSequenceModel);
        try {
            posSeqMod = new SequenceScorer(ord, tagSequenceModel);
            // set the search algorithm.
            posSeqMod.setAlgorithm(Constants.TaggingAlgorithm.FORWARDBACKWARD);
            // set the search beam width
            posSeqMod.setSearchBeam(5);
           
        } catch (IOException ex) {
            Logger.getLogger(BasicPOSTagger.class.getName()).log(Level.SEVERE, null, ex);
        }
    }
       
    public List<TaggedWord> tagSentence(List<Word> sentence) {
        List<TaggedWord> result = new ArrayList<TaggedWord>(sentence.size());
       
        // the prob-string taggings (to be filtered, etc. before adding them to the taggings of the TaggedWord list).       
        List<List<Pair<Double,String>>> taggings = new ArrayList<List<Pair<Double,String>>>(sentence.size());
       
        Map<Integer, TaggedWord> sentMap = new HashMap<Integer, TaggedWord>(sentence.size());
        int ind = 0;
        for(Word w : sentence) {       
            sentMap.put(ind++, new TaggedWord(w));
        }
        List<Collection<Pair<String,Double>>> ftss = posFex.getSentenceFeatures(sentMap);
       
        double[] distro = null;       
       
        int wordIndex = 0;
        for(Collection<Pair<String,Double>> fts : ftss) {
           
            distro = tagMod.eval(fts);
            List<Pair<Double,Integer>> distroList = new ArrayList<Pair<Double,Integer>>(distro.length);
            ind = 0; for(double prob : distro) { distroList.add(new Pair<Double,Integer>(prob, ind++)); }           
            Collections.sort(distroList, comp);                       
            // widen beta a little bit (we're going to do some fwd-bwd rescoring inp a minute, but we don't
            // want to do the fwd-bwd alg over ALL possible tags -- too inefficient).
            List<Pair<Double,String>> tagging = new ArrayList<Pair<Double,String>>(distro.length);
            double best = distroList.get(0).a;
            double widenedBeta = beta/8;           
           
            String goldPOS = sentence.get(wordIndex).getPOS();
           
            for(Pair<Double,Integer> outcome : distroList) {
                if( (outcome.a >= (widenedBeta * best)) || (includeGold && tagMod.getOutcome(outcome.b).equals(goldPOS)) ) {
                   tagging.add(new Pair<Double,String>(outcome.a, tagMod.getOutcome(outcome.b)));
                } else {
                    if(!includeGold) {  // if not still potentially fishing for a gold POS tag, then break (they're in sorted order).
                        break;
                    }
                }
            }
            taggings.add(tagging);
            wordIndex++;
        }
        // rescore using forward-backward.
        taggings = posSeqMod.rescoreSequence(taggings);       
        // add these rescored taggings to the list of TaggedWord's.
        int wInd = 0;
        for(List<Pair<Double,String>> tagging : taggings) {
            TaggedWord tmpWd = new TaggedWord(sentence.get(wInd++));
            tmpWd.setPOSTagging(tagging);           
            result.add(tmpWd);
        }
        // now filter down to the beta-best.
        return betaBestFilter(result);
    }
   
   
    public static void main(String[] args) throws IOException {
        String usage = "\nBasicPOSTagger -c <configFile> (-i <input> [defaults to <stdin>]) (-o <output> [defaults to <stdout>])\n"+
                       "                 (-e [test tagger; assumes input is gold-standard corpus])\n";
        if (args.length > 0 && args[0].equals("-h")) {
            System.out.println(usage);
            System.exit(0);
        }

        SRILMFactoredBundleCorpusIterator inp = null;
        BufferedWriter out = null;
       
        try {
            String inputCorp = "<stdin>", output = "<stdout>",
                   configFile = null;
           
            boolean test = false;

            for (int i = 0; i < args.length; i++) {
                if (args[i].equals("-i")) { inputCorp = args[++i]; continue; }
                if (args[i].equals("-o")) { output = args[++i];    continue; }
                if (args[i].equals("-e")) { test = true; continue; }
                if (args[i].equals("-c")) { configFile = args[++i]; continue; }
                System.out.println("Unrecognized option: " + args[i]);
            }

            ResultSink rs = new ResultSink(ResultSink.ResultSinkType.POSTAG);
           
            try {                       
                inp = new SRILMFactoredBundleCorpusIterator(
                        (inputCorp.equals("<stdin>")) ?
                            new BufferedReader(new InputStreamReader(System.in)) :
                            new BufferedReader(new FileReader(new File(inputCorp))));               
            } catch (FileNotFoundException ex) {
                System.err.print("Input corpus " + inputCorp + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
           

            try {
                out = (output.equals("<stdout>")) ? new BufferedWriter(new OutputStreamWriter(System.out)) : new BufferedWriter(new FileWriter(new File(output)));
            } catch (IOException ex) {
                System.err.print("Output file " + output + " not found.  Exiting...");
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
                System.exit(-1);
            }

            POSTagger post = POSTagger.posTaggerFactory(configFile);
           
            for (List<Word> inLine : inp) {
                List<TaggedWord> taggedSent = post.tagSentence(inLine);
                List<List<Pair<Double,String>>> sentTagging = new ArrayList<List<Pair<Double,String>>>(taggedSent.size());
                for(TaggedWord tw : taggedSent) { sentTagging.add(tw.getPOSTagging()); }
                if(test) { rs.addSent(sentTagging, inLine); }
                out.write("<s>" + System.getProperty("line.separator"));
                for(TaggedWord tw : taggedSent) {
                    out.write(tw.getForm());
                    for(Pair<Double,String> tg : tw.getPOSTagging()) {
                        out.write("\t" + tg.b + "\t" + tg.a);
                    }
                    out.write(System.getProperty("line.separator"));
                }
                out.write("</s>" + System.getProperty("line.separator"));
            }
            out.flush();

            if(test) { System.err.println(rs.report()); }
        } catch(Throwable t) {
            t.printStackTrace();
        } finally {
            try {               
                inp.close();
                out.close();
            } catch (IOException ex) {
                Logger.getLogger(POSPriorModel.class.getName()).log(Level.SEVERE, null, ex);
            }
        }
    }
}
TOP

Related Classes of opennlp.ccg.parse.postagger.BasicPOSTagger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.