Source Code of opennlp.ccg.parse.postagger.ml.POSTagFex

///////////////////////////////////////////////////////////////////////////////
// Copyright (c) 2010 Dennis N. Mehay
//
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed inp the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////


package opennlp.ccg.parse.postagger.ml;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import opennlp.ccg.parse.tagger.Constants;
import opennlp.ccg.parse.supertagger.ml.FeatureExtractor;
import opennlp.ccg.parse.tagger.TaggedWord;
import opennlp.ccg.util.Pair;
import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator;
import opennlp.ccg.lexicon.Word;


/**
 * Feature extractor for POS taggers.
 * 
 * The inputs are "TaggedWord"s simply for consistency of interface.
 * There should be no tags assigned to the words (short, perhaps, TOBI 
 * tags or the like).
 * 
 * @author Dennis N. Mehay
 */
public class POSTagFex implements FeatureExtractor {
    private POSPriorModel posPrior = null;
    
    /** Constructor with a prior model (replaces tagging dictionary). */
    public POSTagFex(POSPriorModel posPrior) {
        this.posPrior = posPrior;
    }
    
    /** Constructor without prior model. Prior features will not be used. */    
    public POSTagFex() { this(null); }
    
    public static final String curL = "X";
    public static final String prevL = "X-1";
    public static final String prevPrevL = "X-2";
    public static final String nextL = "X+1";
    public static final String nextNextL = "X+2";
    private static final String[] lxfLabs = {prevPrevL, prevL, curL, nextL, nextNextL};
    private static final String prefix = "prefix", suffix = "suffix";
    private static final String hyphen = "containsHyphen";
    private static final String caps = "containsUC";
    private static final String num = "containsNum";
    private static final String neConn = "containsNEConnector";
    private static final String priorF = "PPOS";
    /** The string that connects elements of a fused named entity. */
    private String neConnecter = "_";
    /** Get a word's features for applying the tagger (i.e., not training mode). */
    public Collection<Pair<String, Double>> getFeatures(Map<Integer, TaggedWord> sentence, Integer wordIndex) {
        return getFeatures(sentence, wordIndex, false);
    }


    /** Get a sentence of words' features for applying the tagger (i.e., not training mode). */
    public List<Collection<Pair<String, Double>>> getSentenceFeatures(Map<Integer, TaggedWord> sentence) {
        return getSentenceFeatures(sentence, false);
    }


    /** 
     * Get the features for a word in context.  training == true iff the output class is to be collected as well. 
     * 
     * TODO: This and supertagger feature extractor (fex) should be merged into a more general, parameterizable 
     * sentence-level contextual feature extractor. (VERY todo-ish, though.)
     */
    public Collection<Pair<String, Double>> getFeatures(Map<Integer, TaggedWord> sentence, Integer wordIndex, boolean training) {
        Collection<Pair<String, Double>> result = new ArrayList<Pair<String, Double>>(30);


        TaggedWord current, prev, prevPrev, next, nextNext;
        current = sentence.get(wordIndex);
        // -------- The left periphery ------------
        int wind = wordIndex.intValue();
        if (wind > 1) {
            prev = sentence.get(wind - 1);
            prevPrev = sentence.get(wind - 2);
        } else if (wind > 0) {
            prev = sentence.get(wind - 1);
            prevPrev = Constants.OOB;
        } else {
            prev = prevPrev = Constants.OOB;
        }


        // -------- The right periphery -----------
        int tempSize = sentence.size();
        if ((tempSize - (wind + 1)) >= 2) {
            next = sentence.get(wind + 1);
            nextNext = sentence.get(wind + 2);
        } else if (tempSize - (wind + 1) >= 1) {
            next = sentence.get(wind + 1);
            nextNext = Constants.OOB;
        } else {
            next = nextNext = Constants.OOB;
        }


        Double activation = Constants.one;
        
        if (training) {
            result.add(new Pair<String, Double>(current.getPOS(), activation));
        }


        // we do not use tag-sequence features in this model.
        // these are in a separate sequence model (n-gram model over POS sequences).
        
        // standard contextual features (word to the left, current word, word to the right, etc.).
        // these features are from Ratnaparkhi (1996).
        result.add(new Pair<String, Double>(curL + "=" + current.getForm(), activation));
        result.add(new Pair<String, Double>(prevL + "=" + prev.getForm(), activation));
        result.add(new Pair<String, Double>(prevPrevL + "=" + prevPrev.getForm(), activation));
        result.add(new Pair<String, Double>(nextL + "=" + next.getForm(), activation));
        result.add(new Pair<String, Double>(nextNextL + "=" + nextNext.getForm(), activation));
        
        // features that replace the tagging dictionary. 
        // add real-valued (activation = prior log-prob) features for each of the beta-best prior
        // tags, given this word.
        if(posPrior != null) {
            List<Pair<Double,String>> priors = posPrior.getPriors(current.getWord());
            double beta = 0.1;
            double best = priors.get(0).a;
            String wform = current.getForm();
            for(Pair<Double,String> prior : priors) {
                if(prior.a > (beta * best)) {
                    // add the features PPOS=<POSTAG>:<log-prob> and PPOS_word=<POSTAG>_<wordForm>:<log-prob>.
                    result.add(new Pair<String,Double>(priorF + "=" + prior.b, prior.a));
                    result.add(new Pair<String,Double>(priorF + "_word" + "=" + prior.b + "_" + wform, prior.a));
                } else {
                    break;
                }
            }
        }
        
        // these are in addition to Ratnaparkhi's (1996) contextual features.
        // now for conjunctions of features: w-2w-1=..., w-1w+1=..., w+1w+2=... (same for posp).
        // (i.e., bigram features over words and parts of speech and bigrams of words and POSs that straddle the current token).
        // N.B. only use single-best POSs (maybe change later).
        TaggedWord[] wds = {prevPrev, prev, current, next, nextNext};


        for (int j = 1; j < wds.length; j++) {
            result.add(new Pair<String, Double>(lxfLabs[j - 1] + "|" + lxfLabs[j] + "=" + wds[j - 1].getForm() + "|" + wds[j].getForm(), activation));
            // also, if at the current word slot, add bigrams that straddle the current word.
            if (j == 2) {
                result.add(new Pair<String, Double>(lxfLabs[j - 1] + "|" + lxfLabs[j + 1] + "=" + wds[j - 1].getForm() + "|" + wds[j + 1].getForm(), activation));
            }
        }
        
        // affix features from Ratnaparkhi (1996).
        // if the word's length is > 4, then extract the 1-, 2-, 3- and 4-character affixes.        
        if(current.getForm().length() > 4) {
            StringBuffer prefixes = new StringBuffer(4), suffixes = new StringBuffer(4);
            char[] wdForm = current.getForm().toCharArray();
            // prefixes.
            int cursor = 0;
            for(cursor = 0; cursor < 4; cursor++) {
                prefixes.append(wdForm[cursor]);
                result.add(new Pair<String,Double>(prefix+"="+prefixes.toString(), Constants.one));
            }
            // suffixes.
            for(cursor = wdForm.length-1; cursor >= wdForm.length-5; cursor--) {
                suffixes.insert(0, wdForm[cursor]);
                result.add(new Pair<String,Double>(suffix+"="+suffixes.toString(), Constants.one));
            }
        }
        
        // now do "contains hyphen", "contains number", "contains uppercase letter" and contains fused NE connecter (_) features.
        // also from Ratnaparkhi (1996).        
        if(current.getForm().contains("-")) { result.add(new Pair<String,Double>(hyphen, Constants.one)); }
        if(current.getForm().matches(".*[0-9]+.*")) { result.add(new Pair<String,Double>(num, Constants.one)); }
        if(!current.getForm().toLowerCase().equals(current.getForm())) { result.add(new Pair<String,Double>(caps, Constants.one)); }
  // if we see a NE connector, this is likely a NNP (in English, e.g.).
        if(current.getForm().contains(neConnecter)) { result.add(new Pair<String,Double>(neConn, Constants.one)); }
        return result;
    }


    /** 
     * Get the features for a sentence of words in context.  
     * training == true iff the output classes are to be collected as well. 
     */
    public List<Collection<Pair<String, Double>>> getSentenceFeatures(Map<Integer, TaggedWord> sentence, boolean training) {
        List<Collection<Pair<String, Double>>> result = new ArrayList<Collection<Pair<String, Double>>>(30);
        List<Integer> keys = new ArrayList<Integer>(sentence.keySet().size());
        for(Integer wordIndex : sentence.keySet()) { keys.add(wordIndex); }
        Collections.sort(keys);
        for(Integer wordIndex : keys) {
            result.add(getFeatures(sentence, wordIndex, training));
        }
        return result;
    }
    
    public static void main(String[] args) throws IOException {        
        String usage = 
                "POSTagFex (-h [gets this message]) (-i <input> [defaults to <stdin>]) (-o <output> [defaults to <stdout>])\n"+
                "          (-p <posPriorModel> [.flm] -v <priorModVocab>)\n";
        if(args.length > 0 && args[0].equals("-h")) { System.out.println(usage); System.exit(0); }
        
        String input = "<stdin>", output = "<stdout>", priorModF = null, priorVocab = null;
        for(int j = 0; j < args.length; j++) {
            if(args[j].equals("-i")) { input = args[++j]; continue; }
            if(args[j].equals("-o")) { output = args[++j]; continue; }
            if(args[j].equals("-p")) { priorModF = args[++j]; continue; }
            if(args[j].equals("-v")) { priorVocab = args[++j]; continue; }
            System.err.println("Unrecognized option: " + args[j]); 
        }
        SRILMFactoredBundleCorpusIterator corp = new SRILMFactoredBundleCorpusIterator(
                input.equals("<stdin>") ?
                    new BufferedReader(new InputStreamReader(System.in)) :
                    new BufferedReader(new FileReader(new File(input))));
        BufferedWriter out = new BufferedWriter(
                output.equals("<stdout>") ?
                    new BufferedWriter(new OutputStreamWriter(System.out)) :
                    new BufferedWriter(new FileWriter(new File(output))));
        
        POSPriorModel posPriorMod = null;
        if(priorModF != null) {
            posPriorMod = new POSPriorModel(priorModF, priorVocab);
        }
        POSTagFex fexer = new POSTagFex(posPriorMod);        
        for(List<Word> sentence : corp) {
            Map<Integer, TaggedWord> sent = new HashMap<Integer, TaggedWord>(sentence.size());
            int index = 0;
            for(Word w : sentence) { sent.put(index++, new TaggedWord(w)); }
            
            List<Collection<Pair<String,Double>>> ftss = fexer.getSentenceFeatures(sent, true);
            
            for(Collection<Pair<String,Double>> fts : ftss) {
                index = 0;
                for(Pair<String,Double> ft : fts) {
                    // if we're at the first item, print out the label.
                    if (index == 0) {
                        out.write(ft.a);
                    } else {
                        out.write(" " + ft.a + ":" + ft.b);
                    }
                    index++;   
                }
                out.write(System.getProperty("line.separator"));
            }
        }
  out.flush();
    }
}
Source Code of opennlp.ccg.parse.postagger.ml.POSTagFex

Related Classes of opennlp.ccg.parse.postagger.ml.POSTagFex