Source Code of opennlp.ccg.parse.supertagger.ml.ZhangLeTrainingExtractor

///////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2009 Dennis N. Mehay
// 
// This library is free software; you can redistribute it and/or
// modify it under the terms of the GNU Lesser General Public
// License as published by the Free Software Foundation; either
// version 2.1 of the License, or (at your option) any later version.
// 
// This library is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU Lesser General Public License for more details.
// 
// You should have received a copy of the GNU Lesser General Public
// License along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
//////////////////////////////////////////////////////////////////////////////
package opennlp.ccg.parse.supertagger.ml;


import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;
import java.util.logging.Level;
import java.util.logging.Logger;
import opennlp.ccg.lexicon.Word;
import opennlp.ccg.parse.tagger.io.PipeDelimitedFactoredBundleCorpusIterator;
import opennlp.ccg.parse.tagger.io.SRILMFactoredBundleCorpusIterator;
import opennlp.ccg.parse.supertagger.ml.FeatureExtractor;
import opennlp.ccg.parse.supertagger.ml.STFex;
import opennlp.ccg.parse.tagger.TaggedWord;
import opennlp.ccg.util.Pair;


/**
 * @author Dennis N. Mehay
 * @version $Revision: 1.3 $, $Date: 2010/09/21 04:12:41 $
 */
public class ZhangLeTrainingExtractor {


    private File outputF;
    private Iterator<List<Word>> incorp;
    private FeatureExtractor fexer = new STFex();


    /**
     * Create a training feature extractor that will extract features (with results)
     * for every instance in the input (training) corpus corpusName.
     * 
     * @param corpusName A <code>String</code> giving the complete
     * path to the input file of SRILM-compliant factored bundles.
     * @param outputFileName A <code>String</code> giving the complete
     * path to the output file where the features will be written.
     */
    public ZhangLeTrainingExtractor(File corpus, File outputF, String tokenisation) {
        this(corpus, outputF, tokenisation, new STFex());
    }
    
    public ZhangLeTrainingExtractor(File corpus, File outputF, String tokenisation, FeatureExtractor fexer) {
        this.fexer = fexer;
        this.outputF = outputF;
        try {
            if (tokenisation.equalsIgnoreCase("srilm")) {
                incorp = new SRILMFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus)));
            } else {
                incorp = new PipeDelimitedFactoredBundleCorpusIterator(new BufferedReader(new FileReader(corpus)));
            }
        } catch (FileNotFoundException ex) {
            Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex);
        }
    }


    /**
     * Writes training feats to file.
     */
    public void writeFeats() {
        BufferedWriter bw = null;
        try {
            try {
                bw = new BufferedWriter(new FileWriter(this.outputF));
            } catch (IOException ex) {
                Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex);
            }
            if (bw != null || this.incorp != null) {
                List<Word> sent = null;
                Map<Integer, TaggedWord> snt = null;


                Iterator<List<Word>> sents = this.incorp;
                while (sents.hasNext()) {
                    //for (Iterator<List<Word>> sents = this.incorp; sents.hasNext();) {
                    sent = sents.next();


                    // turn the sent into a map from integer string indices to Words.
                    int index = 0;
                    snt = new TreeMap<Integer, TaggedWord>();
                    for (Word w : sent) {
                        snt.put(index++, new TaggedWord(w));
                    }


                    // 'true' says "we're getting training feats"
                    for (Collection<Pair<String, Double>> sentFeatsWithActivation : fexer.getSentenceFeatures(snt, true)) {
                        try {
                            boolean isLabel = true;
                            for (Pair<String, Double> ftWAct : sentFeatsWithActivation) {
                                if (isLabel) {
                                    bw.write(ftWAct.a + " ");
                                    isLabel = false;
                                } else {
                                    bw.write(ftWAct.a + ":" + ftWAct.b.doubleValue() + " ");
                                }
                            }
                            bw.newLine();
                        } catch (IOException ex) {
                            Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex);
                        }


                    }
                }
            }
        } finally {
            try {
                bw.flush();
                bw.close();
            } catch (IOException ex) {
                Logger.getLogger(ZhangLeTrainingExtractor.class.getName()).log(Level.SEVERE, null, ex);
            } catch (Exception e) {
                System.out.println(e);
            }
        }
    }
}
Source Code of opennlp.ccg.parse.supertagger.ml.ZhangLeTrainingExtractor

Related Classes of opennlp.ccg.parse.supertagger.ml.ZhangLeTrainingExtractor