package org.sf.mustru.train;
import com.aliasi.io.FileExtensionFilter;
import com.aliasi.corpus.Parser;
import com.aliasi.corpus.parsers.BrownPosParser;
import com.aliasi.hmm.HmmCharLmEstimator;
import com.aliasi.util.Streams;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;
import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.sf.mustru.utils.Constants;
/**
* Builds an HMM part-of-speech tagger model from a set of training files
*
*/
public class TrainPOSTagger
{
    //*-- installation root, read from the MUSTRU_HOME system property (null if the property is not set)
    private static final String MUSTRU_HOME = System.getProperty("MUSTRU_HOME");

    //*-- directory that holds the training files; the compiled model is written here as well
    private static final File TRAINING_DIR = new File(
        MUSTRU_HOME + File.separator + "data" + File.separator + "training" + File.separator + "pos");

    //*-- language model parameters for HMM emissions
    //*-- (left non-final and package-visible so that existing code in this package can still adjust them)
    static int N_GRAM = 8;
    static int NUM_CHARS = 256;
    static double LAMBDA_FACTOR = 8.0;

    /**
     * Train an HMM Part of Speech Tagger using a sample Brown Corpus.
     *
     * Every file ending in "txt" in the training directory
     * (MUSTRU_HOME/data/training/pos) is parsed with a BrownPosParser that
     * feeds an HmmCharLmEstimator. The compiled model is then written to the
     * file "pos_tagger" in the same directory.
     *
     * @param args command line arguments (not used)
     * @throws IOException if the training directory is missing or cannot be
     *         listed, if a training file cannot be parsed, or if the model
     *         file cannot be written
     */
    public static void main(String[] args) throws IOException
    {
        PropertyConfigurator.configure (Constants.LOG4J_FILE);
        Logger logger = Logger.getLogger(TrainPOSTagger.class.getName());
        logger.debug("Started POS tagged model generation");

        //*-- set up parser with estimator as handler
        HmmCharLmEstimator estimator = new HmmCharLmEstimator(N_GRAM, NUM_CHARS, LAMBDA_FACTOR);
        Parser parser = new BrownPosParser();
        parser.setHandler(estimator);

        //*-- stop here when the training directory is missing: listFiles() would
        //*-- return null and the loop below would fail with a NullPointerException
        if (!TRAINING_DIR.isDirectory())
        {
            String message = "Could not find training directory=" + TRAINING_DIR;
            logger.fatal(message);
            throw new IOException(message);
        }

        //*-- train on files in data directory ending in "txt"
        File[] files = TRAINING_DIR.listFiles(new FileExtensionFilter("txt"));
        if (files == null)
        {
            //*-- listFiles() also returns null when an I/O error occurs
            String message = "Could not list training files in directory=" + TRAINING_DIR;
            logger.fatal(message);
            throw new IOException(message);
        }
        for (int i = 0; i < files.length; ++i)
        {
            logger.debug("Training on file: " + files[i]);
            parser.parse(files[i]);
        }

        //*-- write output to file; the streams are closed in the finally block so
        //*-- that no file handle is leaked when compiling or writing fails
        File modelFile = new File(TRAINING_DIR, "pos_tagger");
        FileOutputStream fileOut = new FileOutputStream(modelFile);
        ObjectOutputStream objOut = null;
        try
        {
            objOut = new ObjectOutputStream(fileOut);
            estimator.compileTo(objOut);
            //*-- flush explicitly so that a write error is reported as an IOException
            //*-- (Streams.closeOutputStream is expected to close quietly -- verify against the LingPipe docs)
            objOut.flush();
        }
        finally
        {
            Streams.closeOutputStream(objOut);
            //*-- covers the case where the ObjectOutputStream could not be created
            Streams.closeOutputStream(fileOut);
        }

        logger.debug("Finished POS tagger model generation");
    }
}