Package org.sf.mustru.train

Source Code of org.sf.mustru.train.TrainPOSTagger

package org.sf.mustru.train;

import com.aliasi.io.FileExtensionFilter;
import com.aliasi.corpus.Parser;
import com.aliasi.corpus.parsers.BrownPosParser;

import com.aliasi.hmm.HmmCharLmEstimator;
import com.aliasi.util.Streams;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.ObjectOutputStream;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.sf.mustru.utils.Constants;

/**
* Build a HMM Tagger model using a set of training files
*
*/
public class TrainPOSTagger
{
private static String MUSTRU_HOME =  System.getProperty("MUSTRU_HOME");
private static File TRAINING_DIR = new File(
   MUSTRU_HOME + File.separator + "data" + File.separator + "training" + File.separator + "pos");

//*-- language model parameters for HMM emissions
static int N_GRAM = 8
static int NUM_CHARS = 256;
static double LAMBDA_FACTOR = 8.0

/**
  * Train a HMM Part of Speech Tagger using a sample Brown Corpus
  * @throws IOException
  */
public static void main(String[] args) throws IOException
{
  PropertyConfigurator.configure (Constants.LOG4J_FILE);
  Logger logger = Logger.getLogger(TrainPOSTagger.class.getName());
  logger.debug("Started POS tagged model generation");

  //*-- set up parser with estimator as handler
  HmmCharLmEstimator estimator = new HmmCharLmEstimator(N_GRAM, NUM_CHARS, LAMBDA_FACTOR);
  Parser parser = new BrownPosParser();
  parser.setHandler(estimator);

  //*-- train on files in data directory ending in "txt"
  if (!TRAINING_DIR.isDirectory())
  { logger.fatal("Could not find training directory=" + TRAINING_DIR); }
  File[] files = TRAINING_DIR.listFiles(new FileExtensionFilter("txt"));
  for (int i = 0; i < files.length; ++i)
  { logger.debug("Training on file: " + files[i]); parser.parse(files[i]); }

  //*-- write output to file
  File modelFile = new File(MUSTRU_HOME + File.separator + "data" + File.separator + "training" + File.separator + "pos" + File.separator + "pos_tagger");
  ObjectOutputStream objOut = new ObjectOutputStream(new FileOutputStream(modelFile));
  estimator.compileTo(objOut);
  Streams.closeOutputStream(objOut);
  logger.debug("Finished POS tagger model generation");
}
}
TOP

Related Classes of org.sf.mustru.train.TrainPOSTagger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by ORACLE Inc. Contact coftware#gmail.com.