Package org.sf.mustru.train

Source Code of org.sf.mustru.train.TrainFtypeClassifier

package org.sf.mustru.train;

import java.io.File;
import java.io.FileOutputStream;
import java.io.ObjectOutputStream;

import com.aliasi.util.Files;
import com.aliasi.classify.DynamicLMClassifier;

import org.apache.log4j.Logger;
import org.apache.log4j.PropertyConfigurator;
import org.sf.mustru.utils.Constants;

import java.io.IOException;

/**
* Build a language model to determine if a document is an article, book, or letter.
*/
public class TrainFtypeClassifier
//*-- define the training and testing directories and list of categories
private static File TRAINING_DIR = new File(Constants.TRAININGDIR + File.separator + "ftypes");
private static String[] CATEGORIES = {"article", "book", "letter", "text"};
private static int NGRAM_SIZE = 6;
private static boolean BOUNDED = false;

public static void main(String[] args) throws ClassNotFoundException, IOException
{
  PropertyConfigurator.configure (Constants.LOG4J_FILE);
  Logger logger = Logger.getLogger(TrainFtypeClassifier.class.getName());
  logger.debug("Started TrainFtypeClassifier");
  DynamicLMClassifier classifier = new DynamicLMClassifier(CATEGORIES, NGRAM_SIZE, BOUNDED);

  //*-- Start of training
  //*-- loop through the list of categories and verify that a directory exists for each one.
  for (int i = 0; i < CATEGORIES.length; ++i)
  {
   File classDir = new File(TRAINING_DIR, CATEGORIES[i]);
   if (!classDir.isDirectory())
   { logger.fatal("Could not find training directory=" + classDir); }

   //*-- get the list of training files for the category and train the classifier on each of the files
   String[] trainingFiles = classDir.list();
   for (int j = 0; j < trainingFiles.length; ++j)
   {
    String text = Files.readFromFile(new File(classDir,trainingFiles[j]));
    logger.debug("Training on " + CATEGORIES[i] + File.separator + trainingFiles[j]);
    classifier.train(CATEGORIES[i], text);
   } //*-- end of inner for
  } //*-- end of outer for
  //*-- end of training

  //*-- dump the classification model to a file
  logger.info("Start compiling ftype classifier");
  String modelFile = Constants.FTYPE_CLASS_MODEL;
  ObjectOutputStream os = new ObjectOutputStream( new FileOutputStream(modelFile) );
  classifier.compileTo(os);
  os.close();
  logger.info("End compiling ftype classifier");

  logger.debug("Ended TrainFtypeClassifier");
}
}
TOP

Related Classes of org.sf.mustru.train.TrainFtypeClassifier

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code are property of their respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.