Package org.sf.mustru.utils

Source Code of org.sf.mustru.utils.TrainSpellChecker$Externalizer

/*
* LingPipe v. 2.0
* Copyright (C) 2003-5 Alias-i
*
* This program is licensed under the Alias-i Royalty Free License
* Version 1 WITHOUT ANY WARRANTY, without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the Alias-i
* Royalty Free License Version 1 for more details.
*
* You should have received a copy of the Alias-i Royalty Free License
* Version 1 along with this program; if not, visit
* http://www.alias-i.com/lingpipe/licenseV1.txt or contact
* Alias-i, Inc. at 181 North 11th Street, Suite 401, Brooklyn, NY 11211,
* +1 (718) 290-9170.
*/

package org.sf.mustru.utils;

import com.aliasi.lm.CompiledNGramProcessLM;
import com.aliasi.lm.NGramProcessLM;

import com.aliasi.spell.CompiledSpellChecker;
import com.aliasi.spell.WeightedEditDistance;
import com.aliasi.tokenizer.Tokenizer;
import com.aliasi.tokenizer.TokenizerFactory;

import com.aliasi.util.AbstractExternalizable;
import com.aliasi.util.Compilable;
import com.aliasi.util.ObjectToCounter;
import com.aliasi.util.ObjectToCounterMap;
import com.aliasi.util.Strings;

import java.io.ObjectInput;
import java.io.ObjectOutput;
import java.io.IOException;
import java.io.ObjectOutputStream;
import java.io.Serializable;

import java.util.HashSet;
import java.util.Set;

/**
* A <code>TrainSpellChecker</code> instance provides a mechanism for
* collecting training data for a compiled spell checker.  Training
* instances are nothing more than character sequences which represent
* likely user queries. 
*
* <P>After training, a model is written out through the
* <code>Compilable</code> interface using {@link
* #compileTo(ObjectOutput)}.  When this model is read back in, it
* will be an instance of {@link CompiledSpellChecker}.  The compiled
* spell checkers allow many runtime parameters to be tuned; see the
* class documentation for full details.
*
* <P>In training the source language model, all training data is
* whitespace normalized with an initial whitespace, final whitespace,
* and all internal whitespace sequences converted to a single space
* character.
*
* <P>A tokenization factory may be optionally specified for training
* token-sensitive spell checkers.  With tokenization, input is
* further normalized to insert a single whitespace between all
* tokens not already separated by a space in the input.  The tokens
* are then output during compilation and read back into the compiled
* spell checker.  The set of tokens output may be pruned to remove
* any below a given count threshold.  The resulting set of tokens
* is used to constrain the set of alternative spellings suggested
* during spelling correction to include only tokens in the observed
* token set. 
*
* <P>In constructing a spell checker trainer, a compilable weighted
* edit distance must be specified.  This edit distance model will be
* compiled along with the language model and token set and used as
* the channel model in the compiled spell checker.
*
* <P>As an alternative to using the spell checker, a language model
* may be trained directly and supplied in compiled form along with
* a weighted edit distance to the public constructors for compiled
* spell checkers.
*
* @author Bob Carpenter
* @version 2.0
* @since   LingPipe2.0
*/
public class TrainSpellChecker implements Compilable, Serializable {
    private static final long serialVersionUID = 4907338741905144267L;
    private final WeightedEditDistance mEditDistance;
    private final NGramProcessLM mLM;
    private final TokenizerFactory mTokenizerFactory;
    private final ObjectToCounterMap mTokenCounter = new ObjectToCounterMap();

    /**
     * Construct a non-tokenizing spell checker trainer from the
     * specified language model and edit distance.
     *
     * @param lm Compilable language model.
     * @param editDistance Compilable weighted edit distance.
     * @throws IllegalArgumentException If the edit distance is not
     * compilable.
     */
    public TrainSpellChecker(NGramProcessLM lm,
           WeightedEditDistance editDistance) {
  this(lm,editDistance,null);
    }

    /**
     * Construct a spell checker trainer from the specified n-gram
     * process language model, tokenizer factory and edit distance.
     * The language model must be an instance of the character-level
     * n-gram process language model class.  The edit distance must be
     * compilable.  The tokenizer factory may be <code>null</code>, in
     * which case tokens are not saved as part of training and the
     * compiled spell checker is not token sensitive.  If the
     * tokenizer factory is specified, it must be compilable.
     *
     * @param lm Compilable language model.
     * @param editDistance Compilable weighted edit distance.
     * @param tokenizerFactory Optional tokenizer factory.
     * @throws IllegalArgumentException If the edit distance is not
     * compilable or if the tokenizer factory is non-null and not compilable.
     */
    public TrainSpellChecker(NGramProcessLM lm,
           WeightedEditDistance editDistance,
           TokenizerFactory tokenizerFactory) {
  assertCompilable("Edit distance",editDistance);
  if (tokenizerFactory != null)
      assertCompilable("Tokenizer factory",tokenizerFactory);
  mLM = lm;
  mTokenizerFactory = tokenizerFactory;
  mEditDistance = editDistance;
    }


    /**
     * Returns the counter for the tokens in the training set.  This
     * may be used to print out the tokens with their counts for later
     * perusal.  The value returned is the actual counter, so any
     * changes made to it will be reflected in this spell checker.
     * Pruning the token counts may have eliminated tokens in the
     * training data from the counter.
     *
     * @return The counter for the tokens in the training set.
     */
    public ObjectToCounter tokenCounter() {
  return mTokenCounter;
    }

    /**
     * Train the spelling checker on the specified character sequence.
     * The sequence is normalized by normalizing all whitespace
     * sequences to a single space character and inserting an initial
     * and final whitespace.  If a tokenization factory is specified,
     * a single space character is insterted between any tokens
     * not already separated by a white space.
     *
     * @param cSeq Character sequence for training.
     */
    public void train(CharSequence cSeq) {
  mLM.train(normalizeQuery(cSeq));
    }

    /**
     * Prunes the set of collected tokens of all tokens with count
     * less than the specified minimum.  If there was no tokenization
     * factory specified for this spell checker, this method will
     * have no effect.
     *
     * @param minCount Minimum count of preserved token.
     */
    public void pruneTokens(int minCount) {
  mTokenCounter.prune(minCount);
    }

    /**
     * Prunes the underlying character language model to remove
     * substring counts of less than the specified minimum.
     *
     * @param minCount Minimum count of preserved substrings.
     */
    public void pruneLM(int minCount) {
  mLM.substringCounter().prune(minCount);
    }

    /**
     * Writes a compiled spell checker to the specified object output.
     * The class of the spell checker read back in is {@link
     * CompiledSpellChecker}.
     *
     * @param objOut Object output to which this spell checker is
     * written.
     * @throws IOException If there is an I/O error while writing.
     */
    public void compileTo(ObjectOutput objOut) throws IOException {
  objOut.writeObject(new Externalizer(this));
    }

    /**
     * Writes the NGramProcess language model to the output stream
     * The class is read back using the static <code> readFrom </code>
     * method in the NGramProcessLM class.
     * @param objOut
     * @throws IOException
     */
    public void dumpTo(ObjectOutputStream objOut) throws IOException {
  mLM.writeTo(objOut);
    }

    StringBuffer normalizeQuery(CharSequence cSeq) {
  StringBuffer sb = new StringBuffer();
  sb.append(' ');
  if (mTokenizerFactory != null) {
      char[] cs = Strings.toCharArray(cSeq);
      Tokenizer tokenizer = mTokenizerFactory.tokenizer(cs,0,cs.length);
      String nextToken;
      while ((nextToken = tokenizer.nextToken()) != null) {
    mTokenCounter.increment(nextToken);
    sb.append(nextToken);
    sb.append(' ');
      }
  } else {
      Strings.normalizeWhitespace(cSeq,sb);
      sb.append(' ');
  }
  return sb;
    }

    static void assertCompilable(String description, Object x) {
  if (!(x instanceof Compilable)) {
      String msg = description
    + " must implement com.aliasi.util.Compilable."
    + " Found class=" + x.getClass();
      throw new IllegalArgumentException(msg);
  }
    }

    static class Externalizer extends AbstractExternalizable {
  private static final long serialVersionUID = 4907338741905144267L;
  private final TrainSpellChecker mTrainer;
  public Externalizer() {
      this(null);
  }
  public Externalizer(TrainSpellChecker trainer) {
      mTrainer = trainer;
  }
  public void writeExternal(ObjectOutput objOut) throws IOException {
      mTrainer.mLM.compileTo(objOut);
      boolean tokenizing = mTrainer.mTokenizerFactory != null;
      objOut.writeBoolean(tokenizing);
      if (tokenizing) {
    Set keySet = mTrainer.mTokenCounter.keySet();
    objOut.writeObject(new HashSet(keySet));
      }
      ((Compilable) mTrainer.mEditDistance).compileTo(objOut);
  }
  public Object read(ObjectInput objIn)
      throws ClassNotFoundException, IOException {

      CompiledNGramProcessLM lm
    = (CompiledNGramProcessLM) objIn.readObject();
      boolean tokenizing = objIn.readBoolean();
      Set tokenSet = tokenizing
    ? (Set) objIn.readObject()
    : null;
      WeightedEditDistance editDistance
    = (WeightedEditDistance) objIn.readObject();
      return new CompiledSpellChecker(lm,editDistance,tokenSet);
  }
    }
   
}
TOP

Related Classes of org.sf.mustru.utils.TrainSpellChecker$Externalizer

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc. and is owned by Oracle Inc. Contact coftware#gmail.com.